2005-06-09 19:59:09 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1990, 1993
|
2007-01-08 17:49:59 +00:00
|
|
|
* The Regents of the University of California.
|
2006-03-15 12:45:35 +00:00
|
|
|
* Copyright (c) 2004 The FreeBSD Foundation
|
2008-07-03 06:47:45 +00:00
|
|
|
* Copyright (c) 2004-2008 Robert N. M. Watson
|
2007-01-08 17:49:59 +00:00
|
|
|
* All rights reserved.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
|
|
|
|
*/
|
|
|
|
|
2006-04-01 10:43:02 +00:00
|
|
|
/*
|
|
|
|
* Comments on the socket life cycle:
|
|
|
|
*
|
|
|
|
* soalloc() sets up socket layer state for a socket, called only by
|
|
|
|
* socreate() and sonewconn(). Socket layer private.
|
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* sodealloc() tears down socket layer state for a socket, called only by
|
2006-04-01 10:43:02 +00:00
|
|
|
* sofree() and sonewconn(). Socket layer private.
|
|
|
|
*
|
|
|
|
* pru_attach() associates protocol layer state with an allocated socket;
|
|
|
|
* called only once, may fail, aborting socket allocation. This is called
|
|
|
|
* from socreate() and sonewconn(). Socket layer private.
|
|
|
|
*
|
|
|
|
* pru_detach() disassociates protocol layer state from an attached socket,
|
|
|
|
* and will be called exactly once for sockets in which pru_attach() has
|
|
|
|
* been successfully called. If pru_attach() returned an error,
|
|
|
|
* pru_detach() will not be called. Socket layer private.
|
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* pru_abort() and pru_close() notify the protocol layer that the last
|
|
|
|
* consumer of a socket is starting to tear down the socket, and that the
|
|
|
|
* protocol should terminate the connection. Historically, pru_abort() also
|
|
|
|
* detached protocol state from the socket state, but this is no longer the
|
|
|
|
* case.
|
|
|
|
*
|
2006-04-01 10:43:02 +00:00
|
|
|
* socreate() creates a socket and attaches protocol state. This is a public
|
|
|
|
* interface that may be used by socket layer consumers to create new
|
|
|
|
* sockets.
|
|
|
|
*
|
|
|
|
* sonewconn() creates a socket and attaches protocol state. This is a
|
|
|
|
* public interface that may be used by protocols to create new sockets when
|
|
|
|
* a new connection is received and will be available for accept() on a
|
|
|
|
* listen socket.
|
|
|
|
*
|
|
|
|
* soclose() destroys a socket after possibly waiting for it to disconnect.
|
|
|
|
* This is a public interface that socket consumers should use to close and
|
|
|
|
* release a socket when done with it.
|
|
|
|
*
|
|
|
|
* soabort() destroys a socket without waiting for it to disconnect (used
|
|
|
|
* only for incoming connections that are already partially or fully
|
|
|
|
* connected). This is used internally by the socket layer when clearing
|
|
|
|
* listen socket queues (due to overflow or close on the listen socket), but
|
|
|
|
* is also a public interface protocols may use to abort connections in
|
|
|
|
* their incomplete listen queues should they no longer be required. Sockets
|
2006-07-23 20:36:04 +00:00
|
|
|
* placed in completed connection listen queues should not be aborted for
|
|
|
|
* reasons described in the comment above the soclose() implementation. This
|
|
|
|
* is not a general purpose close routine, and except in the specific
|
|
|
|
* circumstances described here, should not be used.
|
2006-04-01 10:43:02 +00:00
|
|
|
*
|
|
|
|
* sofree() will free a socket and its protocol state if all references on
|
|
|
|
* the socket have been released, and is the public interface to attempt to
|
|
|
|
* free a socket when a reference is removed. This is a socket layer private
|
|
|
|
* interface.
|
|
|
|
*
|
|
|
|
* NOTE: In addition to socreate() and soclose(), which provide a single
|
|
|
|
* socket reference to the consumer to be managed as required, there are two
|
|
|
|
* calls to explicitly manage socket references, soref(), and sorele().
|
|
|
|
* Currently, these are generally required only when transitioning a socket
|
|
|
|
* from a listen queue to a file descriptor, in order to prevent garbage
|
|
|
|
* collection of the socket at an untimely moment. For a number of reasons,
|
|
|
|
* these interfaces are not preferred, and should be avoided.
|
2012-12-07 22:13:33 +00:00
|
|
|
*
|
2011-02-16 21:29:13 +00:00
|
|
|
* NOTE: With regard to VNETs the general rule is that callers do not set
|
|
|
|
* curvnet. Exceptions to this rule include soabort(), sodisconnect(),
|
|
|
|
* sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
|
|
|
|
* and sorflush(), which are usually called from a pre-set VNET context.
|
|
|
|
* sopoll() currently does not need a VNET context to be set.
|
2006-04-01 10:43:02 +00:00
|
|
|
*/
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2000-11-20 01:35:25 +00:00
|
|
|
#include "opt_inet.h"
|
MFp4:
Bring in updated jail support from bz_jail branch.
This enhances the current jail implementation to permit multiple
addresses per jail. In addtion to IPv4, IPv6 is supported as well.
Due to updated checks it is even possible to have jails without
an IP address at all, which basically gives one a chroot with
restricted process view, no networking,..
SCTP support was updated and supports IPv6 in jails as well.
Cpuset support permits jails to be bound to specific processor
sets after creation.
Jails can have an unrestricted (no duplicate protection, etc.) name
in addition to the hostname. The jail name cannot be changed from
within a jail and is considered to be used for management purposes
or as audit-token in the future.
DDB 'show jails' command was added to aid debugging.
Proper compat support permits 32bit jail binaries to be used on 64bit
systems to manage jails. Also backward compatibility was preserved where
possible: for jail v1 syscalls, as well as with user space management
utilities.
Both jail as well as prison version were updated for the new features.
A gap was intentionally left as the intermediate versions had been
used by various patches floating around the last years.
Bump __FreeBSD_version for the afore mentioned and in kernel changes.
Special thanks to:
- Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches
and Olivier Houchard (cognet) for initial single-IPv6 patches.
- Jeff Roberson (jeff) and Randall Stewart (rrs) for their
help, ideas and review on cpuset and SCTP support.
- Robert Watson (rwatson) for lots and lots of help, discussions,
suggestions and review of most of the patch at various stages.
- John Baldwin (jhb) for his help.
- Simon L. Nielsen (simon) as early adopter testing changes
on cluster machines as well as all the testers and people
who provided feedback the last months on freebsd-jail and
other channels.
- My employer, CK Software GmbH, for the support so I could work on this.
Reviewed by: (see above)
MFC after: 3 months (this is just so that I get the mail)
X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
|
|
|
#include "opt_inet6.h"
|
2005-10-27 04:26:35 +00:00
|
|
|
#include "opt_compat.h"
|
2000-11-20 01:35:25 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
1997-03-23 03:37:54 +00:00
|
|
|
#include <sys/fcntl.h>
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/limits.h>
|
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mac.h>
|
2002-08-01 17:47:56 +00:00
|
|
|
#include <sys/malloc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/mbuf.h>
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/mutex.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/domain.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/file.h> /* for struct knote */
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/kernel.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/event.h>
|
2006-06-10 14:34:07 +00:00
|
|
|
#include <sys/eventhandler.h>
|
1997-09-14 02:34:14 +00:00
|
|
|
#include <sys/poll.h>
|
1999-06-17 23:54:50 +00:00
|
|
|
#include <sys/proc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/protosw.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
|
|
|
#include <sys/resourcevar.h>
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extent that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility call setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being reponded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect withthe proces that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routes
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from anay to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibilty of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the desination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
#include <net/route.h>
|
1994-10-02 17:35:40 +00:00
|
|
|
#include <sys/signalvar.h>
|
2007-03-26 08:59:03 +00:00
|
|
|
#include <sys/stat.h>
|
2007-05-03 14:42:42 +00:00
|
|
|
#include <sys/sx.h>
|
1995-11-03 18:33:46 +00:00
|
|
|
#include <sys/sysctl.h>
|
1998-03-28 10:33:27 +00:00
|
|
|
#include <sys/uio.h>
|
2000-06-04 04:28:31 +00:00
|
|
|
#include <sys/jail.h>
|
2012-10-29 12:14:57 +00:00
|
|
|
#include <sys/syslog.h>
|
2013-03-11 17:43:55 +00:00
|
|
|
#include <netinet/in.h>
|
2009-08-01 19:26:27 +00:00
|
|
|
|
|
|
|
#include <net/vnet.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <vm/uma.h>
|
1995-11-03 18:33:46 +00:00
|
|
|
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2005-10-27 04:26:35 +00:00
|
|
|
#include <sys/mount.h>
|
2008-11-22 12:36:15 +00:00
|
|
|
#include <sys/sysent.h>
|
2005-10-27 04:26:35 +00:00
|
|
|
#include <compat/freebsd32/freebsd32.h>
|
|
|
|
#endif
|
1997-08-21 20:33:42 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
|
|
|
|
int flags);
|
|
|
|
|
|
|
|
static void filt_sordetach(struct knote *kn);
|
|
|
|
static int filt_soread(struct knote *kn, long hint);
|
|
|
|
static void filt_sowdetach(struct knote *kn);
|
2000-04-16 18:53:38 +00:00
|
|
|
static int filt_sowrite(struct knote *kn, long hint);
|
|
|
|
static int filt_solisten(struct knote *kn, long hint);
|
|
|
|
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops solisten_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sordetach,
|
|
|
|
.f_event = filt_solisten,
|
|
|
|
};
|
|
|
|
static struct filterops soread_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sordetach,
|
|
|
|
.f_event = filt_soread,
|
|
|
|
};
|
|
|
|
static struct filterops sowrite_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sowdetach,
|
|
|
|
.f_event = filt_sowrite,
|
|
|
|
};
|
2000-04-16 18:53:38 +00:00
|
|
|
|
1998-05-15 20:11:40 +00:00
|
|
|
so_gen_t so_gencnt; /* generation count for sockets */
|
|
|
|
|
1997-10-12 20:26:33 +00:00
|
|
|
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
|
|
|
|
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
|
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
#define VNET_SO_ASSERT(so) \
|
|
|
|
VNET_ASSERT(curvnet != NULL, \
|
|
|
|
("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
|
|
|
|
|
2012-10-20 10:51:32 +00:00
|
|
|
/*
|
|
|
|
* Limit on the number of connections in the listen queue waiting
|
|
|
|
* for accept(2).
|
2012-10-20 12:53:14 +00:00
|
|
|
* NB: The original sysctl somaxconn is still available but hidden
|
2012-10-20 19:38:22 +00:00
|
|
|
* to prevent confusion about the actual purpose of this number.
|
2012-10-20 10:51:32 +00:00
|
|
|
*/
|
1995-11-03 18:33:46 +00:00
|
|
|
static int somaxconn = SOMAXCONN;
|
2012-10-20 10:51:32 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
int val;
|
|
|
|
|
|
|
|
val = somaxconn;
|
|
|
|
error = sysctl_handle_int(oidp, &val, 0, req);
|
|
|
|
if (error || !req->newptr )
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
if (val < 1 || val > USHRT_MAX)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
somaxconn = val;
|
|
|
|
return (0);
|
|
|
|
}
|
2012-10-20 12:53:14 +00:00
|
|
|
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
|
2012-10-20 10:51:32 +00:00
|
|
|
0, sizeof(int), sysctl_somaxconn, "I",
|
|
|
|
"Maximum listen socket pending connection accept queue size");
|
2012-10-20 12:53:14 +00:00
|
|
|
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
|
|
|
|
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
|
|
|
|
0, sizeof(int), sysctl_somaxconn, "I",
|
|
|
|
"Maximum listen socket pending connection accept queue size (compat)");
|
2012-10-20 10:51:32 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
static int numopensockets;
|
|
|
|
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
|
|
|
|
&numopensockets, 0, "Number of open sockets");
|
2012-10-20 10:51:32 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* accept_mtx locks down per-socket fields relating to accept queues. See
|
|
|
|
* socketvar.h for an annotation of the protected fields of struct socket.
|
|
|
|
*/
|
2013-05-06 16:42:18 +00:00
|
|
|
struct mtx accept_mtx;
|
2005-06-09 19:59:09 +00:00
|
|
|
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* so_global_mtx protects so_gencnt, numopensockets, and the per-socket
|
|
|
|
* so_gencnt field.
|
|
|
|
*/
|
2013-05-06 16:42:18 +00:00
|
|
|
static struct mtx so_global_mtx;
|
2005-06-09 19:59:09 +00:00
|
|
|
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
|
1994-05-25 09:21:21 +00:00
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* General IPC sysctl name space, used by sockets and a variety of other IPC
|
|
|
|
* types.
|
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
|
|
|
|
|
2012-10-19 10:15:32 +00:00
|
|
|
/*
|
|
|
|
* Initialize the socket subsystem and set up the socket
|
|
|
|
* memory allocator.
|
|
|
|
*/
|
2012-12-07 22:15:51 +00:00
|
|
|
static uma_zone_t socket_zone;
|
2012-10-19 12:16:29 +00:00
|
|
|
int maxsockets;
|
|
|
|
|
2012-10-19 10:15:32 +00:00
|
|
|
static void
|
|
|
|
socket_zone_change(void *tag)
|
|
|
|
{
|
|
|
|
|
2012-12-07 22:15:51 +00:00
|
|
|
maxsockets = uma_zone_set_max(socket_zone, maxsockets);
|
2012-10-19 10:15:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
socket_init(void *tag)
|
|
|
|
{
|
|
|
|
|
2012-12-07 22:13:33 +00:00
|
|
|
socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
|
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2012-12-07 22:15:51 +00:00
|
|
|
maxsockets = uma_zone_set_max(socket_zone, maxsockets);
|
2012-12-07 22:30:30 +00:00
|
|
|
uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
|
2012-12-07 22:13:33 +00:00
|
|
|
EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
2012-10-19 10:15:32 +00:00
|
|
|
}
|
|
|
|
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
|
|
|
|
|
2012-10-19 12:16:29 +00:00
|
|
|
/*
|
|
|
|
* Initialise maxsockets. This SYSINIT must be run after
|
|
|
|
* tunable_mbinit().
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
init_maxsockets(void *ignored)
|
|
|
|
{
|
|
|
|
|
|
|
|
TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
|
Base the mbuf related limits on the available physical memory or
kernel memory, whichever is lower. The overall mbuf related memory
limit must be set so that mbufs (and clusters of various sizes)
can't exhaust physical RAM or KVM.
The limit is set to half of the physical RAM or KVM (whichever is
lower) as the baseline. In any normal scenario we want to leave
at least half of the physmem/kvm for other kernel functions and
userspace to prevent it from swapping too easily. Via a tunable
kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.
At the same time divorce maxfiles from maxusers and set maxfiles to
physpages / 8 with a floor based on maxusers. This way busy servers
can make use of the significantly increased mbuf limits with a much
larger number of open sockets.
Tidy up ordering in init_param2() and check up on some users of
those values calculated here.
Out of the overall mbuf memory limit 2K clusters and 4K (page size)
clusters to get 1/4 each because these are the most heavily used mbuf
sizes. 2K clusters are used for MTU 1500 ethernet inbound packets.
4K clusters are used whenever possible for sends on sockets and thus
outbound packets. The larger cluster sizes of 9K and 16K are limited
to 1/6 of the overall mbuf memory limit. When jumbo MTU's are used
these large clusters will end up only on the inbound path. They are
not used on outbound, there it's still 4K. Yes, that will stay that
way because otherwise we run into lots of complications in the
stack. And it really isn't a problem, so don't make a scene.
Normal mbufs (256B) weren't limited at all previously. This was
problematic as there are certain places in the kernel that on
allocation failure of clusters try to piece together their packet
from smaller mbufs.
The mbuf limit is the number of all other mbuf sizes together plus
some more to allow for standalone mbufs (ACK for example) and to
send off a copy of a cluster. Unfortunately there isn't a way to
set an overall limit for all mbuf memory together as UMA doesn't
support such a limiting.
NB: Every cluster also has an mbuf associated with it.
Two examples on the revised mbuf sizing limits:
1GB KVM:
512MB limit for mbufs
419,430 mbufs
65,536 2K mbuf clusters
32,768 4K mbuf clusters
9,709 9K mbuf clusters
5,461 16K mbuf clusters
16GB RAM:
8GB limit for mbufs
33,554,432 mbufs
1,048,576 2K mbuf clusters
524,288 4K mbuf clusters
155,344 9K mbuf clusters
87,381 16K mbuf clusters
These defaults should be sufficient for even the most demanding
network loads.
MFC after: 1 month
2012-11-27 21:19:58 +00:00
|
|
|
maxsockets = imax(maxsockets, maxfiles);
|
2012-10-19 12:16:29 +00:00
|
|
|
}
|
|
|
|
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Sysctl to get and set the maximum global sockets limit. Notify protocols
|
|
|
|
* of the change so that they can update their dependent limits as required.
|
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static int
|
|
|
|
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error, newmaxsockets;
|
|
|
|
|
|
|
|
newmaxsockets = maxsockets;
|
2007-06-04 18:25:08 +00:00
|
|
|
error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
|
2006-06-10 14:34:07 +00:00
|
|
|
if (error == 0 && req->newptr) {
|
Base the mbuf related limits on the available physical memory or
kernel memory, whichever is lower. The overall mbuf related memory
limit must be set so that mbufs (and clusters of various sizes)
can't exhaust physical RAM or KVM.
The limit is set to half of the physical RAM or KVM (whichever is
lower) as the baseline. In any normal scenario we want to leave
at least half of the physmem/kvm for other kernel functions and
userspace to prevent it from swapping too easily. Via a tunable
kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.
At the same time divorce maxfiles from maxusers and set maxfiles to
physpages / 8 with a floor based on maxusers. This way busy servers
can make use of the significantly increased mbuf limits with a much
larger number of open sockets.
Tidy up ordering in init_param2() and check up on some users of
those values calculated here.
Out of the overall mbuf memory limit 2K clusters and 4K (page size)
clusters to get 1/4 each because these are the most heavily used mbuf
sizes. 2K clusters are used for MTU 1500 ethernet inbound packets.
4K clusters are used whenever possible for sends on sockets and thus
outbound packets. The larger cluster sizes of 9K and 16K are limited
to 1/6 of the overall mbuf memory limit. When jumbo MTU's are used
these large clusters will end up only on the inbound path. They are
not used on outbound, there it's still 4K. Yes, that will stay that
way because otherwise we run into lots of complications in the
stack. And it really isn't a problem, so don't make a scene.
Normal mbufs (256B) weren't limited at all previously. This was
problematic as there are certain places in the kernel that on
allocation failure of clusters try to piece together their packet
from smaller mbufs.
The mbuf limit is the number of all other mbuf sizes together plus
some more to allow for standalone mbufs (ACK for example) and to
send off a copy of a cluster. Unfortunately there isn't a way to
set an overall limit for all mbuf memory together as UMA doesn't
support such a limiting.
NB: Every cluster also has an mbuf associated with it.
Two examples on the revised mbuf sizing limits:
1GB KVM:
512MB limit for mbufs
419,430 mbufs
65,536 2K mbuf clusters
32,768 4K mbuf clusters
9,709 9K mbuf clusters
5,461 16K mbuf clusters
16GB RAM:
8GB limit for mbufs
33,554,432 mbufs
1,048,576 2K mbuf clusters
524,288 4K mbuf clusters
155,344 9K mbuf clusters
87,381 16K mbuf clusters
These defaults should be sufficient for even the most demanding
network loads.
MFC after: 1 month
2012-11-27 21:19:58 +00:00
|
|
|
if (newmaxsockets > maxsockets &&
|
|
|
|
newmaxsockets <= maxfiles) {
|
2006-06-10 14:34:07 +00:00
|
|
|
maxsockets = newmaxsockets;
|
|
|
|
EVENTHANDLER_INVOKE(maxsockets_change);
|
|
|
|
} else
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
/*
 * kern.ipc.maxsockets: read/write integer backed by the maxsockets
 * variable.  Accesses are routed through sysctl_maxsockets() instead of
 * touching the variable directly.
 */
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Socket operation routines. These routines are called by the routines in
|
|
|
|
* sys_socket.c or from a system process, and implement the semantics of
|
|
|
|
* socket operations by switching out to the protocol specific routines.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1998-05-15 20:11:40 +00:00
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Get a socket structure from our zone, and initialize it. Note that it
|
|
|
|
* would probably be better to allocate socket and PCB at the same time, but
|
|
|
|
* I'm not convinced that all the protocols can be easily modified to do
|
|
|
|
* this.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
|
|
|
* soalloc() returns a socket with a ref count of 0.
|
1998-05-15 20:11:40 +00:00
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static struct socket *
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
soalloc(struct vnet *vnet)
|
1998-05-15 20:11:40 +00:00
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
|
2007-02-26 10:45:21 +00:00
|
|
|
so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
|
2006-06-08 22:33:18 +00:00
|
|
|
if (so == NULL)
|
|
|
|
return (NULL);
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
if (mac_socket_init(so, M_NOWAIT) != 0) {
|
2006-06-08 22:33:18 +00:00
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
return (NULL);
|
1998-05-15 20:11:40 +00:00
|
|
|
}
|
2006-06-08 22:33:18 +00:00
|
|
|
#endif
|
|
|
|
SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
|
|
|
|
SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
|
2007-05-03 14:42:42 +00:00
|
|
|
sx_init(&so->so_snd.sb_sx, "so_snd_sx");
|
|
|
|
sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
|
2006-06-08 22:33:18 +00:00
|
|
|
TAILQ_INIT(&so->so_aiojobq);
|
|
|
|
mtx_lock(&so_global_mtx);
|
|
|
|
so->so_gencnt = ++so_gencnt;
|
|
|
|
++numopensockets;
|
Permit buiding kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
#ifdef VIMAGE
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
|
|
|
|
__func__, __LINE__, so));
|
2009-07-19 17:40:45 +00:00
|
|
|
vnet->vnet_sockcnt++;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
so->so_vnet = vnet;
|
Permit buiding kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
#endif
|
2006-06-08 22:33:18 +00:00
|
|
|
mtx_unlock(&so_global_mtx);
|
2005-06-09 19:59:09 +00:00
|
|
|
return (so);
|
1998-05-15 20:11:40 +00:00
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Free the storage associated with a socket at the socket layer, tear down
|
|
|
|
* locks, labels, etc. All protocol state is assumed already to have been
|
|
|
|
* torn down (and possibly never set up) by the caller.
|
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static void
|
|
|
|
sodealloc(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
|
|
|
|
KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
|
|
|
|
|
|
|
|
mtx_lock(&so_global_mtx);
|
|
|
|
so->so_gencnt = ++so_gencnt;
|
2006-08-02 00:45:27 +00:00
|
|
|
--numopensockets; /* Could be below, but faster here. */
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
#ifdef VIMAGE
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
|
|
|
|
__func__, __LINE__, so));
|
2009-07-19 17:40:45 +00:00
|
|
|
so->so_vnet->vnet_sockcnt--;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
#endif
|
2006-06-10 14:34:07 +00:00
|
|
|
mtx_unlock(&so_global_mtx);
|
|
|
|
if (so->so_rcv.sb_hiwat)
|
|
|
|
(void)chgsbsize(so->so_cred->cr_uidinfo,
|
|
|
|
&so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
|
|
|
|
if (so->so_snd.sb_hiwat)
|
|
|
|
(void)chgsbsize(so->so_cred->cr_uidinfo,
|
|
|
|
&so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
|
|
|
|
#ifdef INET
|
|
|
|
/* remove acccept filter if one is present. */
|
|
|
|
if (so->so_accf != NULL)
|
|
|
|
do_setopt_accept_filter(so, NULL);
|
|
|
|
#endif
|
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_destroy(so);
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
|
|
|
crfree(so->so_cred);
|
2007-05-03 14:42:42 +00:00
|
|
|
sx_destroy(&so->so_snd.sb_sx);
|
|
|
|
sx_destroy(&so->so_rcv.sb_sx);
|
2006-06-10 14:34:07 +00:00
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_snd);
|
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_rcv);
|
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* socreate returns a socket with a ref count of 1. The socket should be
|
|
|
|
* closed with soclose().
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
socreate(int dom, struct socket **aso, int type, int proto,
|
|
|
|
struct ucred *cred, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct protosw *prp;
|
|
|
|
struct socket *so;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
if (proto)
|
|
|
|
prp = pffindproto(dom, proto, type);
|
|
|
|
else
|
|
|
|
prp = pffindtype(dom, type);
|
2000-06-04 04:28:31 +00:00
|
|
|
|
2012-12-07 02:22:48 +00:00
|
|
|
if (prp == NULL) {
|
|
|
|
/* No support for domain. */
|
|
|
|
if (pffinddomain(dom) == NULL)
|
|
|
|
return (EAFNOSUPPORT);
|
|
|
|
/* No support for socket type. */
|
|
|
|
if (proto == 0 && type != 0)
|
|
|
|
return (EPROTOTYPE);
|
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
}
|
|
|
|
if (prp->pr_usrreqs->pru_attach == NULL ||
|
2005-06-09 19:59:09 +00:00
|
|
|
prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
|
2000-06-13 15:44:04 +00:00
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
|
2009-02-05 14:15:18 +00:00
|
|
|
if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
|
2000-06-04 04:28:31 +00:00
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
if (prp->pr_type != type)
|
|
|
|
return (EPROTOTYPE);
|
2009-06-15 19:01:53 +00:00
|
|
|
so = soalloc(CRED_TO_VNET(cred));
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so == NULL)
|
1998-05-15 20:11:40 +00:00
|
|
|
return (ENOBUFS);
|
|
|
|
|
1996-03-11 15:37:44 +00:00
|
|
|
TAILQ_INIT(&so->so_incomp);
|
|
|
|
TAILQ_INIT(&so->so_comp);
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_type = type;
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_cred = crhold(cred);
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extent that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility call setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routes
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
if ((prp->pr_domain->dom_family == PF_INET) ||
|
2012-02-03 11:00:53 +00:00
|
|
|
(prp->pr_domain->dom_family == PF_INET6) ||
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extent that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
(prp->pr_domain->dom_family == PF_ROUTE))
|
|
|
|
so->so_fibnum = td->td_proc->p_fibnum;
|
|
|
|
else
|
|
|
|
so->so_fibnum = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_proto = prp;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_create(cred, so);
|
2005-06-09 19:59:09 +00:00
|
|
|
#endif
|
2009-06-10 20:59:32 +00:00
|
|
|
knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
|
|
|
|
knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_count = 1;
|
2007-02-01 17:53:41 +00:00
|
|
|
/*
|
|
|
|
* Auto-sizing of socket buffers is managed by the protocols and
|
|
|
|
* the appropriate flags must be set in the pru_attach function.
|
|
|
|
*/
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error) {
|
2006-08-11 23:03:10 +00:00
|
|
|
KASSERT(so->so_count == 1, ("socreate: so_count %d",
|
|
|
|
so->so_count));
|
|
|
|
so->so_count = 0;
|
2006-07-11 21:56:58 +00:00
|
|
|
sodealloc(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
*aso = so;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
#ifdef REGRESSION
|
|
|
|
static int regression_sonewconn_earlytest = 1;
|
|
|
|
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
|
|
|
|
&regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* When an attempt at a new connection is noted on a socket which accepts
|
|
|
|
* connections, sonewconn is called. If the connection is possible (subject
|
|
|
|
* to space constraints, etc.) then we allocate a new structure, properly
|
|
|
|
* linked into the data structure of the original socket, and return this.
|
2013-11-08 20:11:15 +00:00
|
|
|
* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
|
2006-06-10 14:34:07 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Note: the ref count on the socket is 0 on return.
|
2006-06-10 14:34:07 +00:00
|
|
|
*/
|
|
|
|
struct socket *
|
2007-05-16 20:41:08 +00:00
|
|
|
sonewconn(struct socket *head, int connstatus)
|
2006-06-10 14:34:07 +00:00
|
|
|
{
|
2013-10-31 20:33:21 +00:00
|
|
|
static struct timeval lastover;
|
|
|
|
static struct timeval overinterval = { 60, 0 };
|
|
|
|
static int overcount;
|
|
|
|
|
2007-05-16 20:41:08 +00:00
|
|
|
struct socket *so;
|
2006-06-10 14:34:07 +00:00
|
|
|
int over;
|
|
|
|
|
|
|
|
ACCEPT_LOCK();
|
|
|
|
over = (head->so_qlen > 3 * head->so_qlimit / 2);
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
#ifdef REGRESSION
|
2012-10-29 12:14:57 +00:00
|
|
|
if (regression_sonewconn_earlytest && over) {
|
2006-06-10 14:34:07 +00:00
|
|
|
#else
|
2012-10-29 12:14:57 +00:00
|
|
|
if (over) {
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
2013-10-31 20:33:21 +00:00
|
|
|
overcount++;
|
|
|
|
|
|
|
|
if (ratecheck(&lastover, &overinterval)) {
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
|
|
|
|
"%i already in queue awaiting acceptance "
|
|
|
|
"(%d occurrences)\n",
|
|
|
|
__func__, head->so_pcb, head->so_qlen, overcount);
|
|
|
|
|
|
|
|
overcount = 0;
|
|
|
|
}
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
2012-10-29 12:14:57 +00:00
|
|
|
}
|
2011-02-11 13:27:00 +00:00
|
|
|
VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
|
|
|
|
__func__, __LINE__, head));
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
so = soalloc(head->so_vnet);
|
2012-10-29 12:14:57 +00:00
|
|
|
if (so == NULL) {
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
|
|
|
|
"limit reached or out of memory\n",
|
|
|
|
__func__, head->so_pcb);
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
2012-10-29 12:14:57 +00:00
|
|
|
}
|
2006-06-10 14:34:07 +00:00
|
|
|
if ((head->so_options & SO_ACCEPTFILTER) != 0)
|
|
|
|
connstatus = 0;
|
|
|
|
so->so_head = head;
|
|
|
|
so->so_type = head->so_type;
|
|
|
|
so->so_options = head->so_options &~ SO_ACCEPTCONN;
|
|
|
|
so->so_linger = head->so_linger;
|
|
|
|
so->so_state = head->so_state | SS_NOFDREF;
|
2009-07-28 19:43:27 +00:00
|
|
|
so->so_fibnum = head->so_fibnum;
|
2006-06-10 14:34:07 +00:00
|
|
|
so->so_proto = head->so_proto;
|
|
|
|
so->so_cred = crhold(head->so_cred);
|
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_newconn(head, so);
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
2009-06-10 20:59:32 +00:00
|
|
|
knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
|
|
|
|
knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(head);
|
2012-10-29 12:14:57 +00:00
|
|
|
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
|
|
|
|
sodealloc(so);
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
|
2006-06-10 14:34:07 +00:00
|
|
|
sodealloc(so);
|
2012-10-29 12:14:57 +00:00
|
|
|
log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
2006-09-10 17:08:06 +00:00
|
|
|
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
|
|
|
|
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
|
|
|
|
so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
|
|
|
|
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
|
2007-02-01 17:53:41 +00:00
|
|
|
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
|
|
|
|
so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
|
2006-06-10 14:34:07 +00:00
|
|
|
so->so_state |= connstatus;
|
|
|
|
ACCEPT_LOCK();
|
2012-11-27 20:04:52 +00:00
|
|
|
/*
|
|
|
|
* The accept socket may be tearing down but we just
|
|
|
|
* won a race on the ACCEPT_LOCK.
|
2013-03-11 17:43:55 +00:00
|
|
|
* However, if sctp_peeloff() is called on a 1-to-many
|
|
|
|
* style socket, the SO_ACCEPTCONN doesn't need to be set.
|
2012-11-27 20:04:52 +00:00
|
|
|
*/
|
2013-03-11 17:43:55 +00:00
|
|
|
if (!(head->so_options & SO_ACCEPTCONN) &&
|
|
|
|
((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
|
|
|
|
(head->so_type != SOCK_SEQPACKET))) {
|
2012-11-27 20:04:52 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_head = NULL;
|
|
|
|
sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */
|
|
|
|
return (NULL);
|
|
|
|
}
|
2006-06-10 14:34:07 +00:00
|
|
|
if (connstatus) {
|
|
|
|
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
|
|
|
|
so->so_qstate |= SQ_COMP;
|
|
|
|
head->so_qlen++;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Keep removing sockets from the head until there's room for
|
|
|
|
* us to insert on the tail. In pre-locking revisions, this
|
|
|
|
* was a simple if(), but as we could be racing with other
|
|
|
|
* threads and soabort() requires dropping locks, we must
|
|
|
|
* loop waiting for the condition to be true.
|
|
|
|
*/
|
|
|
|
while (head->so_incqlen > head->so_qlimit) {
|
|
|
|
struct socket *sp;
|
|
|
|
sp = TAILQ_FIRST(&head->so_incomp);
|
|
|
|
TAILQ_REMOVE(&head->so_incomp, sp, so_list);
|
|
|
|
head->so_incqlen--;
|
|
|
|
sp->so_qstate &= ~SQ_INCOMP;
|
|
|
|
sp->so_head = NULL;
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
soabort(sp);
|
|
|
|
ACCEPT_LOCK();
|
|
|
|
}
|
|
|
|
TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
|
|
|
|
so->so_qstate |= SQ_INCOMP;
|
|
|
|
head->so_incqlen++;
|
|
|
|
}
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
if (connstatus) {
|
|
|
|
sorwakeup(head);
|
|
|
|
wakeup_one(&head->so_timeo);
|
|
|
|
}
|
|
|
|
return (so);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
|
|
|
|
CURVNET_RESTORE();
|
2013-03-02 21:11:30 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* solisten() transitions a socket from a non-listening state to a listening
|
|
|
|
* state, but can also be used to update the listen queue depth on an
|
|
|
|
* existing listen socket. The protocol will call back into the sockets
|
|
|
|
* layer using solisten_proto_check() and solisten_proto() to check and set
|
|
|
|
* socket-layer listen state. Call backs are used so that the protocol can
|
2005-09-18 10:46:34 +00:00
|
|
|
* acquire both protocol and socket layer locks in whatever order is required
|
2005-06-09 19:59:09 +00:00
|
|
|
* by the protocol.
|
|
|
|
*
|
|
|
|
* Protocol implementors are advised to hold the socket lock across the
|
|
|
|
* socket-layer test and set to avoid races at the socket layer.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten(struct socket *so, int backlog, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
|
|
|
|
CURVNET_RESTORE();
|
2013-03-02 21:11:30 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten_proto_check(struct socket *so)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
|
|
|
if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
|
|
|
|
SS_ISDISCONNECTING))
|
|
|
|
return (EINVAL);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten_proto(struct socket *so, int backlog)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
2005-10-30 19:44:40 +00:00
|
|
|
if (backlog < 0 || backlog > somaxconn)
|
|
|
|
backlog = somaxconn;
|
|
|
|
so->so_qlimit = backlog;
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_options |= SO_ACCEPTCONN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-09-18 11:18:42 +00:00
|
|
|
* Evaluate the reference count and named references on a socket; if no
|
|
|
|
* references remain, free it. This should be called whenever a reference is
|
|
|
|
* released, such as in sorele(), but also when named reference flags are
|
|
|
|
* cleared in socket or protocol code.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
2010-09-18 11:18:42 +00:00
|
|
|
* sofree() will free the socket if:
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
*
|
|
|
|
* - There are no outstanding file descriptor references or related consumers
|
|
|
|
* (so_count == 0).
|
|
|
|
*
|
|
|
|
* - The socket has been closed by user space, if ever open (SS_NOFDREF).
|
|
|
|
*
|
|
|
|
* - The protocol does not have an outstanding strong reference on the socket
|
|
|
|
* (SS_PROTOREF).
|
|
|
|
*
|
2006-04-23 15:37:23 +00:00
|
|
|
* - The socket is not in a completed connection queue, so a process has been
|
2006-04-23 15:33:38 +00:00
|
|
|
* notified that it is present. If it is removed, the user process may
|
|
|
|
* block in accept() despite select() saying the socket was ready.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sofree(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2006-08-01 10:30:26 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *head;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
ACCEPT_LOCK_ASSERT();
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
Chance protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they other occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
|
2006-04-23 15:33:38 +00:00
|
|
|
(so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
ACCEPT_UNLOCK();
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
head = so->so_head;
|
Integrate accept locking from rwatson_netperf, introducing a new
global mutex, accept_mtx, which serializes access to the following
fields across all sockets:
so_qlen so_incqlen so_qstate
so_comp so_incomp so_list
so_head
While providing only coarse granularity, this approach avoids lock
order issues between sockets by avoiding ownership of the fields
by a specific socket and its per-socket mutexes.
While here, rewrite soclose(), sofree(), soaccept(), and
sonewconn() to add assertions, close additional races and address
lock order concerns. In particular:
- Reorganize the optimistic concurrency behavior in accept1() to
always allocate a file descriptor with falloc() so that if we do
find a socket, we don't have to encounter the "Oh, there wasn't
a socket" race that can occur if falloc() sleeps in the current
code, which broke inbound accept() ordering, not to mention
requiring backing out socket state changes in a way that raced
with the protocol level. We may want to add a lockless read of
the queue state if polling of empty queues proves to be important
to optimize.
- In accept1(), soref() the socket while holding the accept lock
so that the socket cannot be free'd in a race with the protocol
layer. Likewise in netgraph equivilents of the accept1() code.
- In sonewconn(), loop waiting for the queue to be small enough to
insert our new socket once we've committed to inserting it, or
races can occur that cause the incomplete socket queue to
overfill. In the previously implementation, it was sufficient
to simply tested once since calling soabort() didn't release
synchronization permitting another thread to insert a socket as
we discard a previous one.
- In soclose()/sofree()/et al, it is the responsibility of the
caller to remove a socket from the incomplete connection queue
before calling soabort(), which prevents soabort() from having
to walk into the accept socket to release the socket from its
queue, and avoids races when releasing the accept mutex to enter
soabort(), permitting soabort() to avoid lock ordering issues
with the caller.
- Generally cluster accept queue related operations together
throughout these functions in order to facilitate locking.
Annotate new locking in socketvar.h.
2004-06-02 04:15:39 +00:00
|
|
|
if (head != NULL) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT((so->so_qstate & SQ_COMP) != 0 ||
|
|
|
|
(so->so_qstate & SQ_INCOMP) != 0,
|
|
|
|
("sofree: so_head != NULL, but neither SQ_COMP nor "
|
|
|
|
"SQ_INCOMP"));
|
|
|
|
KASSERT((so->so_qstate & SQ_COMP) == 0 ||
|
|
|
|
(so->so_qstate & SQ_INCOMP) == 0,
|
|
|
|
("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
|
|
|
|
TAILQ_REMOVE(&head->so_incomp, so, so_list);
|
|
|
|
head->so_incqlen--;
|
|
|
|
so->so_qstate &= ~SQ_INCOMP;
|
1996-03-11 15:37:44 +00:00
|
|
|
so->so_head = NULL;
|
2002-05-31 11:52:35 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT((so->so_qstate & SQ_COMP) == 0 &&
|
|
|
|
(so->so_qstate & SQ_INCOMP) == 0,
|
|
|
|
("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
|
|
|
|
so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
|
2006-11-22 23:54:29 +00:00
|
|
|
if (so->so_options & SO_ACCEPTCONN) {
|
2012-12-07 22:13:33 +00:00
|
|
|
KASSERT((TAILQ_EMPTY(&so->so_comp)),
|
|
|
|
("sofree: so_comp populated"));
|
|
|
|
KASSERT((TAILQ_EMPTY(&so->so_incomp)),
|
|
|
|
("sofree: so_incomp populated"));
|
2006-11-22 23:54:29 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
ACCEPT_UNLOCK();
|
Chance protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they other occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2007-03-22 13:21:24 +00:00
|
|
|
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
|
|
|
|
(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
|
|
|
|
if (pr->pr_usrreqs->pru_detach != NULL)
|
|
|
|
(*pr->pr_usrreqs->pru_detach)(so);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-08-01 10:30:26 +00:00
|
|
|
* From this point on, we assume that no other references to this
|
|
|
|
* socket exist anywhere else in the stack. Therefore, no locks need
|
|
|
|
* to be acquired or held.
|
|
|
|
*
|
|
|
|
* We used to do a lot of socket buffer and socket locking here, as
|
|
|
|
* well as invoke sorflush() and perform wakeups. The direct call to
|
|
|
|
* dom_dispose() and sbrelease_internal() are an inlining of what was
|
|
|
|
* necessary from sorflush().
|
|
|
|
*
|
|
|
|
* Notice that the socket buffer and kqueue state are torn down
|
|
|
|
* before calling pru_detach. This means that protocols shold not
|
2007-05-03 14:42:42 +00:00
|
|
|
* assume they can perform socket wakeups, etc, in their detach code.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
2006-08-01 10:30:26 +00:00
|
|
|
sbdestroy(&so->so_snd, so);
|
|
|
|
sbdestroy(&so->so_rcv, so);
|
Fix a deficiency in the selinfo interface:
If a selinfo object is recorded (via selrecord()) and then it is
quickly destroyed, with the waiters missing the opportunity to awake,
at the next iteration they will find the selinfo object destroyed,
causing a PF#.
That happens because the selinfo interface has no way to drain the
waiters before to destroy the registered selinfo object. Also this
race is quite rare to get in practice, because it would require a
selrecord(), a poll request by another thread and a quick destruction
of the selrecord()'ed selinfo object.
Fix this by adding the seldrain() routine which should be called
before to destroy the selinfo objects (in order to avoid such case),
and fix the present cases where it might have already been called.
Sometimes, the context is safe enough to prevent this type of race,
like it happens in device drivers which installs selinfo objects on
poll callbacks. There, the destruction of the selinfo object happens
at driver detach time, when all the filedescriptors should be already
closed, thus there cannot be a race.
For this case, mfi(4) device driver can be set as an example, as it
implements a full correct logic for preventing this from happening.
Sponsored by: Sandvine Incorporated
Reported by: rstone
Tested by: pluknet
Reviewed by: jhb, kib
Approved by: re (bz)
MFC after: 3 weeks
2011-08-25 15:51:54 +00:00
|
|
|
seldrain(&so->so_snd.sb_sel);
|
|
|
|
seldrain(&so->so_rcv.sb_sel);
|
2006-08-02 18:37:44 +00:00
|
|
|
knlist_destroy(&so->so_rcv.sb_sel.si_note);
|
|
|
|
knlist_destroy(&so->so_snd.sb_sel.si_note);
|
1998-05-15 20:11:40 +00:00
|
|
|
sodealloc(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Close a socket on last file table reference removal. Initiate disconnect
|
|
|
|
* if connected. Free socket when disconnect complete.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* This function will sorele() the socket. Note that soclose() may be called
|
|
|
|
* prior to the ref count reaching zero. The actual socket structure will
|
|
|
|
* not be freed until the ref count reaches zero.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soclose(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
|
2005-06-09 19:56:38 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2005-06-09 19:59:09 +00:00
|
|
|
funsetown(&so->so_sigio);
|
2006-11-22 23:54:29 +00:00
|
|
|
if (so->so_state & SS_ISCONNECTED) {
|
|
|
|
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
|
|
|
|
error = sodisconnect(so);
|
2010-05-27 15:27:31 +00:00
|
|
|
if (error) {
|
|
|
|
if (error == ENOTCONN)
|
|
|
|
error = 0;
|
2006-11-22 23:54:29 +00:00
|
|
|
goto drop;
|
2010-05-27 15:27:31 +00:00
|
|
|
}
|
2006-11-22 23:54:29 +00:00
|
|
|
}
|
|
|
|
if (so->so_options & SO_LINGER) {
|
|
|
|
if ((so->so_state & SS_ISDISCONNECTING) &&
|
|
|
|
(so->so_state & SS_NBIO))
|
|
|
|
goto drop;
|
|
|
|
while (so->so_state & SS_ISCONNECTED) {
|
|
|
|
error = tsleep(&so->so_timeo,
|
2012-12-07 22:13:33 +00:00
|
|
|
PSOCK | PCATCH, "soclos",
|
|
|
|
so->so_linger * hz);
|
2006-11-22 23:54:29 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
drop:
|
|
|
|
if (so->so_proto->pr_usrreqs->pru_close != NULL)
|
|
|
|
(*so->so_proto->pr_usrreqs->pru_close)(so);
|
2012-11-27 20:04:52 +00:00
|
|
|
ACCEPT_LOCK();
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so->so_options & SO_ACCEPTCONN) {
|
|
|
|
struct socket *sp;
|
2012-11-27 20:04:52 +00:00
|
|
|
/*
|
|
|
|
* Prevent new additions to the accept queues due
|
|
|
|
* to ACCEPT_LOCK races while we are draining them.
|
|
|
|
*/
|
|
|
|
so->so_options &= ~SO_ACCEPTCONN;
|
2005-06-09 19:59:09 +00:00
|
|
|
while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
|
|
|
|
TAILQ_REMOVE(&so->so_incomp, sp, so_list);
|
|
|
|
so->so_incqlen--;
|
|
|
|
sp->so_qstate &= ~SQ_INCOMP;
|
|
|
|
sp->so_head = NULL;
|
|
|
|
ACCEPT_UNLOCK();
|
2006-03-16 07:03:14 +00:00
|
|
|
soabort(sp);
|
2005-06-09 19:59:09 +00:00
|
|
|
ACCEPT_LOCK();
|
1996-04-16 03:50:08 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
|
1999-01-25 16:58:56 +00:00
|
|
|
TAILQ_REMOVE(&so->so_comp, sp, so_list);
|
1999-02-02 07:23:28 +00:00
|
|
|
so->so_qlen--;
|
2005-06-09 19:59:09 +00:00
|
|
|
sp->so_qstate &= ~SQ_COMP;
|
1999-02-02 07:23:28 +00:00
|
|
|
sp->so_head = NULL;
|
2005-06-09 19:59:09 +00:00
|
|
|
ACCEPT_UNLOCK();
|
2006-03-16 07:03:14 +00:00
|
|
|
soabort(sp);
|
2005-06-09 19:59:09 +00:00
|
|
|
ACCEPT_LOCK();
|
1996-04-16 03:50:08 +00:00
|
|
|
}
|
2012-11-27 20:04:52 +00:00
|
|
|
KASSERT((TAILQ_EMPTY(&so->so_comp)),
|
|
|
|
("%s: so_comp populated", __func__));
|
|
|
|
KASSERT((TAILQ_EMPTY(&so->so_incomp)),
|
|
|
|
("%s: so_incomp populated", __func__));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_state |= SS_NOFDREF;
|
2012-11-27 20:04:52 +00:00
|
|
|
sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-07-16 23:09:39 +00:00
|
|
|
* soabort() is used to abruptly tear down a connection, such as when a
|
|
|
|
* resource limit is reached (listen queue depth exceeded), or if a listen
|
|
|
|
* socket is closed while there are sockets waiting to be accepted.
|
2006-04-01 15:15:05 +00:00
|
|
|
*
|
|
|
|
* This interface is tricky, because it is called on an unreferenced socket,
|
|
|
|
* and must be called only by a thread that has actually removed the socket
|
|
|
|
* from the listen queue it was on, or races with other threads are risked.
|
|
|
|
*
|
|
|
|
* This interface will call into the protocol code, so must not be called
|
|
|
|
* with any socket locks held. Protocols do call it while holding their own
|
|
|
|
* recursible protocol mutexes, but this is something that should be subject
|
|
|
|
* to review in the future.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2006-03-16 07:03:14 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soabort(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2006-04-01 15:15:05 +00:00
|
|
|
/*
|
|
|
|
* In as much as is possible, assert that no references to this
|
|
|
|
* socket are held. This is not quite the same as asserting that the
|
|
|
|
* current thread is responsible for arranging for no references, but
|
|
|
|
* is as close as we can get for now.
|
|
|
|
*/
|
|
|
|
KASSERT(so->so_count == 0, ("soabort: so_count"));
|
2006-04-23 18:15:54 +00:00
|
|
|
KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
|
2006-04-01 15:15:05 +00:00
|
|
|
KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
|
2006-04-23 18:15:54 +00:00
|
|
|
KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
|
|
|
|
KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2006-04-01 15:15:05 +00:00
|
|
|
|
2006-07-21 17:11:15 +00:00
|
|
|
if (so->so_proto->pr_usrreqs->pru_abort != NULL)
|
2006-07-11 23:18:28 +00:00
|
|
|
(*so->so_proto->pr_usrreqs->pru_abort)(so);
|
2006-04-01 15:15:05 +00:00
|
|
|
ACCEPT_LOCK();
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
sofree(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soaccept(struct socket *so, struct sockaddr **nam)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_state &= ~SS_NOFDREF;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
2011-02-16 21:29:13 +00:00
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
2001-03-09 08:16:40 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
|
2013-03-02 21:11:30 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (soconnectat(AT_FDCWD, so, nam, td));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2002-05-31 11:52:35 +00:00
|
|
|
if (so->so_options & SO_ACCEPTCONN)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EOPNOTSUPP);
|
2010-02-20 22:29:28 +00:00
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If protocol is connection-based, can only connect once.
|
2006-07-23 20:36:04 +00:00
|
|
|
* Otherwise, if connected, try to disconnect first. This allows
|
|
|
|
* user to disconnect by connecting to, e.g., a null address.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-05-31 11:52:35 +00:00
|
|
|
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
|
|
|
|
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
|
2005-06-09 19:59:09 +00:00
|
|
|
(error = sodisconnect(so)))) {
|
2002-05-31 11:52:35 +00:00
|
|
|
error = EISCONN;
|
2005-06-09 19:59:09 +00:00
|
|
|
} else {
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Prevent accumulated error from previous connection from
|
|
|
|
* biting us.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
|
|
|
so->so_error = 0;
|
2013-03-02 21:11:30 +00:00
|
|
|
if (fd == AT_FDCWD) {
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
|
|
|
|
nam, td);
|
|
|
|
} else {
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
|
|
|
|
so, nam, td);
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
2010-02-20 22:29:28 +00:00
|
|
|
CURVNET_RESTORE();
|
2005-06-09 19:59:09 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soconnect2(struct socket *so1, struct socket *so2)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so1->so_vnet);
|
|
|
|
error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sodisconnect(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0)
|
|
|
|
return (ENOTCONN);
|
|
|
|
if (so->so_state & SS_ISDISCONNECTING)
|
|
|
|
return (EALREADY);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Correct two problems relating to sorflush(), which is called to flush
read socket buffers in shutdown() and close():
- Call socantrcvmore() before sblock() to dislodge any threads that
might be sleeping (potentially indefinitely) while holding sblock(),
such as a thread blocked in recv().
- Flag the sblock() call as non-interruptible so that a signal
delivered to the thread calling sorflush() doesn't cause sblock() to
fail. The sblock() is required to ensure that all other socket
consumer threads have, in fact, left, and do not enter, the socket
buffer until we're done flushing it.
To implement the latter, change the 'flags' argument to sblock() to
accept two flags, SBL_WAIT and SBL_NOINTR, rather than one M_WAITOK
flag. When SBL_NOINTR is set, it forces a non-interruptible sx
acquisition, regardless of the setting of the disposition of SB_NOINTR
on the socket buffer; without this change it would be possible for
another thread to clear SB_NOINTR between when the socket buffer mutex
is released and sblock() is invoked.
Reviewed by: bz, kmacy
Reported by: Jos Backus <jos at catnook dot com>
2008-01-31 08:22:24 +00:00
|
|
|
/*
 * Map send/receive message flags to the flags argument of sblock():
 * evaluates to 0 when MSG_DONTWAIT is set in (f), and to SBL_WAIT
 * otherwise.
 */
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
|
2006-01-13 10:22:01 +00:00
|
|
|
|
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
2006-01-13 10:22:01 +00:00
|
|
|
{
|
2012-02-21 01:05:12 +00:00
|
|
|
long space;
|
|
|
|
ssize_t resid;
|
2006-01-13 10:22:01 +00:00
|
|
|
int clen = 0, error, dontroute;
|
|
|
|
|
2012-10-02 18:38:05 +00:00
|
|
|
KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
|
2006-01-13 10:22:01 +00:00
|
|
|
KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
|
2012-10-02 18:38:05 +00:00
|
|
|
("sosend_dgram: !PR_ATOMIC"));
|
2006-01-13 10:22:01 +00:00
|
|
|
|
|
|
|
if (uio != NULL)
|
|
|
|
resid = uio->uio_resid;
|
|
|
|
else
|
|
|
|
resid = top->m_pkthdr.len;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* In theory resid should be unsigned. However, space must be
|
|
|
|
* signed, as it might be less than 0 if we over-committed, and we
|
|
|
|
* must use a signed comparison of space and resid. On the other
|
|
|
|
* hand, a negative resid causes us to loop sending 0-length
|
|
|
|
* segments to the protocol.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
if (resid < 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
dontroute =
|
|
|
|
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
|
|
|
|
if (td != NULL)
|
2007-06-01 01:12:45 +00:00
|
|
|
td->td_ru.ru_msgsnd++;
|
2006-01-13 10:22:01 +00:00
|
|
|
if (control != NULL)
|
|
|
|
clen = control->m_len;
|
|
|
|
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EPIPE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0) {
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* `sendto' and `sendmsg' is allowed on a connection-based
|
|
|
|
* socket if it supports implied connect. Return ENOTCONN if
|
|
|
|
* not connected and no address is supplied.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
|
|
|
|
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
|
|
|
|
!(resid == 0 && clen != 0)) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = ENOTCONN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
} else if (addr == NULL) {
|
|
|
|
if (so->so_proto->pr_flags & PR_CONNREQUIRED)
|
|
|
|
error = ENOTCONN;
|
|
|
|
else
|
|
|
|
error = EDESTADDRREQ;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
|
|
|
|
* problem and need fixing.
|
|
|
|
*/
|
|
|
|
space = sbspace(&so->so_snd);
|
|
|
|
if (flags & MSG_OOB)
|
|
|
|
space += 1024;
|
|
|
|
space -= clen;
|
2006-09-13 06:58:40 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
2006-01-13 10:22:01 +00:00
|
|
|
if (resid > space) {
|
|
|
|
error = EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (uio == NULL) {
|
|
|
|
resid = 0;
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
top->m_flags |= M_EOR;
|
|
|
|
} else {
|
2007-01-22 14:50:28 +00:00
|
|
|
/*
|
|
|
|
* Copy the data from userland into a mbuf chain.
|
|
|
|
* If no data is to be copied in, a single empty mbuf
|
|
|
|
* is returned.
|
|
|
|
*/
|
2006-11-02 17:45:28 +00:00
|
|
|
top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
|
|
|
|
(M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
|
|
|
|
if (top == NULL) {
|
|
|
|
error = EFAULT; /* only possible error */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
space -= resid - uio->uio_resid;
|
2006-01-13 10:22:01 +00:00
|
|
|
resid = uio->uio_resid;
|
|
|
|
}
|
|
|
|
KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
|
|
|
|
/*
|
|
|
|
* XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
|
|
|
|
* than with.
|
|
|
|
*/
|
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options |= SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* XXX all the SBS_CANTSENDMORE checks previously done could be out
|
|
|
|
* of date. We could have recieved a reset packet in an interrupt or
|
|
|
|
* maybe we slept while doing page faults in uiomove() etc. We could
|
|
|
|
* probably recheck again inside the locking protection here, but
|
|
|
|
* there are probably other places that this also happens. We must
|
|
|
|
* rethink this.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2006-01-13 10:22:01 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_send)(so,
|
|
|
|
(flags & MSG_OOB) ? PRUS_OOB :
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user set MSG_EOF, the protocol understands this flag and
|
|
|
|
* nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
((flags & MSG_EOF) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) &&
|
|
|
|
(resid <= 0)) ?
|
|
|
|
PRUS_EOF :
|
|
|
|
/* If there is more to send set PRUS_MORETOCOME */
|
|
|
|
(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
|
|
|
|
top, addr, control, td);
|
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options &= ~SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
clen = 0;
|
|
|
|
control = NULL;
|
|
|
|
top = NULL;
|
|
|
|
out:
|
|
|
|
if (top != NULL)
|
|
|
|
m_freem(top);
|
|
|
|
if (control != NULL)
|
|
|
|
m_freem(control);
|
|
|
|
return (error);
|
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Send on a socket. If send must go all at once and message is larger than
|
|
|
|
* send buffering, then hard error. Lock against other senders. If must go
|
|
|
|
* all at once and not enough room now, then inform user that this would
|
|
|
|
* block and do nothing. Otherwise, if nonblocking, send as much as
|
|
|
|
* possible. The data to be sent is described by "uio" if nonzero, otherwise
|
|
|
|
* by the mbuf chain "top" (which must be null if uio is not). Data provided
|
|
|
|
* in mbuf chain must be small enough to send all at once.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Returns nonzero on error, timeout or signal; callers must check for short
|
|
|
|
* counts if EINTR/ERESTART are returned. Data and control buffers are freed
|
|
|
|
* on return.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2012-02-21 01:05:12 +00:00
|
|
|
long space;
|
|
|
|
ssize_t resid;
|
2005-06-09 19:59:09 +00:00
|
|
|
int clen = 0, error, dontroute;
|
1994-05-24 10:09:53 +00:00
|
|
|
int atomic = sosendallatonce(so) || top;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (uio != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
resid = uio->uio_resid;
|
|
|
|
else
|
|
|
|
resid = top->m_pkthdr.len;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* In theory resid should be unsigned. However, space must be
|
|
|
|
* signed, as it might be less than 0 if we over-committed, and we
|
|
|
|
* must use a signed comparison of space and resid. On the other
|
|
|
|
* hand, a negative resid causes us to loop sending 0-length
|
|
|
|
* segments to the protocol.
|
1997-11-09 05:07:40 +00:00
|
|
|
*
|
|
|
|
* Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
|
|
|
|
* type sockets since that's an error.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1999-01-27 21:50:00 +00:00
|
|
|
if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
|
1997-11-09 05:07:40 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
dontroute =
|
|
|
|
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
|
|
|
|
(so->so_proto->pr_flags & PR_ATOMIC);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (td != NULL)
|
2007-06-01 01:12:45 +00:00
|
|
|
td->td_ru.ru_msgsnd++;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (control != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
clen = control->m_len;
|
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
error = sblock(&so->so_snd, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
restart:
|
1994-05-24 10:09:53 +00:00
|
|
|
do {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EPIPE;
|
|
|
|
goto release;
|
|
|
|
}
|
1998-02-19 19:38:20 +00:00
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1998-02-19 19:38:20 +00:00
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0) {
|
1995-02-07 02:01:16 +00:00
|
|
|
/*
|
|
|
|
* `sendto' and `sendmsg' is allowed on a connection-
|
|
|
|
* based socket if it supports implied connect.
|
|
|
|
* Return ENOTCONN if not connected and no address is
|
|
|
|
* supplied.
|
|
|
|
*/
|
|
|
|
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
|
2007-05-03 14:42:42 +00:00
|
|
|
!(resid == 0 && clen != 0)) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = ENOTCONN;
|
|
|
|
goto release;
|
|
|
|
}
|
|
|
|
} else if (addr == NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
if (so->so_proto->pr_flags & PR_CONNREQUIRED)
|
|
|
|
error = ENOTCONN;
|
|
|
|
else
|
|
|
|
error = EDESTADDRREQ;
|
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
space = sbspace(&so->so_snd);
|
|
|
|
if (flags & MSG_OOB)
|
|
|
|
space += 1024;
|
1994-10-02 17:35:40 +00:00
|
|
|
if ((atomic && resid > so->so_snd.sb_hiwat) ||
|
2007-05-03 14:42:42 +00:00
|
|
|
clen > so->so_snd.sb_hiwat) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EMSGSIZE;
|
|
|
|
goto release;
|
|
|
|
}
|
2002-02-28 11:22:40 +00:00
|
|
|
if (space < resid + clen &&
|
1994-05-24 10:09:53 +00:00
|
|
|
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
|
2007-05-03 14:42:42 +00:00
|
|
|
if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EWOULDBLOCK;
|
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
error = sbwait(&so->so_snd);
|
2007-05-08 12:34:14 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1994-05-24 10:09:53 +00:00
|
|
|
space -= clen;
|
|
|
|
do {
|
2005-11-28 18:09:03 +00:00
|
|
|
if (uio == NULL) {
|
|
|
|
resid = 0;
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
top->m_flags |= M_EOR;
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2007-01-22 14:50:28 +00:00
|
|
|
/*
|
|
|
|
* Copy the data from userland into a mbuf
|
|
|
|
* chain. If no data is to be copied in,
|
|
|
|
* a single empty mbuf is returned.
|
|
|
|
*/
|
2006-11-02 17:45:28 +00:00
|
|
|
top = m_uiotombuf(uio, M_WAITOK, space,
|
|
|
|
(atomic ? max_hdr : 0),
|
|
|
|
(atomic ? M_PKTHDR : 0) |
|
|
|
|
((flags & MSG_EOR) ? M_EOR : 0));
|
|
|
|
if (top == NULL) {
|
|
|
|
error = EFAULT; /* only possible error */
|
|
|
|
goto release;
|
|
|
|
}
|
|
|
|
space -= resid - uio->uio_resid;
|
2005-11-28 18:09:03 +00:00
|
|
|
resid = uio->uio_resid;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options |= SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
/*
|
|
|
|
* XXX all the SBS_CANTSENDMORE checks previously
|
|
|
|
* done could be out of date. We could have recieved
|
|
|
|
* a reset packet in an interrupt or maybe we slept
|
2006-07-23 20:36:04 +00:00
|
|
|
* while doing page faults in uiomove() etc. We
|
|
|
|
* could probably recheck again inside the locking
|
|
|
|
* protection here, but there are probably other
|
|
|
|
* places that this also happens. We must rethink
|
|
|
|
* this.
|
2005-11-28 18:09:03 +00:00
|
|
|
*/
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2005-11-28 18:09:03 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_send)(so,
|
|
|
|
(flags & MSG_OOB) ? PRUS_OOB :
|
1995-02-07 02:01:16 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user set MSG_EOF, the protocol understands
|
|
|
|
* this flag and nothing left to send then use
|
|
|
|
* PRU_SEND_EOF instead of PRU_SEND.
|
1995-02-07 02:01:16 +00:00
|
|
|
*/
|
2005-11-28 18:09:03 +00:00
|
|
|
((flags & MSG_EOF) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) &&
|
|
|
|
(resid <= 0)) ?
|
1999-01-20 17:32:01 +00:00
|
|
|
PRUS_EOF :
|
2006-07-23 20:36:04 +00:00
|
|
|
/* If there is more to send set PRUS_MORETOCOME. */
|
2005-11-28 18:09:03 +00:00
|
|
|
(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
|
|
|
|
top, addr, control, td);
|
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options &= ~SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
clen = 0;
|
|
|
|
control = NULL;
|
|
|
|
top = NULL;
|
2007-05-03 14:42:42 +00:00
|
|
|
if (error)
|
2005-11-28 18:09:03 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
} while (resid && space > 0);
|
|
|
|
} while (resid);
|
|
|
|
|
|
|
|
release:
|
|
|
|
sbunlock(&so->so_snd);
|
|
|
|
out:
|
2005-06-09 19:59:09 +00:00
|
|
|
if (top != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(top);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (control != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(control);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
{
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
int error;
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
|
2009-05-08 14:34:25 +00:00
|
|
|
control, flags, td);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* The part of soreceive() that implements reading non-inline out-of-band
|
|
|
|
* data from a socket. For more complete comments, see soreceive(), from
|
|
|
|
* which this code originated.
|
|
|
|
*
|
|
|
|
* Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
|
|
|
|
* unable to return an mbuf chain to the caller.
|
|
|
|
*/
|
|
|
|
static int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *m;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
m = m_get(M_WAITOK, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
do {
|
|
|
|
error = uiomove(mtod(m, void *),
|
|
|
|
(int) min(uio->uio_resid, m->m_len), uio);
|
|
|
|
m = m_free(m);
|
|
|
|
} while (uio->uio_resid && error == 0 && m);
|
|
|
|
bad:
|
|
|
|
if (m != NULL)
|
|
|
|
m_freem(m);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Following replacement or removal of the first mbuf on the first mbuf chain
|
|
|
|
* of a socket buffer, push necessary state changes back into the socket
|
|
|
|
* buffer so that other consumers see the values consistently. 'nextrecord'
|
|
|
|
* is the callers locally stored value of the original value of
|
|
|
|
* sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
|
|
|
|
* NOTE: 'nextrecord' may be NULL.
|
|
|
|
*/
|
|
|
|
static __inline void
|
|
|
|
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
|
|
|
|
{
|
|
|
|
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
/*
|
|
|
|
* First, update for the new value of nextrecord. If necessary, make
|
|
|
|
* it the first record.
|
|
|
|
*/
|
|
|
|
if (sb->sb_mb != NULL)
|
|
|
|
sb->sb_mb->m_nextpkt = nextrecord;
|
|
|
|
else
|
|
|
|
sb->sb_mb = nextrecord;
|
|
|
|
|
2012-12-07 22:13:33 +00:00
|
|
|
/*
|
|
|
|
* Now update any dependent socket buffer fields to reflect the new
|
|
|
|
* state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
|
2005-06-09 19:59:09 +00:00
|
|
|
* addition of a second clause that takes care of the case where
|
|
|
|
* sb_mb has been updated, but remains the last record.
|
2012-12-07 22:13:33 +00:00
|
|
|
*/
|
|
|
|
if (sb->sb_mb == NULL) {
|
|
|
|
sb->sb_mbtail = NULL;
|
|
|
|
sb->sb_lastrecord = NULL;
|
|
|
|
} else if (sb->sb_mb->m_nextpkt == NULL)
|
|
|
|
sb->sb_lastrecord = sb->sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Implement receive operations on a socket. We depend on the way that
|
|
|
|
* records are added to the sockbuf by sbappend. In particular, each record
|
|
|
|
* (mbufs linked through m_next) must begin with an address if the protocol
|
|
|
|
* so specifies, followed by an optional mbuf or mbufs containing ancillary
|
|
|
|
* data, and then zero or more mbufs of data. In order to allow parallelism
|
|
|
|
* between network receive and copying to user space, as well as avoid
|
|
|
|
* sleeping with a mutex held, we release the socket buffer mutex during the
|
|
|
|
* user space copy. Although the sockbuf is locked, new data may still be
|
|
|
|
* appended, and thus we must maintain consistency of the sockbuf during that
|
|
|
|
* time.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* The caller may receive the data as a single mbuf chain by supplying an
|
|
|
|
* mbuf **mp0 for use in returning the chain. The uio is then used only for
|
|
|
|
* the count in uio_resid.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct mbuf *m, **mp;
|
2012-02-21 01:05:12 +00:00
|
|
|
int flags, error, offset;
|
|
|
|
ssize_t len;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *nextrecord;
|
1994-05-25 09:21:21 +00:00
|
|
|
int moff, type = 0;
|
2012-02-21 01:05:12 +00:00
|
|
|
ssize_t orig_resid = uio->uio_resid;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
mp = mp0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
|
|
|
if (flagsp != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (flags & MSG_OOB)
|
|
|
|
return (soreceive_rcvoob(so, uio, flags));
|
|
|
|
if (mp != NULL)
|
|
|
|
*mp = NULL;
|
|
|
|
if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
|
2011-02-16 21:29:13 +00:00
|
|
|
&& uio->uio_resid) {
|
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, 0);
|
2011-02-16 21:29:13 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2007-05-03 14:42:42 +00:00
|
|
|
restart:
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If we have less data than requested, block awaiting more (subject
|
|
|
|
* to any timeout) if:
|
1994-05-24 10:09:53 +00:00
|
|
|
* 1. the current count is less than the low water mark, or
|
2012-09-02 07:33:52 +00:00
|
|
|
* 2. MSG_DONTWAIT is not set
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_rcv.sb_cc < uio->uio_resid) &&
|
2012-09-02 07:33:52 +00:00
|
|
|
so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
|
2005-06-09 19:59:09 +00:00
|
|
|
m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
|
|
|
|
KASSERT(m != NULL || !so->so_rcv.sb_cc,
|
|
|
|
("receive: m == %p so->so_rcv.sb_cc == %u",
|
|
|
|
m, so->so_rcv.sb_cc));
|
1994-05-24 10:09:53 +00:00
|
|
|
if (so->so_error) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto dontblock;
|
|
|
|
error = so->so_error;
|
|
|
|
if ((flags & MSG_PEEK) == 0)
|
|
|
|
so->so_error = 0;
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
2007-05-03 14:42:42 +00:00
|
|
|
if (m == NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
2007-05-03 14:42:42 +00:00
|
|
|
} else
|
|
|
|
goto dontblock;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
for (; m != NULL; m = m->m_next)
|
1994-05-24 10:09:53 +00:00
|
|
|
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
|
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
goto dontblock;
|
|
|
|
}
|
|
|
|
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
|
|
|
|
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOTCONN;
|
|
|
|
goto release;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
if (uio->uio_resid == 0) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
2007-05-03 14:42:42 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
if ((so->so_state & SS_NBIO) ||
|
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EWOULDBLOCK;
|
|
|
|
goto release;
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = sbwait(&so->so_rcv);
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
dontblock:
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* From this point onward, we maintain 'nextrecord' as a cache of the
|
|
|
|
* pointer to the next record in the socket buffer. We must keep the
|
|
|
|
* various socket buffer pointers and local stack versions of the
|
|
|
|
* pointers in sync, pushing out modifications before dropping the
|
|
|
|
* socket buffer mutex, and re-reading them when picking it up.
|
|
|
|
*
|
|
|
|
* Otherwise, we will race with the network stack appending new data
|
|
|
|
* or records onto the socket buffer by using inconsistent/stale
|
|
|
|
* versions of the field, possibly resulting in socket buffer
|
|
|
|
* corruption.
|
|
|
|
*
|
|
|
|
* By holding the high-level sblock(), we prevent simultaneous
|
|
|
|
* readers from pulling off the front of the socket buffer.
|
|
|
|
*/
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (uio->uio_td)
|
2007-06-01 01:12:45 +00:00
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
if (pr->pr_flags & PR_ADDR) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m->m_type == MT_SONAME,
|
|
|
|
("m->m_type == %d", m->m_type));
|
1994-05-24 10:09:53 +00:00
|
|
|
orig_resid = 0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (psa != NULL)
|
|
|
|
*psa = sodupsockaddr(mtod(m, struct sockaddr *),
|
|
|
|
M_NOWAIT);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
m = m->m_next;
|
|
|
|
} else {
|
|
|
|
sbfree(&so->so_rcv, m);
|
2002-02-05 02:00:56 +00:00
|
|
|
so->so_rcv.sb_mb = m_free(m);
|
1997-08-16 19:16:27 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Process one or more MT_CONTROL mbufs present before any data mbufs
|
|
|
|
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
|
|
|
|
* just copy the data; if !MSG_PEEK, we call into the protocol to
|
|
|
|
* perform externalization (or freeing if controlp == NULL).
|
|
|
|
*/
|
|
|
|
if (m != NULL && m->m_type == MT_CONTROL) {
|
|
|
|
struct mbuf *cm = NULL, *cmn;
|
|
|
|
struct mbuf **cme = &cm;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
if (controlp != NULL) {
|
|
|
|
*controlp = m_copy(m, 0, m->m_len);
|
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
m = m->m_next;
|
2005-06-09 19:56:38 +00:00
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
sbfree(&so->so_rcv, m);
|
|
|
|
so->so_rcv.sb_mb = m->m_next;
|
|
|
|
m->m_next = NULL;
|
|
|
|
*cme = m;
|
|
|
|
cme = &(*cme)->m_next;
|
2004-07-11 23:13:14 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
} while (m != NULL && m->m_type == MT_CONTROL);
|
|
|
|
if ((flags & MSG_PEEK) == 0)
|
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
|
|
|
while (cm != NULL) {
|
|
|
|
cmn = cm->m_next;
|
|
|
|
cm->m_next = NULL;
|
|
|
|
if (pr->pr_domain->dom_externalize != NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*pr->pr_domain->dom_externalize)
|
2013-03-19 20:58:17 +00:00
|
|
|
(cm, controlp, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
} else if (controlp != NULL)
|
|
|
|
*controlp = cm;
|
|
|
|
else
|
|
|
|
m_freem(cm);
|
|
|
|
if (controlp != NULL) {
|
|
|
|
orig_resid = 0;
|
|
|
|
while (*controlp != NULL)
|
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
cm = cmn;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2006-08-18 14:05:13 +00:00
|
|
|
if (m != NULL)
|
2005-07-28 10:10:01 +00:00
|
|
|
nextrecord = so->so_rcv.sb_mb->m_nextpkt;
|
|
|
|
else
|
2006-08-18 14:05:13 +00:00
|
|
|
nextrecord = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
orig_resid = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL) {
|
2003-10-28 05:47:40 +00:00
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m->m_nextpkt == nextrecord,
|
|
|
|
("soreceive: post-control, nextrecord !sync"));
|
2003-10-28 05:47:40 +00:00
|
|
|
if (nextrecord == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_mb == m,
|
2005-06-09 19:59:09 +00:00
|
|
|
("soreceive: post-control, sb_mb!=m"));
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == m,
|
|
|
|
("soreceive: post-control, lastrecord!=m"));
|
2003-10-28 05:47:40 +00:00
|
|
|
}
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
type = m->m_type;
|
|
|
|
if (type == MT_OOBDATA)
|
|
|
|
flags |= MSG_OOB;
|
2003-10-28 05:47:40 +00:00
|
|
|
} else {
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(so->so_rcv.sb_mb == nextrecord,
|
|
|
|
("soreceive: sb_mb != nextrecord"));
|
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == NULL,
|
|
|
|
("soreceive: sb_lastercord != NULL"));
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* Now continue to read any data mbufs off of the head of the socket
|
|
|
|
* buffer until the read request is satisfied. Note that 'type' is
|
|
|
|
* used to store the type of any mbuf reads that have happened so far
|
|
|
|
* such that soreceive() can stop reading if the type changes, which
|
|
|
|
* causes soreceive() to return only one of regular data and inline
|
|
|
|
* out-of-band data in a single socket receive operation.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
moff = 0;
|
|
|
|
offset = 0;
|
2005-06-09 19:59:09 +00:00
|
|
|
while (m != NULL && uio->uio_resid > 0 && error == 0) {
|
|
|
|
/*
|
|
|
|
* If the type of mbuf has changed since the last mbuf
|
|
|
|
* examined ('type'), end the receive operation.
|
2012-12-07 22:13:33 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2012-09-02 07:29:37 +00:00
|
|
|
if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
|
|
|
|
if (type != m->m_type)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
} else if (type == MT_OOBDATA)
|
|
|
|
break;
|
1999-01-08 17:31:30 +00:00
|
|
|
else
|
2005-11-02 13:46:32 +00:00
|
|
|
KASSERT(m->m_type == MT_DATA,
|
2005-06-09 19:59:09 +00:00
|
|
|
("m->m_type == %d", m->m_type));
|
|
|
|
so->so_rcv.sb_state &= ~SBS_RCVATMARK;
|
1994-05-24 10:09:53 +00:00
|
|
|
len = uio->uio_resid;
|
|
|
|
if (so->so_oobmark && len > so->so_oobmark - offset)
|
|
|
|
len = so->so_oobmark - offset;
|
|
|
|
if (len > m->m_len - moff)
|
|
|
|
len = m->m_len - moff;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If mp is set, just pass back the mbufs. Otherwise copy
|
|
|
|
* them out via the uio, then free. Sockbuf must be
|
|
|
|
* consistent here (points to current mbuf, it points to next
|
|
|
|
* record) when we drop priority; we must note any additions
|
|
|
|
* to the sockbuf when we block interrupts again.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp == NULL) {
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
error = uiomove(mtod(m, char *) + moff, (int)len, uio);
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2006-09-22 15:34:16 +00:00
|
|
|
if (error) {
|
|
|
|
/*
|
2006-12-23 21:07:07 +00:00
|
|
|
* The MT_SONAME mbuf has already been removed
|
|
|
|
* from the record, so it is necessary to
|
|
|
|
* remove the data mbufs, if any, to preserve
|
|
|
|
* the invariant in the case of PR_ADDR that
|
|
|
|
* requires MT_SONAME mbufs at the head of
|
|
|
|
* each record.
|
2006-09-22 15:34:16 +00:00
|
|
|
*/
|
2007-02-03 03:57:45 +00:00
|
|
|
if (m && pr->pr_flags & PR_ATOMIC &&
|
|
|
|
((flags & MSG_PEEK) == 0))
|
2006-09-22 15:34:16 +00:00
|
|
|
(void)sbdroprecord_locked(&so->so_rcv);
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1996-11-29 19:03:42 +00:00
|
|
|
goto release;
|
2006-09-22 15:34:16 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
} else
|
|
|
|
uio->uio_resid -= len;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (len == m->m_len - moff) {
|
|
|
|
if (m->m_flags & M_EOR)
|
|
|
|
flags |= MSG_EOR;
|
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
m = m->m_next;
|
|
|
|
moff = 0;
|
|
|
|
} else {
|
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
sbfree(&so->so_rcv, m);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp != NULL) {
|
2013-03-29 13:57:55 +00:00
|
|
|
m->m_nextpkt = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
*mp = m;
|
|
|
|
mp = &m->m_next;
|
|
|
|
so->so_rcv.sb_mb = m = m->m_next;
|
2005-06-09 19:59:09 +00:00
|
|
|
*mp = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_rcv.sb_mb = m_free(m);
|
|
|
|
m = so->so_rcv.sb_mb;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-09-06 17:05:11 +00:00
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (flags & MSG_PEEK)
|
|
|
|
moff += len;
|
|
|
|
else {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp != NULL) {
|
2014-01-16 13:45:41 +00:00
|
|
|
if (flags & MSG_DONTWAIT) {
|
|
|
|
*mp = m_copym(m, 0, len,
|
|
|
|
M_NOWAIT);
|
|
|
|
if (*mp == NULL) {
|
|
|
|
/*
|
|
|
|
* m_copym() couldn't
|
|
|
|
* allocate an mbuf.
|
|
|
|
* Adjust uio_resid back
|
|
|
|
* (it was adjusted
|
|
|
|
* down by len bytes,
|
|
|
|
* which we didn't end
|
|
|
|
* up "copying" over).
|
|
|
|
*/
|
|
|
|
uio->uio_resid += len;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2014-01-16 13:45:41 +00:00
|
|
|
*mp = m_copym(m, 0, len,
|
|
|
|
M_WAITOK);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2012-12-07 22:13:33 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
m->m_data += len;
|
|
|
|
m->m_len -= len;
|
|
|
|
so->so_rcv.sb_cc -= len;
|
|
|
|
}
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (so->so_oobmark) {
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
|
|
|
so->so_oobmark -= len;
|
|
|
|
if (so->so_oobmark == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_rcv.sb_state |= SBS_RCVATMARK;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
offset += len;
|
|
|
|
if (offset == so->so_oobmark)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
break;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the MSG_WAITALL flag is set (for non-atomic socket), we
|
|
|
|
* must not quit until "uio->uio_resid == 0" or an error
|
|
|
|
* termination. If a signal/timeout occurs, return with a
|
|
|
|
* short count but without error. Keep sockbuf locked
|
|
|
|
* against other readers.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
|
|
|
|
!sosendallatonce(so) && nextrecord == NULL) {
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2012-12-07 22:13:33 +00:00
|
|
|
if (so->so_error ||
|
|
|
|
so->so_rcv.sb_state & SBS_CANTRCVMORE)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
2001-03-16 22:37:06 +00:00
|
|
|
/*
|
2005-06-09 19:59:09 +00:00
|
|
|
* Notify the protocol that some data has been
|
|
|
|
* drained before blocking.
|
2001-03-16 22:37:06 +00:00
|
|
|
*/
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
if (pr->pr_flags & PR_WANTRCVD) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2001-03-16 22:37:06 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2011-05-29 18:00:50 +00:00
|
|
|
/*
|
|
|
|
* We could receive some data while was notifying
|
|
|
|
* the protocol. Skip blocking in this case.
|
|
|
|
*/
|
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
error = sbwait(&so->so_rcv);
|
|
|
|
if (error) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
goto release;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
}
|
1994-10-02 17:35:40 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (m != NULL && pr->pr_flags & PR_ATOMIC) {
|
1994-05-24 10:09:53 +00:00
|
|
|
flags |= MSG_TRUNC;
|
2004-07-10 04:38:06 +00:00
|
|
|
if ((flags & MSG_PEEK) == 0)
|
2005-06-09 19:59:09 +00:00
|
|
|
(void) sbdroprecord_locked(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL) {
|
2003-10-28 05:47:40 +00:00
|
|
|
/*
|
|
|
|
* First part is an inline SB_EMPTY_FIXUP(). Second
|
|
|
|
* part makes sure sb_lastrecord is up-to-date if
|
|
|
|
* there is still data in the socket buffer.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_rcv.sb_mb = nextrecord;
|
2003-10-28 05:47:40 +00:00
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
so->so_rcv.sb_mbtail = NULL;
|
|
|
|
so->so_rcv.sb_lastrecord = NULL;
|
|
|
|
} else if (nextrecord->m_nextpkt == NULL)
|
|
|
|
so->so_rcv.sb_lastrecord = nextrecord;
|
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If soreceive() is being done from the socket callback,
|
|
|
|
* then don't need to generate ACK to peer to update window,
|
|
|
|
* since ACK will be generated on return to TCP.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
2006-07-16 23:09:39 +00:00
|
|
|
if (!(flags & MSG_SOCALLBCK) &&
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
(pr->pr_flags & PR_WANTRCVD)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (orig_resid == uio->uio_resid && orig_resid &&
|
2005-06-09 19:59:09 +00:00
|
|
|
(flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1995-05-30 08:16:23 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (flagsp != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
*flagsp |= flags;
|
|
|
|
release:
|
|
|
|
sbunlock(&so->so_rcv);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/*
|
|
|
|
* Optimized version of soreceive() for stream (TCP) sockets.
|
2012-10-29 12:31:12 +00:00
|
|
|
* XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
|
2009-06-22 23:08:05 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
|
|
|
{
|
|
|
|
int len = 0, error = 0, flags, oresid;
|
|
|
|
struct sockbuf *sb;
|
|
|
|
struct mbuf *m, *n = NULL;
|
|
|
|
|
|
|
|
/* We only do stream sockets. */
|
|
|
|
if (so->so_type != SOCK_STREAM)
|
|
|
|
return (EINVAL);
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (controlp != NULL)
|
|
|
|
return (EINVAL);
|
|
|
|
if (flagsp != NULL)
|
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
|
|
|
if (flags & MSG_OOB)
|
|
|
|
return (soreceive_rcvoob(so, uio, flags));
|
|
|
|
if (mp0 != NULL)
|
|
|
|
*mp0 = NULL;
|
|
|
|
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
|
|
|
|
/* Prevent other readers from entering the socket. */
|
|
|
|
error = sblock(sb, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
|
|
|
|
/* Easy one, no space to copyout anything. */
|
|
|
|
if (uio->uio_resid == 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
oresid = uio->uio_resid;
|
|
|
|
|
2011-07-08 10:50:13 +00:00
|
|
|
/* We will never ever get anything unless we are or were connected. */
|
2009-06-22 23:08:05 +00:00
|
|
|
if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
|
2011-07-08 10:50:13 +00:00
|
|
|
error = ENOTCONN;
|
2009-06-22 23:08:05 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
restart:
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
|
|
|
|
/* Abort if socket has reported problems. */
|
|
|
|
if (so->so_error) {
|
|
|
|
if (sb->sb_cc > 0)
|
|
|
|
goto deliver;
|
|
|
|
if (oresid > uio->uio_resid)
|
|
|
|
goto out;
|
|
|
|
error = so->so_error;
|
|
|
|
if (!(flags & MSG_PEEK))
|
|
|
|
so->so_error = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Door is closed. Deliver what is left, if any. */
|
|
|
|
if (sb->sb_state & SBS_CANTRCVMORE) {
|
|
|
|
if (sb->sb_cc > 0)
|
|
|
|
goto deliver;
|
|
|
|
else
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2011-07-08 10:50:13 +00:00
|
|
|
/* Socket buffer is empty and we shall not block. */
|
|
|
|
if (sb->sb_cc == 0 &&
|
|
|
|
((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
|
|
|
|
error = EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/* Socket buffer got some data that we shall deliver now. */
|
|
|
|
if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
|
|
|
|
((sb->sb_flags & SS_NBIO) ||
|
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO)) ||
|
|
|
|
sb->sb_cc >= sb->sb_lowat ||
|
|
|
|
sb->sb_cc >= uio->uio_resid ||
|
|
|
|
sb->sb_cc >= sb->sb_hiwat) ) {
|
|
|
|
goto deliver;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* On MSG_WAITALL we must wait until all data or error arrives. */
|
|
|
|
if ((flags & MSG_WAITALL) &&
|
2012-10-29 12:31:12 +00:00
|
|
|
(sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
|
2009-06-22 23:08:05 +00:00
|
|
|
goto deliver;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait and block until (more) data comes in.
|
|
|
|
* NB: Drops the sockbuf lock during wait.
|
|
|
|
*/
|
|
|
|
error = sbwait(sb);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
goto restart;
|
|
|
|
|
|
|
|
deliver:
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
|
|
|
|
KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
|
|
|
|
|
|
|
|
/* Statistics. */
|
|
|
|
if (uio->uio_td)
|
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
|
|
|
|
|
|
|
/* Fill uio until full or current end of socket buffer is reached. */
|
|
|
|
len = min(uio->uio_resid, sb->sb_cc);
|
|
|
|
if (mp0 != NULL) {
|
|
|
|
/* Dequeue as many mbufs as possible. */
|
|
|
|
if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
|
2012-10-29 12:31:12 +00:00
|
|
|
if (*mp0 == NULL)
|
|
|
|
*mp0 = sb->sb_mb;
|
|
|
|
else
|
|
|
|
m_cat(*mp0, sb->sb_mb);
|
|
|
|
for (m = sb->sb_mb;
|
2009-06-22 23:08:05 +00:00
|
|
|
m != NULL && m->m_len <= len;
|
|
|
|
m = m->m_next) {
|
|
|
|
len -= m->m_len;
|
|
|
|
uio->uio_resid -= m->m_len;
|
|
|
|
sbfree(sb, m);
|
|
|
|
n = m;
|
|
|
|
}
|
2012-10-29 12:31:12 +00:00
|
|
|
n->m_next = NULL;
|
2009-06-22 23:08:05 +00:00
|
|
|
sb->sb_mb = m;
|
2012-10-29 12:31:12 +00:00
|
|
|
sb->sb_lastrecord = sb->sb_mb;
|
2009-06-22 23:08:05 +00:00
|
|
|
if (sb->sb_mb == NULL)
|
|
|
|
SB_EMPTY_FIXUP(sb);
|
|
|
|
}
|
|
|
|
/* Copy the remainder. */
|
|
|
|
if (len > 0) {
|
|
|
|
KASSERT(sb->sb_mb != NULL,
|
|
|
|
("%s: len > 0 && sb->sb_mb empty", __func__));
|
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
|
2009-06-22 23:08:05 +00:00
|
|
|
if (m == NULL)
|
|
|
|
len = 0; /* Don't flush data from sockbuf. */
|
|
|
|
else
|
2012-10-29 12:31:12 +00:00
|
|
|
uio->uio_resid -= len;
|
2009-06-22 23:08:05 +00:00
|
|
|
if (*mp0 != NULL)
|
2012-10-29 12:31:12 +00:00
|
|
|
m_cat(*mp0, m);
|
2009-06-22 23:08:05 +00:00
|
|
|
else
|
|
|
|
*mp0 = m;
|
|
|
|
if (*mp0 == NULL) {
|
|
|
|
error = ENOBUFS;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* NB: Must unlock socket buffer as uiomove may sleep. */
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
error = m_mbuftouio(uio, sb->sb_mb, len);
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(sb);
|
|
|
|
SBLASTMBUFCHK(sb);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the delivered data from the socket buffer unless we
|
|
|
|
* were only peeking.
|
|
|
|
*/
|
|
|
|
if (!(flags & MSG_PEEK)) {
|
|
|
|
if (len > 0)
|
|
|
|
sbdrop_locked(sb, len);
|
|
|
|
|
|
|
|
/* Notify protocol that we drained some data. */
|
|
|
|
if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
|
|
|
|
(((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
|
|
|
|
!(flags & MSG_SOCALLBCK))) {
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2009-06-22 23:08:05 +00:00
|
|
|
(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For MSG_WAITALL we may have to loop again and wait for
|
|
|
|
* more data to come in.
|
|
|
|
*/
|
|
|
|
if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
|
|
|
|
goto restart;
|
|
|
|
out:
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
SBLASTRECORDCHK(sb);
|
|
|
|
SBLASTMBUFCHK(sb);
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
sbunlock(sb);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
/*
|
2008-10-01 13:26:52 +00:00
|
|
|
* Optimized version of soreceive() for simple datagram cases from userspace.
|
|
|
|
* Unlike in the stream case, we're able to drop a datagram if copyout()
|
|
|
|
* fails, and because we handle datagrams atomically, we don't need to use a
|
|
|
|
* sleep lock to prevent I/O interlacing.
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
|
|
|
{
|
|
|
|
struct mbuf *m, *m2;
|
2012-02-21 01:05:12 +00:00
|
|
|
int flags, error;
|
|
|
|
ssize_t len;
|
2008-07-02 23:23:27 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *nextrecord;
|
|
|
|
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
|
|
|
if (flagsp != NULL)
|
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For any complicated cases, fall back to the full
|
|
|
|
* soreceive_generic().
|
|
|
|
*/
|
|
|
|
if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
|
|
|
|
return (soreceive_generic(so, psa, uio, mp0, controlp,
|
|
|
|
flagsp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enforce restrictions on use.
|
|
|
|
*/
|
|
|
|
KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
|
|
|
|
("soreceive_dgram: wantrcvd"));
|
|
|
|
KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
|
|
|
|
KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
|
|
|
|
("soreceive_dgram: SBS_RCVATMARK"));
|
|
|
|
KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
|
|
|
|
("soreceive_dgram: P_CONNREQUIRED"));
|
|
|
|
|
|
|
|
/*
|
2008-10-01 13:26:52 +00:00
|
|
|
* Loop blocking while waiting for a datagram.
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
2008-10-01 13:26:52 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
while ((m = so->so_rcv.sb_mb) == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_cc == 0,
|
|
|
|
("soreceive_dgram: sb_mb NULL but sb_cc %u",
|
|
|
|
so->so_rcv.sb_cc));
|
2008-07-02 23:23:27 +00:00
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
return (error);
|
|
|
|
}
|
2008-10-07 20:57:55 +00:00
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
|
|
|
|
uio->uio_resid == 0) {
|
2008-07-02 23:23:27 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if ((so->so_state & SS_NBIO) ||
|
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
return (EWOULDBLOCK);
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
error = sbwait(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
if (error) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2008-07-02 23:23:27 +00:00
|
|
|
return (error);
|
2008-10-01 13:26:52 +00:00
|
|
|
}
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
if (uio->uio_td)
|
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
if (nextrecord == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == m,
|
2008-09-30 18:44:26 +00:00
|
|
|
("soreceive_dgram: lastrecord != m"));
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
|
|
|
|
("soreceive_dgram: m_nextpkt != nextrecord"));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pull 'm' and its chain off the front of the packet queue.
|
|
|
|
*/
|
|
|
|
so->so_rcv.sb_mb = NULL;
|
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk 'm's chain and free that many bytes from the socket buffer.
|
|
|
|
*/
|
|
|
|
for (m2 = m; m2 != NULL; m2 = m2->m_next)
|
|
|
|
sbfree(&so->so_rcv, m2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do a few last checks before we let go of the lock.
|
|
|
|
*/
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
|
2008-10-01 19:14:05 +00:00
|
|
|
if (pr->pr_flags & PR_ADDR) {
|
|
|
|
KASSERT(m->m_type == MT_SONAME,
|
|
|
|
("m->m_type == %d", m->m_type));
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = sodupsockaddr(mtod(m, struct sockaddr *),
|
|
|
|
M_NOWAIT);
|
|
|
|
m = m_free(m);
|
|
|
|
}
|
|
|
|
if (m == NULL) {
|
|
|
|
/* XXXRW: Can this happen? */
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
/*
|
|
|
|
* Packet to copyout() is now in 'm' and it is disconnected from the
|
|
|
|
* queue.
|
|
|
|
*
|
|
|
|
* Process one or more MT_CONTROL mbufs present before any data mbufs
|
2008-10-01 13:26:52 +00:00
|
|
|
* in the first mbuf chain on the socket buffer. We call into the
|
|
|
|
* protocol to perform externalization (or freeing if controlp ==
|
|
|
|
* NULL).
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
|
|
|
if (m->m_type == MT_CONTROL) {
|
|
|
|
struct mbuf *cm = NULL, *cmn;
|
|
|
|
struct mbuf **cme = &cm;
|
|
|
|
|
|
|
|
do {
|
|
|
|
m2 = m->m_next;
|
|
|
|
m->m_next = NULL;
|
|
|
|
*cme = m;
|
|
|
|
cme = &(*cme)->m_next;
|
|
|
|
m = m2;
|
|
|
|
} while (m != NULL && m->m_type == MT_CONTROL);
|
|
|
|
while (cm != NULL) {
|
|
|
|
cmn = cm->m_next;
|
|
|
|
cm->m_next = NULL;
|
|
|
|
if (pr->pr_domain->dom_externalize != NULL) {
|
|
|
|
error = (*pr->pr_domain->dom_externalize)
|
2013-03-19 20:58:17 +00:00
|
|
|
(cm, controlp, flags);
|
2008-07-02 23:23:27 +00:00
|
|
|
} else if (controlp != NULL)
|
|
|
|
*controlp = cm;
|
|
|
|
else
|
|
|
|
m_freem(cm);
|
|
|
|
if (controlp != NULL) {
|
|
|
|
while (*controlp != NULL)
|
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
cm = cmn;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
|
|
|
|
|
|
|
|
while (m != NULL && uio->uio_resid > 0) {
|
|
|
|
len = uio->uio_resid;
|
|
|
|
if (len > m->m_len)
|
|
|
|
len = m->m_len;
|
|
|
|
error = uiomove(mtod(m, char *), (int)len, uio);
|
|
|
|
if (error) {
|
|
|
|
m_freem(m);
|
|
|
|
return (error);
|
|
|
|
}
|
2010-08-07 17:57:58 +00:00
|
|
|
if (len == m->m_len)
|
|
|
|
m = m_free(m);
|
|
|
|
else {
|
|
|
|
m->m_data += len;
|
|
|
|
m->m_len -= len;
|
|
|
|
}
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
2008-10-01 13:26:52 +00:00
|
|
|
if (m != NULL)
|
2008-07-02 23:23:27 +00:00
|
|
|
flags |= MSG_TRUNC;
|
|
|
|
m_freem(m);
|
|
|
|
if (flagsp != NULL)
|
|
|
|
*flagsp |= flags;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
controlp, flagsp));
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soshutdown(struct socket *so, int how)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2001-02-27 13:48:07 +00:00
|
|
|
if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
|
|
|
|
return (EINVAL);
|
2011-02-16 21:29:13 +00:00
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
2012-12-07 22:13:33 +00:00
|
|
|
if (pr->pr_usrreqs->pru_flush != NULL)
|
|
|
|
(*pr->pr_usrreqs->pru_flush)(so, how);
|
2001-02-27 13:48:07 +00:00
|
|
|
if (how != SHUT_WR)
|
1994-05-24 10:09:53 +00:00
|
|
|
sorflush(so);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
if (how != SHUT_RD) {
|
|
|
|
error = (*pr->pr_usrreqs->pru_shutdown)(so);
|
2013-04-30 15:06:30 +00:00
|
|
|
wakeup(&so->so_timeo);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
|
|
|
}
|
2013-04-30 15:06:30 +00:00
|
|
|
wakeup(&so->so_timeo);
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sorflush(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct sockbuf *sb = &so->so_rcv;
|
|
|
|
struct protosw *pr = so->so_proto;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct sockbuf asb;
|
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2008-02-04 12:25:13 +00:00
|
|
|
* In order to avoid calling dom_dispose with the socket buffer mutex
|
|
|
|
* held, and in order to generally avoid holding the lock for a long
|
|
|
|
* time, we make a copy of the socket buffer and clear the original
|
|
|
|
* (except locks, state). The new socket buffer copy won't have
|
|
|
|
* initialized locks so we can only call routines that won't use or
|
|
|
|
* assert those locks.
|
|
|
|
*
|
Correct two problems relating to sorflush(), which is called to flush
read socket buffers in shutdown() and close():
- Call socantrcvmore() before sblock() to dislodge any threads that
might be sleeping (potentially indefinitely) while holding sblock(),
such as a thread blocked in recv().
- Flag the sblock() call as non-interruptible so that a signal
delivered to the thread calling sorflush() doesn't cause sblock() to
fail. The sblock() is required to ensure that all other socket
consumer threads have, in fact, left, and do not enter, the socket
buffer until we're done flushin it.
To implement the latter, change the 'flags' argument to sblock() to
accept two flags, SBL_WAIT and SBL_NOINTR, rather than one M_WAITOK
flag. When SBL_NOINTR is set, it forces a non-interruptible sx
acquisition, regardless of the setting of the disposition of SB_NOINTR
on the socket buffer; without this change it would be possible for
another thread to clear SB_NOINTR between when the socket buffer mutex
is released and sblock() is invoked.
Reviewed by: bz, kmacy
Reported by: Jos Backus <jos at catnook dot com>
2008-01-31 08:22:24 +00:00
|
|
|
* Dislodge threads currently blocked in receive and wait to acquire
|
|
|
|
* a lock against other simultaneous readers before clearing the
|
|
|
|
* socket buffer. Don't let our acquire be interrupted by a signal
|
|
|
|
* despite any existing socket disposition on interruptable waiting.
|
|
|
|
*/
|
|
|
|
socantrcvmore(so);
|
|
|
|
(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Invalidate/clear most of the sockbuf structure, but leave selinfo
|
|
|
|
* and mutex data unchanged.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_LOCK(sb);
|
2005-06-09 19:59:09 +00:00
|
|
|
bzero(&asb, offsetof(struct sockbuf, sb_startzero));
|
|
|
|
bcopy(&sb->sb_startzero, &asb.sb_startzero,
|
|
|
|
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
|
|
|
|
bzero(&sb->sb_startzero,
|
|
|
|
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
2007-05-03 14:42:42 +00:00
|
|
|
sbunlock(sb);
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2008-02-04 12:25:13 +00:00
|
|
|
/*
|
|
|
|
* Dispose of special rights and flush the socket buffer. Don't call
|
|
|
|
* any unsafe routines (that rely on locks being initialized) on asb.
|
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
(*pr->pr_domain->dom_dispose)(asb.sb_mb);
|
2008-02-04 12:25:13 +00:00
|
|
|
sbrelease_internal(&asb, so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Perhaps this routine, and sooptcopyout(), below, ought to come in an
|
|
|
|
* additional variant to handle the case where the option value needs to be
|
|
|
|
* some kind of integer, but not a specific size. In addition to their use
|
|
|
|
* here, these functions are also called by the protocol-level pr_ctloutput()
|
|
|
|
* routines.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-08-23 03:07:17 +00:00
|
|
|
size_t valsize;
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user gives us more than we wanted, we ignore it, but if we
|
|
|
|
* don't get the minimum length the caller wants, we return EINVAL.
|
|
|
|
* On success, sopt->sopt_valsize is set to however much we actually
|
|
|
|
* retrieved.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
|
|
|
if ((valsize = sopt->sopt_valsize) < minlen)
|
|
|
|
return EINVAL;
|
|
|
|
if (valsize > len)
|
|
|
|
sopt->sopt_valsize = valsize = len;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL)
|
1998-08-23 03:07:17 +00:00
|
|
|
return (copyin(sopt->sopt_val, buf, valsize));
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
bcopy(sopt->sopt_val, buf, valsize);
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1998-08-23 03:07:17 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Kernel version of setsockopt(2).
|
|
|
|
*
|
2005-06-09 19:59:09 +00:00
|
|
|
* XXX: optlen is size_t, not socklen_t
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
so_setsockopt(struct socket *so, int level, int optname, void *optval,
|
|
|
|
size_t optlen)
|
|
|
|
{
|
|
|
|
struct sockopt sopt;
|
|
|
|
|
|
|
|
sopt.sopt_level = level;
|
|
|
|
sopt.sopt_name = optname;
|
|
|
|
sopt.sopt_dir = SOPT_SET;
|
|
|
|
sopt.sopt_val = optval;
|
|
|
|
sopt.sopt_valsize = optlen;
|
|
|
|
sopt.sopt_td = NULL;
|
|
|
|
return (sosetopt(so, &sopt));
|
|
|
|
}
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosetopt(struct socket *so, struct sockopt *sopt)
|
1998-08-23 03:07:17 +00:00
|
|
|
{
|
|
|
|
int error, optval;
|
|
|
|
struct linger l;
|
|
|
|
struct timeval tv;
|
2013-09-01 23:34:53 +00:00
|
|
|
sbintime_t val;
|
2010-11-12 13:02:26 +00:00
|
|
|
uint32_t val32;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
|
|
|
struct mac extmac;
|
|
|
|
#endif
|
1998-08-23 03:07:17 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = 0;
|
|
|
|
if (sopt->sopt_level != SOL_SOCKET) {
|
2012-02-26 13:51:05 +00:00
|
|
|
if (so->so_proto->pr_ctloutput != NULL) {
|
2011-02-16 21:29:13 +00:00
|
|
|
error = (*so->so_proto->pr_ctloutput)(so, sopt);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOPROTOOPT;
|
|
|
|
} else {
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
2000-11-20 01:35:25 +00:00
|
|
|
#ifdef INET
|
|
|
|
case SO_ACCEPTFILTER:
|
|
|
|
error = do_setopt_accept_filter(so, sopt);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
break;
|
2005-06-09 19:59:09 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_LINGER:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
|
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
1998-08-23 03:07:17 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
so->so_linger = l.l_linger;
|
|
|
|
if (l.l_onoff)
|
|
|
|
so->so_options |= SO_LINGER;
|
|
|
|
else
|
|
|
|
so->so_options &= ~SO_LINGER;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_DEBUG:
|
|
|
|
case SO_KEEPALIVE:
|
|
|
|
case SO_DONTROUTE:
|
|
|
|
case SO_USELOOPBACK:
|
|
|
|
case SO_BROADCAST:
|
|
|
|
case SO_REUSEADDR:
|
|
|
|
case SO_REUSEPORT:
|
|
|
|
case SO_OOBINLINE:
|
1996-05-09 20:15:26 +00:00
|
|
|
case SO_TIMESTAMP:
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_BINTIME:
|
|
|
|
case SO_NOSIGPIPE:
|
2008-10-17 01:25:45 +00:00
|
|
|
case SO_NO_DDP:
|
|
|
|
case SO_NO_OFFLOAD:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (optval)
|
|
|
|
so->so_options |= sopt->sopt_name;
|
1994-05-24 10:09:53 +00:00
|
|
|
else
|
1998-08-23 03:07:17 +00:00
|
|
|
so->so_options &= ~sopt->sopt_name;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility call setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibilty of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the desination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
case SO_SETFIB:
|
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
2012-04-03 18:38:00 +00:00
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
|
2012-02-03 11:00:53 +00:00
|
|
|
if (optval < 0 || optval >= rt_numfibs) {
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extent that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility call setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being reponded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect withthe proces that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
2012-02-26 13:51:05 +00:00
|
|
|
if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
|
2012-02-03 11:00:53 +00:00
|
|
|
(so->so_proto->pr_domain->dom_family == PF_INET6) ||
|
2012-04-03 18:38:00 +00:00
|
|
|
(so->so_proto->pr_domain->dom_family == PF_ROUTE)))
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as an EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in the timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
so->so_fibnum = optval;
|
2012-04-03 18:38:00 +00:00
|
|
|
else
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as an EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in the timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
so->so_fibnum = 0;
|
|
|
|
break;
|
2010-11-12 13:02:26 +00:00
|
|
|
|
|
|
|
case SO_USER_COOKIE:
|
|
|
|
error = sooptcopyin(sopt, &val32, sizeof val32,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof val32);
|
2010-11-12 13:02:26 +00:00
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
so->so_user_cookie = val32;
|
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDBUF:
|
|
|
|
case SO_RCVBUF:
|
|
|
|
case SO_SNDLOWAT:
|
|
|
|
case SO_RCVLOWAT:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
1997-06-27 15:28:54 +00:00
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Values < 1 make no sense for any of these options,
|
|
|
|
* so disallow them.
|
1997-06-27 15:28:54 +00:00
|
|
|
*/
|
|
|
|
if (optval < 1) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDBUF:
|
|
|
|
case SO_RCVBUF:
|
1998-08-23 03:07:17 +00:00
|
|
|
if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
|
1999-10-09 20:42:17 +00:00
|
|
|
&so->so_snd : &so->so_rcv, (u_long)optval,
|
2005-06-09 19:59:09 +00:00
|
|
|
so, curthread) == 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOBUFS;
|
|
|
|
goto bad;
|
|
|
|
}
|
2007-02-01 17:53:41 +00:00
|
|
|
(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
|
|
|
|
&so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
1997-06-27 15:28:54 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Make sure the low-water is never greater than the
|
|
|
|
* high-water.
|
1997-06-27 15:28:54 +00:00
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDLOWAT:
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
1997-06-27 15:28:54 +00:00
|
|
|
so->so_snd.sb_lowat =
|
|
|
|
(optval > so->so_snd.sb_hiwat) ?
|
|
|
|
so->so_snd.sb_hiwat : optval;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
case SO_RCVLOWAT:
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
1997-06-27 15:28:54 +00:00
|
|
|
so->so_rcv.sb_lowat =
|
|
|
|
(optval > so->so_rcv.sb_hiwat) ?
|
|
|
|
so->so_rcv.sb_hiwat : optval;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case SO_SNDTIMEO:
|
|
|
|
case SO_RCVTIMEO:
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2008-11-22 12:36:15 +00:00
|
|
|
if (SV_CURPROC_FLAG(SV_ILP32)) {
|
2005-10-27 04:26:35 +00:00
|
|
|
struct timeval32 tv32;
|
|
|
|
|
|
|
|
error = sooptcopyin(sopt, &tv32, sizeof tv32,
|
|
|
|
sizeof tv32);
|
|
|
|
CP(tv32, tv, tv_sec);
|
|
|
|
CP(tv32, tv, tv_usec);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
error = sooptcopyin(sopt, &tv, sizeof tv,
|
|
|
|
sizeof tv);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2013-08-29 15:59:05 +00:00
|
|
|
if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
|
|
|
|
tv.tv_usec >= 1000000) {
|
1999-05-21 15:54:40 +00:00
|
|
|
error = EDOM;
|
|
|
|
goto bad;
|
|
|
|
}
|
2013-09-01 23:34:53 +00:00
|
|
|
val = tvtosbt(tv);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDTIMEO:
|
|
|
|
so->so_snd.sb_timeo = val;
|
|
|
|
break;
|
|
|
|
case SO_RCVTIMEO:
|
|
|
|
so->so_rcv.sb_timeo = val;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_LABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof extmac,
|
|
|
|
sizeof extmac);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
|
|
|
|
so, &extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
|
|
|
error = ENOPROTOOPT;
|
|
|
|
break;
|
|
|
|
}
|
2012-02-26 13:51:05 +00:00
|
|
|
if (error == 0 && so->so_proto->pr_ctloutput != NULL)
|
|
|
|
(void)(*so->so_proto->pr_ctloutput)(so, sopt);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
bad:
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Helper routine for getsockopt.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2003-08-05 00:27:54 +00:00
|
|
|
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-08-23 03:07:17 +00:00
|
|
|
int error;
|
|
|
|
size_t valsize;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Documented get behavior is that we always return a value, possibly
|
|
|
|
* truncated to fit in the user's buffer. Traditional behavior is
|
|
|
|
* that we always tell the user precisely how much we copied, rather
|
|
|
|
* than something useful like the total amount we had available for
|
|
|
|
* her. Note that this interface is not idempotent; the entire
|
|
|
|
* answer must generated ahead of time.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
1998-08-31 15:34:55 +00:00
|
|
|
valsize = min(len, sopt->sopt_valsize);
|
1998-08-31 18:07:23 +00:00
|
|
|
sopt->sopt_valsize = valsize;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_val != NULL) {
|
|
|
|
if (sopt->sopt_td != NULL)
|
1998-08-23 03:07:17 +00:00
|
|
|
error = copyout(buf, sopt->sopt_val, valsize);
|
|
|
|
else
|
|
|
|
bcopy(buf, sopt->sopt_val, valsize);
|
|
|
|
}
|
2006-02-12 15:00:27 +00:00
|
|
|
return (error);
|
1998-08-23 03:07:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sogetopt(struct socket *so, struct sockopt *sopt)
|
1998-08-23 03:07:17 +00:00
|
|
|
{
|
|
|
|
int error, optval;
|
|
|
|
struct linger l;
|
|
|
|
struct timeval tv;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
|
|
|
struct mac extmac;
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = 0;
|
|
|
|
if (sopt->sopt_level != SOL_SOCKET) {
|
2012-02-26 13:51:05 +00:00
|
|
|
if (so->so_proto->pr_ctloutput != NULL)
|
2011-02-16 21:29:13 +00:00
|
|
|
error = (*so->so_proto->pr_ctloutput)(so, sopt);
|
|
|
|
else
|
|
|
|
error = ENOPROTOOPT;
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
2000-11-20 01:35:25 +00:00
|
|
|
#ifdef INET
|
2000-06-20 01:09:23 +00:00
|
|
|
case SO_ACCEPTFILTER:
|
2005-06-09 19:59:09 +00:00
|
|
|
error = do_getopt_accept_filter(so, sopt);
|
2000-06-20 01:09:23 +00:00
|
|
|
break;
|
2005-06-09 19:59:09 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_LINGER:
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
l.l_onoff = so->so_options & SO_LINGER;
|
|
|
|
l.l_linger = so->so_linger;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyout(sopt, &l, sizeof l);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case SO_USELOOPBACK:
|
|
|
|
case SO_DONTROUTE:
|
|
|
|
case SO_DEBUG:
|
|
|
|
case SO_KEEPALIVE:
|
|
|
|
case SO_REUSEADDR:
|
|
|
|
case SO_REUSEPORT:
|
|
|
|
case SO_BROADCAST:
|
|
|
|
case SO_OOBINLINE:
|
2005-08-01 21:15:09 +00:00
|
|
|
case SO_ACCEPTCONN:
|
1996-05-09 20:15:26 +00:00
|
|
|
case SO_TIMESTAMP:
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_BINTIME:
|
|
|
|
case SO_NOSIGPIPE:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_options & sopt->sopt_name;
|
|
|
|
integer:
|
|
|
|
error = sooptcopyout(sopt, &optval, sizeof optval);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case SO_TYPE:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_type;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2012-02-26 13:55:43 +00:00
|
|
|
case SO_PROTOCOL:
|
|
|
|
optval = so->so_proto->pr_protocol;
|
|
|
|
goto integer;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_ERROR:
|
2006-06-18 19:02:49 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_error;
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_error = 0;
|
2006-06-18 19:02:49 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDBUF:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_snd.sb_hiwat;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_RCVBUF:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_rcv.sb_hiwat;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDLOWAT:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_snd.sb_lowat;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_RCVLOWAT:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_rcv.sb_lowat;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDTIMEO:
|
|
|
|
case SO_RCVTIMEO:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = (sopt->sopt_name == SO_SNDTIMEO ?
|
|
|
|
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
|
|
|
|
|
2013-09-01 23:34:53 +00:00
|
|
|
tv = sbttotv(optval);
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2008-11-22 12:36:15 +00:00
|
|
|
if (SV_CURPROC_FLAG(SV_ILP32)) {
|
2005-10-27 04:26:35 +00:00
|
|
|
struct timeval32 tv32;
|
|
|
|
|
|
|
|
CP(tv, tv32, tv_sec);
|
|
|
|
CP(tv, tv32, tv_usec);
|
|
|
|
error = sooptcopyout(sopt, &tv32, sizeof tv32);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
error = sooptcopyout(sopt, &tv, sizeof tv);
|
2005-06-09 19:59:09 +00:00
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_LABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof(extmac),
|
|
|
|
sizeof(extmac));
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
|
|
|
|
so, &extmac);
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = sooptcopyout(sopt, &extmac, sizeof extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_PEERLABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof(extmac),
|
|
|
|
sizeof(extmac));
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = mac_getsockopt_peerlabel(
|
|
|
|
sopt->sopt_td->td_ucred, so, &extmac);
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = sooptcopyout(sopt, &extmac, sizeof extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
|
|
|
case SO_LISTENQLIMIT:
|
|
|
|
optval = so->so_qlimit;
|
|
|
|
goto integer;
|
|
|
|
|
|
|
|
case SO_LISTENQLEN:
|
|
|
|
optval = so->so_qlen;
|
|
|
|
goto integer;
|
|
|
|
|
|
|
|
case SO_LISTENINCQLEN:
|
|
|
|
optval = so->so_incqlen;
|
|
|
|
goto integer;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = ENOPROTOOPT;
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2011-02-16 21:29:13 +00:00
|
|
|
#ifdef MAC
|
|
|
|
bad:
|
|
|
|
#endif
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
int
|
|
|
|
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
|
|
|
|
{
|
|
|
|
struct mbuf *m, *m_prev;
|
|
|
|
int sopt_size = sopt->sopt_valsize;
|
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL)
|
1999-11-22 02:45:11 +00:00
|
|
|
return ENOBUFS;
|
|
|
|
if (sopt_size > MLEN) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
|
1999-11-22 02:45:11 +00:00
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
|
|
|
m_free(m);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
m->m_len = min(MCLBYTES, sopt_size);
|
|
|
|
} else {
|
|
|
|
m->m_len = min(MLEN, sopt_size);
|
|
|
|
}
|
|
|
|
sopt_size -= m->m_len;
|
|
|
|
*mp = m;
|
|
|
|
m_prev = m;
|
|
|
|
|
|
|
|
while (sopt_size) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
m_freem(*mp);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
if (sopt_size > MLEN) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
|
|
|
|
M_NOWAIT);
|
1999-11-22 02:45:11 +00:00
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
m_freem(m);
|
1999-11-22 02:45:11 +00:00
|
|
|
m_freem(*mp);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
m->m_len = min(MCLBYTES, sopt_size);
|
|
|
|
} else {
|
|
|
|
m->m_len = min(MLEN, sopt_size);
|
|
|
|
}
|
|
|
|
sopt_size -= m->m_len;
|
|
|
|
m_prev->m_next = m;
|
|
|
|
m_prev = m;
|
|
|
|
}
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct mbuf *m0 = m;
|
|
|
|
|
|
|
|
if (sopt->sopt_val == NULL)
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyin(sopt->sopt_val, mtod(m, char *),
|
2012-12-07 22:13:33 +00:00
|
|
|
m->m_len);
|
1999-11-22 02:45:11 +00:00
|
|
|
if (error != 0) {
|
|
|
|
m_freem(m0);
|
|
|
|
return(error);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
|
|
|
|
sopt->sopt_valsize -= m->m_len;
|
2005-06-09 19:59:09 +00:00
|
|
|
sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
|
1999-11-22 02:45:11 +00:00
|
|
|
m = m->m_next;
|
|
|
|
}
|
|
|
|
if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
|
|
|
|
panic("ip6_sooptmcopyin");
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct mbuf *m0 = m;
|
|
|
|
size_t valsize = 0;
|
|
|
|
|
|
|
|
if (sopt->sopt_val == NULL)
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyout(mtod(m, char *), sopt->sopt_val,
|
2012-12-07 22:13:33 +00:00
|
|
|
m->m_len);
|
1999-11-22 02:45:11 +00:00
|
|
|
if (error != 0) {
|
|
|
|
m_freem(m0);
|
|
|
|
return(error);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
|
2012-12-07 22:13:33 +00:00
|
|
|
sopt->sopt_valsize -= m->m_len;
|
|
|
|
sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
|
|
|
|
valsize += m->m_len;
|
|
|
|
m = m->m_next;
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
if (m != NULL) {
|
|
|
|
/* enough soopt buffer should be given from user-land */
|
|
|
|
m_freem(m0);
|
|
|
|
return(EINVAL);
|
|
|
|
}
|
|
|
|
sopt->sopt_valsize = valsize;
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* sohasoutofband(): protocol notifies socket layer of the arrival of new
|
|
|
|
* out-of-band data, which will then notify socket consumers.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sohasoutofband(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
if (so->so_sigio != NULL)
|
2005-06-09 19:59:09 +00:00
|
|
|
pgsigio(&so->so_sigio, SIGURG, 0);
|
|
|
|
selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-04-27 20:01:29 +00:00
|
|
|
|
|
|
|
int
|
2005-06-09 19:59:09 +00:00
|
|
|
sopoll(struct socket *so, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
1997-04-27 20:01:29 +00:00
|
|
|
{
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
/*
|
|
|
|
* We do not need to set or assert curvnet as long as everyone uses
|
|
|
|
* sopoll_generic().
|
|
|
|
*/
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
|
|
|
|
td));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
1997-09-14 02:34:14 +00:00
|
|
|
int revents = 0;
|
1997-04-27 20:01:29 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2002-05-31 11:52:35 +00:00
|
|
|
if (events & (POLLIN | POLLRDNORM))
|
2009-07-07 09:43:44 +00:00
|
|
|
if (soreadabledata(so))
|
1997-09-14 02:34:14 +00:00
|
|
|
revents |= events & (POLLIN | POLLRDNORM);
|
1997-04-27 20:01:29 +00:00
|
|
|
|
2002-05-31 11:52:35 +00:00
|
|
|
if (events & (POLLOUT | POLLWRNORM))
|
1997-09-14 02:34:14 +00:00
|
|
|
if (sowriteable(so))
|
|
|
|
revents |= events & (POLLOUT | POLLWRNORM);
|
|
|
|
|
2002-05-31 11:52:35 +00:00
|
|
|
if (events & (POLLPRI | POLLRDBAND))
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
|
1997-09-14 02:34:14 +00:00
|
|
|
revents |= events & (POLLPRI | POLLRDBAND);
|
|
|
|
|
2009-08-25 21:44:14 +00:00
|
|
|
if ((events & POLLINIGNEOF) == 0) {
|
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
|
|
|
revents |= events & (POLLIN | POLLRDNORM);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE)
|
|
|
|
revents |= POLLHUP;
|
|
|
|
}
|
|
|
|
}
|
2009-07-07 09:43:44 +00:00
|
|
|
|
1997-09-14 02:34:14 +00:00
|
|
|
if (revents == 0) {
|
2009-07-07 09:43:44 +00:00
|
|
|
if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
selrecord(td, &so->so_rcv.sb_sel);
|
1997-09-14 02:34:14 +00:00
|
|
|
so->so_rcv.sb_flags |= SB_SEL;
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
|
|
|
|
1997-09-14 02:34:14 +00:00
|
|
|
if (events & (POLLOUT | POLLWRNORM)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
selrecord(td, &so->so_snd.sb_sel);
|
1997-09-14 02:34:14 +00:00
|
|
|
so->so_snd.sb_flags |= SB_SEL;
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
|
|
|
}
|
1997-09-14 02:34:14 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1997-09-14 02:34:14 +00:00
|
|
|
return (revents);
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
int
|
2005-06-09 19:59:09 +00:00
|
|
|
soo_kqfilter(struct file *fp, struct knote *kn)
|
2000-04-16 18:53:38 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2001-02-15 16:34:11 +00:00
|
|
|
struct sockbuf *sb;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
switch (kn->kn_filter) {
|
|
|
|
case EVFILT_READ:
|
|
|
|
if (so->so_options & SO_ACCEPTCONN)
|
|
|
|
kn->kn_fop = &solisten_filtops;
|
2005-06-09 19:56:38 +00:00
|
|
|
else
|
2005-06-09 19:59:09 +00:00
|
|
|
kn->kn_fop = &soread_filtops;
|
2001-02-15 16:34:11 +00:00
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case EVFILT_WRITE:
|
|
|
|
kn->kn_fop = &sowrite_filtops;
|
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
|
|
|
default:
|
2005-06-09 19:59:09 +00:00
|
|
|
return (EINVAL);
|
2001-02-15 16:34:11 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
knlist_add(&sb->sb_sel.si_note, kn, 1);
|
2001-02-15 16:34:11 +00:00
|
|
|
sb->sb_flags |= SB_KNOTE;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(sb);
|
2000-04-16 18:53:38 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
|
|
|
* Some routines that return EOPNOTSUPP for entry points that are not
|
|
|
|
* supported by a protocol. Fill in as needed.
|
|
|
|
*/
|
|
|
|
/* Stub for protocols without an accept entry point: always EOPNOTSUPP. */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without an attach entry point: always EOPNOTSUPP. */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a bind entry point: always EOPNOTSUPP. */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/* Stub for protocols without a bindat entry point: always EOPNOTSUPP. */
int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/* Stub for protocols without a connect entry point: always EOPNOTSUPP. */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/* Stub for protocols without a connectat entry point: always EOPNOTSUPP. */
int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/* Stub for protocols without a connect2 entry point: always EOPNOTSUPP. */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
int
|
|
|
|
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
|
2007-05-16 20:41:08 +00:00
|
|
|
struct ifnet *ifp, struct thread *td)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
return EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a disconnect entry point: always EOPNOTSUPP. */
int
pru_disconnect_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a listen entry point: always EOPNOTSUPP. */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a peeraddr entry point: always EOPNOTSUPP. */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a rcvd entry point: always EOPNOTSUPP. */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a rcvoob entry point: always EOPNOTSUPP. */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a send entry point: always EOPNOTSUPP. */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* This isn't really a ``null'' operation, but it's the default one and
|
|
|
|
* doesn't do anything destructive.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
pru_sense_null(struct socket *so, struct stat *sb)
|
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
sb->st_blksize = so->so_snd.sb_hiwat;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a shutdown entry point: always EOPNOTSUPP. */
int
pru_shutdown_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a sockaddr entry point: always EOPNOTSUPP. */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a sosend entry point: always EOPNOTSUPP. */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a soreceive entry point: always EOPNOTSUPP. */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/* Stub for protocols without a sopoll entry point: always EOPNOTSUPP. */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2000-04-16 18:53:38 +00:00
|
|
|
static void
|
|
|
|
filt_sordetach(struct knote *kn)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
|
|
|
|
if (knlist_empty(&so->so_rcv.sb_sel.si_note))
|
2000-04-16 18:53:38 +00:00
|
|
|
so->so_rcv.sb_flags &= ~SB_KNOTE;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_soread(struct knote *kn, long hint)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so;
|
2004-08-24 05:28:18 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
so = kn->kn_fp->f_data;
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
|
|
|
|
kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
|
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
|
|
|
kn->kn_flags |= EV_EOF;
|
2001-02-24 01:33:12 +00:00
|
|
|
kn->kn_fflags = so->so_error;
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
} else if (so->so_error) /* temporary udp error */
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
else if (kn->kn_sfflags & NOTE_LOWAT)
|
2004-08-24 05:28:18 +00:00
|
|
|
return (kn->kn_data >= kn->kn_sdata);
|
2005-06-09 19:59:09 +00:00
|
|
|
else
|
|
|
|
return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_sowdetach(struct knote *kn)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
|
|
|
|
if (knlist_empty(&so->so_snd.sb_sel.si_note))
|
2000-04-16 18:53:38 +00:00
|
|
|
so->so_snd.sb_flags &= ~SB_KNOTE;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_sowrite(struct knote *kn, long hint)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
so = kn->kn_fp->f_data;
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_snd);
|
2000-04-16 18:53:38 +00:00
|
|
|
kn->kn_data = sbspace(&so->so_snd);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
kn->kn_flags |= EV_EOF;
|
2001-02-24 01:33:12 +00:00
|
|
|
kn->kn_fflags = so->so_error;
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
} else if (so->so_error) /* temporary udp error */
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
else if (((so->so_state & SS_ISCONNECTED) == 0) &&
|
2002-05-31 11:52:35 +00:00
|
|
|
(so->so_proto->pr_flags & PR_CONNREQUIRED))
|
2004-08-24 05:28:18 +00:00
|
|
|
return (0);
|
2005-06-09 19:59:09 +00:00
|
|
|
else if (kn->kn_sfflags & NOTE_LOWAT)
|
2004-08-24 05:28:18 +00:00
|
|
|
return (kn->kn_data >= kn->kn_sdata);
|
2005-06-09 19:59:09 +00:00
|
|
|
else
|
|
|
|
return (kn->kn_data >= so->so_snd.sb_lowat);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_solisten(struct knote *kn, long hint)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2002-04-26 02:07:46 +00:00
|
|
|
kn->kn_data = so->so_qlen;
|
2012-12-07 22:13:33 +00:00
|
|
|
return (!TAILQ_EMPTY(&so->so_comp));
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
socheckuid(struct socket *so, uid_t uid)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (so == NULL)
|
|
|
|
return (EPERM);
|
|
|
|
if (so->so_cred->cr_uid != uid)
|
|
|
|
return (EPERM);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* These functions are used by protocols to notify the socket layer (and its
|
|
|
|
* consumers) of state changes in the sockets driven by protocol-side events.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Procedures to manipulate state flags of socket and do appropriate wakeups.
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Normal sequence from the active (originating) side is that
|
|
|
|
* soisconnecting() is called during processing of connect() call, resulting
|
|
|
|
* in an eventual call to soisconnected() if/when the connection is
|
|
|
|
* established. When the connection is torn down soisdisconnecting() is
|
|
|
|
* called during processing of disconnect() call, and soisdisconnected() is
|
|
|
|
* called when the connection to the peer is totally severed. The semantics
|
|
|
|
* of these routines are such that connectionless protocols can call
|
|
|
|
* soisconnected() and soisdisconnected() only, bypassing the in-progress
|
|
|
|
* calls when setting up a ``connection'' takes no time.
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* From the passive side, a socket is created with two queues of sockets:
|
|
|
|
* so_incomp for connections in progress and so_comp for connections already
|
|
|
|
* made and awaiting user acceptance. As a protocol is preparing incoming
|
|
|
|
* connections, it creates a socket structure queued on so_incomp by calling
|
|
|
|
* sonewconn(). When the connection is established, soisconnected() is
|
|
|
|
* called, and transfers the socket structure to so_comp, making it available
|
|
|
|
* to accept().
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* If a socket is closed with sockets on either so_incomp or so_comp, these
|
|
|
|
* sockets are dropped.
|
|
|
|
*
|
|
|
|
* If higher-level protocols are implemented in the kernel, the wakeups done
|
|
|
|
* here will sometimes cause software-interrupt process scheduling.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisconnecting(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
|
|
|
|
so->so_state |= SS_ISCONNECTING;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisconnected(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
2012-12-07 22:13:33 +00:00
|
|
|
struct socket *head;
|
2009-06-01 21:17:03 +00:00
|
|
|
int ret;
|
2007-03-26 08:59:03 +00:00
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
restart:
|
2007-03-26 08:59:03 +00:00
|
|
|
ACCEPT_LOCK();
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
|
|
|
|
so->so_state |= SS_ISCONNECTED;
|
|
|
|
head = so->so_head;
|
|
|
|
if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
|
|
|
|
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
TAILQ_REMOVE(&head->so_incomp, so, so_list);
|
|
|
|
head->so_incqlen--;
|
|
|
|
so->so_qstate &= ~SQ_INCOMP;
|
|
|
|
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
|
|
|
|
head->so_qlen++;
|
|
|
|
so->so_qstate |= SQ_COMP;
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
sorwakeup(head);
|
|
|
|
wakeup_one(&head->so_timeo);
|
|
|
|
} else {
|
|
|
|
ACCEPT_UNLOCK();
|
2009-06-01 21:17:03 +00:00
|
|
|
soupcall_set(so, SO_RCV,
|
|
|
|
head->so_accf->so_accept_filter->accf_callback,
|
|
|
|
head->so_accf->so_accept_filter_arg);
|
2007-03-26 08:59:03 +00:00
|
|
|
so->so_options &= ~SO_ACCEPTFILTER;
|
2009-06-01 21:17:03 +00:00
|
|
|
ret = head->so_accf->so_accept_filter->accf_callback(so,
|
2012-12-05 08:04:20 +00:00
|
|
|
head->so_accf->so_accept_filter_arg, M_NOWAIT);
|
2009-06-01 21:17:03 +00:00
|
|
|
if (ret == SU_ISCONNECTED)
|
|
|
|
soupcall_clear(so, SO_RCV);
|
2007-03-26 08:59:03 +00:00
|
|
|
SOCK_UNLOCK(so);
|
2009-06-01 21:17:03 +00:00
|
|
|
if (ret == SU_ISCONNECTED)
|
|
|
|
goto restart;
|
2007-03-26 08:59:03 +00:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
ACCEPT_UNLOCK();
|
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
sorwakeup(so);
|
|
|
|
sowwakeup(so);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisdisconnecting(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: This code assumes that SOCK_LOCK(so) and
|
|
|
|
* SOCKBUF_LOCK(&so->so_rcv) are the same.
|
|
|
|
*/
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
so->so_state &= ~SS_ISCONNECTING;
|
|
|
|
so->so_state |= SS_ISDISCONNECTING;
|
|
|
|
so->so_rcv.sb_state |= SBS_CANTRCVMORE;
|
|
|
|
sorwakeup_locked(so);
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
so->so_snd.sb_state |= SBS_CANTSENDMORE;
|
|
|
|
sowwakeup_locked(so);
|
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisdisconnected(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note: This code assumes that SOCK_LOCK(so) and
|
|
|
|
* SOCKBUF_LOCK(&so->so_rcv) are the same.
|
|
|
|
*/
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
|
|
|
|
so->so_state |= SS_ISDISCONNECTED;
|
|
|
|
so->so_rcv.sb_state |= SBS_CANTRCVMORE;
|
|
|
|
sorwakeup_locked(so);
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
so->so_snd.sb_state |= SBS_CANTSENDMORE;
|
|
|
|
sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
|
|
|
|
sowwakeup_locked(so);
|
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
|
|
|
|
*/
|
|
|
|
struct sockaddr *
|
|
|
|
sodupsockaddr(const struct sockaddr *sa, int mflags)
|
|
|
|
{
|
|
|
|
struct sockaddr *sa2;
|
|
|
|
|
|
|
|
sa2 = malloc(sa->sa_len, M_SONAME, mflags);
|
|
|
|
if (sa2)
|
|
|
|
bcopy(sa, sa2, sa->sa_len);
|
|
|
|
return sa2;
|
|
|
|
}
|
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
/*
|
|
|
|
* Register per-socket buffer upcalls.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
soupcall_set(struct socket *so, int which,
|
|
|
|
int (*func)(struct socket *, void *, int), void *arg)
|
|
|
|
{
|
|
|
|
struct sockbuf *sb;
|
2012-12-07 22:13:33 +00:00
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
switch (which) {
|
|
|
|
case SO_RCV:
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case SO_SND:
|
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("soupcall_set: bad which");
|
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
#if 0
|
|
|
|
/* XXX: accf_http actually wants to do this on purpose. */
|
|
|
|
KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
|
|
|
|
#endif
|
|
|
|
sb->sb_upcall = func;
|
|
|
|
sb->sb_upcallarg = arg;
|
|
|
|
sb->sb_flags |= SB_UPCALL;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
soupcall_clear(struct socket *so, int which)
|
|
|
|
{
|
|
|
|
struct sockbuf *sb;
|
|
|
|
|
|
|
|
switch (which) {
|
|
|
|
case SO_RCV:
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case SO_SND:
|
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("soupcall_clear: bad which");
|
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
|
|
|
|
sb->sb_upcall = NULL;
|
|
|
|
sb->sb_upcallarg = NULL;
|
|
|
|
sb->sb_flags &= ~SB_UPCALL;
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Create an external-format (``xsocket'') structure using the information in
|
|
|
|
* the kernel-format socket structure pointed to by so. This is done to
|
|
|
|
* reduce the spew of irrelevant information over this interface, to isolate
|
|
|
|
* user code from changes in the kernel structure, and potentially to provide
|
|
|
|
* information-hiding if we decide that some of this information should be
|
|
|
|
* hidden from users.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
sotoxsocket(struct socket *so, struct xsocket *xso)
|
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
xso->xso_len = sizeof *xso;
|
|
|
|
xso->xso_so = so;
|
|
|
|
xso->so_type = so->so_type;
|
|
|
|
xso->so_options = so->so_options;
|
|
|
|
xso->so_linger = so->so_linger;
|
|
|
|
xso->so_state = so->so_state;
|
|
|
|
xso->so_pcb = so->so_pcb;
|
|
|
|
xso->xso_protocol = so->so_proto->pr_protocol;
|
|
|
|
xso->xso_family = so->so_proto->pr_domain->dom_family;
|
|
|
|
xso->so_qlen = so->so_qlen;
|
|
|
|
xso->so_incqlen = so->so_incqlen;
|
|
|
|
xso->so_qlimit = so->so_qlimit;
|
|
|
|
xso->so_timeo = so->so_timeo;
|
|
|
|
xso->so_error = so->so_error;
|
|
|
|
xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
|
|
|
|
xso->so_oobmark = so->so_oobmark;
|
|
|
|
sbtoxsockbuf(&so->so_snd, &xso->so_snd);
|
|
|
|
sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
|
|
|
|
xso->so_uid = so->so_cred->cr_uid;
|
|
|
|
}
|
2008-07-21 00:49:34 +00:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Socket accessor functions to provide external consumers with
|
|
|
|
* a safe interface to socket state
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
void
|
2012-12-07 22:13:33 +00:00
|
|
|
so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
|
|
|
|
void *arg)
|
2008-07-21 00:49:34 +00:00
|
|
|
{
|
2012-12-07 22:13:33 +00:00
|
|
|
|
2008-07-21 00:49:34 +00:00
|
|
|
TAILQ_FOREACH(so, &so->so_comp, so_list)
|
|
|
|
func(so, arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct sockbuf *
|
|
|
|
so_sockbuf_rcv(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct sockbuf *
|
|
|
|
so_sockbuf_snd(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (&so->so_snd);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_state_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_state);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_state_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_state = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_options_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_options_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_options = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_error_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_error);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_error_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_error = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_linger_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_linger);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_linger_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_linger = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct protosw *
|
|
|
|
so_protosw_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_proto);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_protosw_set(struct socket *so, struct protosw *val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_proto = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Function wrapper around sorwakeup() for external consumers. */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
|
|
|
|
|
|
|
|
/* Function wrapper around sowwakeup() for external consumers. */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
|
|
|
|
|
|
|
|
/* Function wrapper around sorwakeup_locked() for external consumers. */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
|
|
|
|
|
|
|
|
/* Function wrapper around sowwakeup_locked() for external consumers. */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
|
|
|
|
|
|
|
|
/* Function wrapper around SOCK_LOCK(): acquire the socket lock. */
void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}
|
|
|
|
|
|
|
|
/* Function wrapper around SOCK_UNLOCK(): release the socket lock. */
void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
|