2005-06-09 19:59:09 +00:00
|
|
|
/*-
|
2017-11-20 19:43:44 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1990, 1993
|
2007-01-08 17:49:59 +00:00
|
|
|
* The Regents of the University of California.
|
2006-03-15 12:45:35 +00:00
|
|
|
* Copyright (c) 2004 The FreeBSD Foundation
|
2008-07-03 06:47:45 +00:00
|
|
|
* Copyright (c) 2004-2008 Robert N. M. Watson
|
2007-01-08 17:49:59 +00:00
|
|
|
* All rights reserved.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2016-09-15 13:16:20 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
|
|
|
|
*/
|
|
|
|
|
2006-04-01 10:43:02 +00:00
|
|
|
/*
|
|
|
|
* Comments on the socket life cycle:
|
|
|
|
*
|
|
|
|
* soalloc() sets up socket layer state for a socket, called only by
|
|
|
|
* socreate() and sonewconn(). Socket layer private.
|
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* sodealloc() tears down socket layer state for a socket, called only by
|
2006-04-01 10:43:02 +00:00
|
|
|
* sofree() and sonewconn(). Socket layer private.
|
|
|
|
*
|
|
|
|
* pru_attach() associates protocol layer state with an allocated socket;
|
|
|
|
* called only once, may fail, aborting socket allocation. This is called
|
|
|
|
* from socreate() and sonewconn(). Socket layer private.
|
|
|
|
*
|
|
|
|
* pru_detach() disassociates protocol layer state from an attached socket,
|
|
|
|
* and will be called exactly once for sockets in which pru_attach() has
|
|
|
|
* been successfully called. If pru_attach() returned an error,
|
|
|
|
* pru_detach() will not be called. Socket layer private.
|
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* pru_abort() and pru_close() notify the protocol layer that the last
|
|
|
|
* consumer of a socket is starting to tear down the socket, and that the
|
|
|
|
* protocol should terminate the connection. Historically, pru_abort() also
|
|
|
|
* detached protocol state from the socket state, but this is no longer the
|
|
|
|
* case.
|
|
|
|
*
|
2006-04-01 10:43:02 +00:00
|
|
|
* socreate() creates a socket and attaches protocol state. This is a public
|
|
|
|
* interface that may be used by socket layer consumers to create new
|
|
|
|
* sockets.
|
|
|
|
*
|
|
|
|
* sonewconn() creates a socket and attaches protocol state. This is a
|
|
|
|
* public interface that may be used by protocols to create new sockets when
|
|
|
|
* a new connection is received and will be available for accept() on a
|
|
|
|
* listen socket.
|
|
|
|
*
|
|
|
|
* soclose() destroys a socket after possibly waiting for it to disconnect.
|
|
|
|
* This is a public interface that socket consumers should use to close and
|
|
|
|
* release a socket when done with it.
|
|
|
|
*
|
|
|
|
* soabort() destroys a socket without waiting for it to disconnect (used
|
|
|
|
* only for incoming connections that are already partially or fully
|
|
|
|
* connected). This is used internally by the socket layer when clearing
|
|
|
|
* listen socket queues (due to overflow or close on the listen socket), but
|
|
|
|
* is also a public interface protocols may use to abort connections in
|
|
|
|
* their incomplete listen queues should they no longer be required. Sockets
|
2006-07-23 20:36:04 +00:00
|
|
|
* placed in completed connection listen queues should not be aborted for
|
|
|
|
* reasons described in the comment above the soclose() implementation. This
|
|
|
|
* is not a general purpose close routine, and except in the specific
|
|
|
|
* circumstances described here, should not be used.
|
2006-04-01 10:43:02 +00:00
|
|
|
*
|
|
|
|
* sofree() will free a socket and its protocol state if all references on
|
|
|
|
* the socket have been released, and is the public interface to attempt to
|
|
|
|
* free a socket when a reference is removed. This is a socket layer private
|
|
|
|
* interface.
|
|
|
|
*
|
|
|
|
* NOTE: In addition to socreate() and soclose(), which provide a single
|
|
|
|
* socket reference to the consumer to be managed as required, there are two
|
|
|
|
* calls to explicitly manage socket references, soref(), and sorele().
|
|
|
|
* Currently, these are generally required only when transitioning a socket
|
|
|
|
* from a listen queue to a file descriptor, in order to prevent garbage
|
|
|
|
* collection of the socket at an untimely moment. For a number of reasons,
|
|
|
|
* these interfaces are not preferred, and should be avoided.
|
2012-12-07 22:13:33 +00:00
|
|
|
*
|
2011-02-16 21:29:13 +00:00
|
|
|
* NOTE: With regard to VNETs the general rule is that callers do not set
|
|
|
|
* curvnet. Exceptions to this rule include soabort(), sodisconnect(),
|
|
|
|
* sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
|
|
|
|
* and sorflush(), which are usually called from a pre-set VNET context.
|
|
|
|
* sopoll() currently does not need a VNET context to be set.
|
2006-04-01 10:43:02 +00:00
|
|
|
*/
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2000-11-20 01:35:25 +00:00
|
|
|
#include "opt_inet.h"
|
MFp4:
Bring in updated jail support from bz_jail branch.
This enhances the current jail implementation to permit multiple
addresses per jail. In addition to IPv4, IPv6 is supported as well.
Due to updated checks it is even possible to have jails without
an IP address at all, which basically gives one a chroot with
restricted process view, no networking,..
SCTP support was updated and supports IPv6 in jails as well.
Cpuset support permits jails to be bound to specific processor
sets after creation.
Jails can have an unrestricted (no duplicate protection, etc.) name
in addition to the hostname. The jail name cannot be changed from
within a jail and is considered to be used for management purposes
or as audit-token in the future.
DDB 'show jails' command was added to aid debugging.
Proper compat support permits 32bit jail binaries to be used on 64bit
systems to manage jails. Also backward compatibility was preserved where
possible: for jail v1 syscalls, as well as with user space management
utilities.
Both jail as well as prison version were updated for the new features.
A gap was intentionally left as the intermediate versions had been
used by various patches floating around the last years.
Bump __FreeBSD_version for the afore mentioned and in kernel changes.
Special thanks to:
- Pawel Jakub Dawidek (pjd) for his multi-IPv4 patches
and Olivier Houchard (cognet) for initial single-IPv6 patches.
- Jeff Roberson (jeff) and Randall Stewart (rrs) for their
help, ideas and review on cpuset and SCTP support.
- Robert Watson (rwatson) for lots and lots of help, discussions,
suggestions and review of most of the patch at various stages.
- John Baldwin (jhb) for his help.
- Simon L. Nielsen (simon) as early adopter testing changes
on cluster machines as well as all the testers and people
who provided feedback the last months on freebsd-jail and
other channels.
- My employer, CK Software GmbH, for the support so I could work on this.
Reviewed by: (see above)
MFC after: 3 months (this is just so that I get the mail)
X-MFC Before: 7.2-RELEASE if possible
2008-11-29 14:32:14 +00:00
|
|
|
#include "opt_inet6.h"
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile_iodone() calls ktls_enqueue() instead of pru_ready() leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#include "opt_kern_tls.h"
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
#include "opt_sctp.h"
|
2000-11-20 01:35:25 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
1997-03-23 03:37:54 +00:00
|
|
|
#include <sys/fcntl.h>
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/limits.h>
|
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mac.h>
|
2002-08-01 17:47:56 +00:00
|
|
|
#include <sys/malloc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/mbuf.h>
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <sys/mutex.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/domain.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/file.h> /* for struct knote */
|
2014-08-18 23:45:40 +00:00
|
|
|
#include <sys/hhook.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/kernel.h>
|
2014-08-18 23:45:40 +00:00
|
|
|
#include <sys/khelp.h>
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile_iodone() calls ktls_enqueue() instead of pru_ready() leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#include <sys/ktls.h>
|
2000-04-16 18:53:38 +00:00
|
|
|
#include <sys/event.h>
|
2006-06-10 14:34:07 +00:00
|
|
|
#include <sys/eventhandler.h>
|
1997-09-14 02:34:14 +00:00
|
|
|
#include <sys/poll.h>
|
1999-06-17 23:54:50 +00:00
|
|
|
#include <sys/proc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/protosw.h>
|
2020-04-14 15:30:34 +00:00
|
|
|
#include <sys/sbuf.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
|
|
|
#include <sys/resourcevar.h>
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
#include <net/route.h>
|
1994-10-02 17:35:40 +00:00
|
|
|
#include <sys/signalvar.h>
|
2007-03-26 08:59:03 +00:00
|
|
|
#include <sys/stat.h>
|
2007-05-03 14:42:42 +00:00
|
|
|
#include <sys/sx.h>
|
1995-11-03 18:33:46 +00:00
|
|
|
#include <sys/sysctl.h>
|
2016-03-01 18:12:14 +00:00
|
|
|
#include <sys/taskqueue.h>
|
1998-03-28 10:33:27 +00:00
|
|
|
#include <sys/uio.h>
|
2020-04-14 15:30:34 +00:00
|
|
|
#include <sys/un.h>
|
|
|
|
#include <sys/unpcb.h>
|
2000-06-04 04:28:31 +00:00
|
|
|
#include <sys/jail.h>
|
2012-10-29 12:14:57 +00:00
|
|
|
#include <sys/syslog.h>
|
2013-03-11 17:43:55 +00:00
|
|
|
#include <netinet/in.h>
|
2020-04-14 15:30:34 +00:00
|
|
|
#include <netinet/in_pcb.h>
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile_iodone() calls ktls_enqueue() instead of pru_ready() leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#include <netinet/tcp.h>
|
2009-08-01 19:26:27 +00:00
|
|
|
|
|
|
|
#include <net/vnet.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
#include <vm/uma.h>
|
1995-11-03 18:33:46 +00:00
|
|
|
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2005-10-27 04:26:35 +00:00
|
|
|
#include <sys/mount.h>
|
2008-11-22 12:36:15 +00:00
|
|
|
#include <sys/sysent.h>
|
2005-10-27 04:26:35 +00:00
|
|
|
#include <compat/freebsd32/freebsd32.h>
|
|
|
|
#endif
|
1997-08-21 20:33:42 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
|
|
|
|
int flags);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
static void so_rdknl_lock(void *);
|
|
|
|
static void so_rdknl_unlock(void *);
|
|
|
|
static void so_rdknl_assert_locked(void *);
|
|
|
|
static void so_rdknl_assert_unlocked(void *);
|
|
|
|
static void so_wrknl_lock(void *);
|
|
|
|
static void so_wrknl_unlock(void *);
|
|
|
|
static void so_wrknl_assert_locked(void *);
|
|
|
|
static void so_wrknl_assert_unlocked(void *);
|
2005-06-09 19:59:09 +00:00
|
|
|
|
|
|
|
static void filt_sordetach(struct knote *kn);
|
|
|
|
static int filt_soread(struct knote *kn, long hint);
|
|
|
|
static void filt_sowdetach(struct knote *kn);
|
2000-04-16 18:53:38 +00:00
|
|
|
static int filt_sowrite(struct knote *kn, long hint);
|
2017-01-16 08:25:33 +00:00
|
|
|
static int filt_soempty(struct knote *kn, long hint);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
/*
 * Forward declaration.  The function specifier is kept next to the storage
 * class ("static inline int"); writing it after the type ("static int
 * inline") is accepted by the compiler but draws GCC's
 * -Wold-style-declaration warning.
 */
static inline int hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
|
2014-08-26 14:44:08 +00:00
|
|
|
fo_kqfilter_t soo_kqfilter;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops soread_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sordetach,
|
|
|
|
.f_event = filt_soread,
|
|
|
|
};
|
|
|
|
static struct filterops sowrite_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sowdetach,
|
|
|
|
.f_event = filt_sowrite,
|
|
|
|
};
|
2017-01-16 08:25:33 +00:00
|
|
|
static struct filterops soempty_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_sowdetach,
|
|
|
|
.f_event = filt_soempty,
|
|
|
|
};
|
2000-04-16 18:53:38 +00:00
|
|
|
|
1998-05-15 20:11:40 +00:00
|
|
|
so_gen_t so_gencnt; /* generation count for sockets */
|
|
|
|
|
1997-10-12 20:26:33 +00:00
|
|
|
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
|
|
|
|
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
|
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
#define VNET_SO_ASSERT(so) \
|
|
|
|
VNET_ASSERT(curvnet != NULL, \
|
|
|
|
("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
|
|
|
|
|
2014-08-18 23:45:40 +00:00
|
|
|
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
|
|
|
|
#define V_socket_hhh VNET(socket_hhh)
|
|
|
|
|
2012-10-20 10:51:32 +00:00
|
|
|
/*
|
|
|
|
* Limit on the number of connections in the listen queue waiting
|
|
|
|
* for accept(2).
|
2016-04-29 22:15:33 +00:00
|
|
|
* NB: The original sysctl somaxconn is still available but hidden
|
2012-10-20 19:38:22 +00:00
|
|
|
* to prevent confusion about the actual purpose of this number.
|
2012-10-20 10:51:32 +00:00
|
|
|
*/
|
2016-02-02 05:57:59 +00:00
|
|
|
static u_int somaxconn = SOMAXCONN;
|
2012-10-20 10:51:32 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
int val;
|
|
|
|
|
|
|
|
val = somaxconn;
|
|
|
|
error = sysctl_handle_int(oidp, &val, 0, req);
|
|
|
|
if (error || !req->newptr )
|
|
|
|
return (error);
|
|
|
|
|
2016-02-02 05:57:59 +00:00
|
|
|
/*
|
|
|
|
* The purpose of the UINT_MAX / 3 limit, is so that the formula
|
|
|
|
* 3 * so_qlimit / 2
|
|
|
|
* below, will not overflow.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (val < 1 || val > UINT_MAX / 3)
|
2012-10-20 10:51:32 +00:00
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
somaxconn = val;
|
|
|
|
return (0);
|
|
|
|
}
|
2020-02-26 14:26:36 +00:00
|
|
|
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
|
|
|
|
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int),
|
|
|
|
sysctl_somaxconn, "I",
|
2012-10-20 10:51:32 +00:00
|
|
|
"Maximum listen socket pending connection accept queue size");
|
2012-10-20 12:53:14 +00:00
|
|
|
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
|
2020-02-26 14:26:36 +00:00
|
|
|
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0,
|
|
|
|
sizeof(int), sysctl_somaxconn, "I",
|
2012-10-20 12:53:14 +00:00
|
|
|
"Maximum listen socket pending connection accept queue size (compat)");
|
2012-10-20 10:51:32 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
static int numopensockets;
|
|
|
|
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
|
|
|
|
&numopensockets, 0, "Number of open sockets");
|
2012-10-20 10:51:32 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* accept_mtx locks down per-socket fields relating to accept queues. See
|
|
|
|
* socketvar.h for an annotation of the protected fields of struct socket.
|
|
|
|
*/
|
2013-05-06 16:42:18 +00:00
|
|
|
struct mtx accept_mtx;
|
2005-06-09 19:59:09 +00:00
|
|
|
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* so_global_mtx protects so_gencnt, numopensockets, and the per-socket
|
|
|
|
* so_gencnt field.
|
|
|
|
*/
|
2013-05-06 16:42:18 +00:00
|
|
|
static struct mtx so_global_mtx;
|
2005-06-09 19:59:09 +00:00
|
|
|
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
|
1994-05-25 09:21:21 +00:00
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* General IPC sysctl name space, used by sockets and a variety of other IPC
|
|
|
|
* types.
|
|
|
|
*/
|
2020-02-26 14:26:36 +00:00
|
|
|
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
|
|
|
"IPC");
|
2006-06-10 14:34:07 +00:00
|
|
|
|
2012-10-19 10:15:32 +00:00
|
|
|
/*
|
|
|
|
* Initialize the socket subsystem and set up the socket
|
|
|
|
* memory allocator.
|
|
|
|
*/
|
2012-12-07 22:15:51 +00:00
|
|
|
static uma_zone_t socket_zone;
|
2012-10-19 12:16:29 +00:00
|
|
|
int maxsockets;
|
|
|
|
|
2012-10-19 10:15:32 +00:00
|
|
|
static void
|
|
|
|
socket_zone_change(void *tag)
|
|
|
|
{
|
|
|
|
|
2012-12-07 22:15:51 +00:00
|
|
|
maxsockets = uma_zone_set_max(socket_zone, maxsockets);
|
2012-10-19 10:15:32 +00:00
|
|
|
}
|
|
|
|
|
2014-08-18 23:45:40 +00:00
|
|
|
static void
|
|
|
|
socket_hhook_register(int subtype)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
|
|
|
|
&V_socket_hhh[subtype],
|
|
|
|
HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
|
|
|
|
printf("%s: WARNING: unable to register hook\n", __func__);
|
|
|
|
}
|
|
|
|
|
2014-08-22 05:03:30 +00:00
|
|
|
static void
|
|
|
|
socket_hhook_deregister(int subtype)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
|
|
|
|
printf("%s: WARNING: unable to deregister hook\n", __func__);
|
|
|
|
}
|
|
|
|
|
2012-10-19 10:15:32 +00:00
|
|
|
static void
|
|
|
|
socket_init(void *tag)
|
|
|
|
{
|
|
|
|
|
2012-12-07 22:13:33 +00:00
|
|
|
socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
|
|
|
|
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
|
2012-12-07 22:15:51 +00:00
|
|
|
maxsockets = uma_zone_set_max(socket_zone, maxsockets);
|
2012-12-07 22:30:30 +00:00
|
|
|
uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
|
2012-12-07 22:13:33 +00:00
|
|
|
EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
|
|
|
|
EVENTHANDLER_PRI_FIRST);
|
2014-08-22 05:03:30 +00:00
|
|
|
}
|
|
|
|
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
|
|
|
|
|
|
|
|
static void
|
|
|
|
socket_vnet_init(const void *unused __unused)
|
|
|
|
{
|
|
|
|
int i;
|
2014-08-18 23:45:40 +00:00
|
|
|
|
|
|
|
/* We expect a contiguous range */
|
2014-08-22 05:03:30 +00:00
|
|
|
for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
|
2014-08-18 23:45:40 +00:00
|
|
|
socket_hhook_register(i);
|
2012-10-19 10:15:32 +00:00
|
|
|
}
|
2014-08-22 05:03:30 +00:00
|
|
|
VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
|
|
|
|
socket_vnet_init, NULL);
|
|
|
|
|
|
|
|
static void
|
|
|
|
socket_vnet_uninit(const void *unused __unused)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
|
|
|
|
socket_hhook_deregister(i);
|
|
|
|
}
|
|
|
|
VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
|
|
|
|
socket_vnet_uninit, NULL);
|
2012-10-19 10:15:32 +00:00
|
|
|
|
2012-10-19 12:16:29 +00:00
|
|
|
/*
|
|
|
|
* Initialise maxsockets. This SYSINIT must be run after
|
|
|
|
* tunable_mbinit().
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
init_maxsockets(void *ignored)
|
|
|
|
{
|
|
|
|
|
|
|
|
TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
|
Base the mbuf related limits on the available physical memory or
kernel memory, whichever is lower. The overall mbuf related memory
limit must be set so that mbufs (and clusters of various sizes)
can't exhaust physical RAM or KVM.
The limit is set to half of the physical RAM or KVM (whichever is
lower) as the baseline. In any normal scenario we want to leave
at least half of the physmem/kvm for other kernel functions and
userspace to prevent it from swapping too easily. Via a tunable
kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.
At the same time divorce maxfiles from maxusers and set maxfiles to
physpages / 8 with a floor based on maxusers. This way busy servers
can make use of the significantly increased mbuf limits with a much
larger number of open sockets.
Tidy up ordering in init_param2() and check up on some users of
those values calculated here.
Out of the overall mbuf memory limit 2K clusters and 4K (page size)
clusters to get 1/4 each because these are the most heavily used mbuf
sizes. 2K clusters are used for MTU 1500 ethernet inbound packets.
4K clusters are used whenever possible for sends on sockets and thus
outbound packets. The larger cluster sizes of 9K and 16K are limited
to 1/6 of the overall mbuf memory limit. When jumbo MTU's are used
these large clusters will end up only on the inbound path. They are
not used on outbound, there it's still 4K. Yes, that will stay that
way because otherwise we run into lots of complications in the
stack. And it really isn't a problem, so don't make a scene.
Normal mbufs (256B) weren't limited at all previously. This was
problematic as there are certain places in the kernel that on
allocation failure of clusters try to piece together their packet
from smaller mbufs.
The mbuf limit is the number of all other mbuf sizes together plus
some more to allow for standalone mbufs (ACK for example) and to
send off a copy of a cluster. Unfortunately there isn't a way to
set an overall limit for all mbuf memory together as UMA doesn't
support such a limiting.
NB: Every cluster also has an mbuf associated with it.
Two examples on the revised mbuf sizing limits:
1GB KVM:
512MB limit for mbufs
419,430 mbufs
65,536 2K mbuf clusters
32,768 4K mbuf clusters
9,709 9K mbuf clusters
5,461 16K mbuf clusters
16GB RAM:
8GB limit for mbufs
33,554,432 mbufs
1,048,576 2K mbuf clusters
524,288 4K mbuf clusters
155,344 9K mbuf clusters
87,381 16K mbuf clusters
These defaults should be sufficient for even the most demanding
network loads.
MFC after: 1 month
2012-11-27 21:19:58 +00:00
|
|
|
maxsockets = imax(maxsockets, maxfiles);
|
2012-10-19 12:16:29 +00:00
|
|
|
}
|
|
|
|
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Sysctl to get and set the maximum global sockets limit. Notify protocols
|
|
|
|
* of the change so that they can update their dependent limits as required.
|
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static int
|
|
|
|
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error, newmaxsockets;
|
|
|
|
|
|
|
|
newmaxsockets = maxsockets;
|
2007-06-04 18:25:08 +00:00
|
|
|
error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
|
2006-06-10 14:34:07 +00:00
|
|
|
if (error == 0 && req->newptr) {
|
Base the mbuf related limits on the available physical memory or
kernel memory, whichever is lower. The overall mbuf related memory
limit must be set so that mbufs (and clusters of various sizes)
can't exhaust physical RAM or KVM.
The limit is set to half of the physical RAM or KVM (whichever is
lower) as the baseline. In any normal scenario we want to leave
at least half of the physmem/kvm for other kernel functions and
userspace to prevent it from swapping too easily. Via a tunable
kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.
At the same time divorce maxfiles from maxusers and set maxfiles to
physpages / 8 with a floor based on maxusers. This way busy servers
can make use of the significantly increased mbuf limits with a much
larger number of open sockets.
Tidy up ordering in init_param2() and check up on some users of
those values calculated here.
Out of the overall mbuf memory limit 2K clusters and 4K (page size)
clusters to get 1/4 each because these are the most heavily used mbuf
sizes. 2K clusters are used for MTU 1500 ethernet inbound packets.
4K clusters are used whenever possible for sends on sockets and thus
outbound packets. The larger cluster sizes of 9K and 16K are limited
to 1/6 of the overall mbuf memory limit. When jumbo MTU's are used
these large clusters will end up only on the inbound path. They are
not used on outbound, there it's still 4K. Yes, that will stay that
way because otherwise we run into lots of complications in the
stack. And it really isn't a problem, so don't make a scene.
Normal mbufs (256B) weren't limited at all previously. This was
problematic as there are certain places in the kernel that on
allocation failure of clusters try to piece together their packet
from smaller mbufs.
The mbuf limit is the number of all other mbuf sizes together plus
some more to allow for standalone mbufs (ACK for example) and to
send off a copy of a cluster. Unfortunately there isn't a way to
set an overall limit for all mbuf memory together as UMA doesn't
support such a limiting.
NB: Every cluster also has an mbuf associated with it.
Two examples on the revised mbuf sizing limits:
1GB KVM:
512MB limit for mbufs
419,430 mbufs
65,536 2K mbuf clusters
32,768 4K mbuf clusters
9,709 9K mbuf clusters
5,461 16K mbuf clusters
16GB RAM:
8GB limit for mbufs
33,554,432 mbufs
1,048,576 2K mbuf clusters
524,288 4K mbuf clusters
155,344 9K mbuf clusters
87,381 16K mbuf clusters
These defaults should be sufficient for even the most demanding
network loads.
MFC after: 1 month
2012-11-27 21:19:58 +00:00
|
|
|
if (newmaxsockets > maxsockets &&
|
|
|
|
newmaxsockets <= maxfiles) {
|
2006-06-10 14:34:07 +00:00
|
|
|
maxsockets = newmaxsockets;
|
|
|
|
EVENTHANDLER_INVOKE(maxsockets_change);
|
|
|
|
} else
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
2020-02-26 14:26:36 +00:00
|
|
|
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
|
|
|
|
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0,
|
|
|
|
sysctl_maxsockets, "IU",
|
2016-03-21 08:03:50 +00:00
|
|
|
"Maximum number of sockets available");
|
2006-06-10 14:34:07 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Socket operation routines. These routines are called by the routines in
|
|
|
|
* sys_socket.c or from a system process, and implement the semantics of
|
|
|
|
* socket operations by switching out to the protocol specific routines.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1998-05-15 20:11:40 +00:00
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Get a socket structure from our zone, and initialize it. Note that it
|
|
|
|
* would probably be better to allocate socket and PCB at the same time, but
|
|
|
|
* I'm not convinced that all the protocols can be easily modified to do
|
|
|
|
* this.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
|
|
|
* soalloc() returns a socket with a ref count of 0.
|
1998-05-15 20:11:40 +00:00
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static struct socket *
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
soalloc(struct vnet *vnet)
|
1998-05-15 20:11:40 +00:00
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
|
2007-02-26 10:45:21 +00:00
|
|
|
so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
|
2006-06-08 22:33:18 +00:00
|
|
|
if (so == NULL)
|
|
|
|
return (NULL);
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
if (mac_socket_init(so, M_NOWAIT) != 0) {
|
2006-06-08 22:33:18 +00:00
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
return (NULL);
|
1998-05-15 20:11:40 +00:00
|
|
|
}
|
2006-06-08 22:33:18 +00:00
|
|
|
#endif
|
2014-08-18 23:45:40 +00:00
|
|
|
if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
|
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
/*
|
|
|
|
* The socket locking protocol allows to lock 2 sockets at a time,
|
|
|
|
* however, the first one must be a listening socket. WITNESS lacks
|
|
|
|
* a feature to change class of an existing lock, so we use DUPOK.
|
|
|
|
*/
|
|
|
|
mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
|
2006-06-08 22:33:18 +00:00
|
|
|
SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
|
|
|
|
SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_rcv.sb_sel = &so->so_rdsel;
|
|
|
|
so->so_snd.sb_sel = &so->so_wrsel;
|
2007-05-03 14:42:42 +00:00
|
|
|
sx_init(&so->so_snd.sb_sx, "so_snd_sx");
|
|
|
|
sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
|
2016-03-01 18:12:14 +00:00
|
|
|
TAILQ_INIT(&so->so_snd.sb_aiojobq);
|
|
|
|
TAILQ_INIT(&so->so_rcv.sb_aiojobq);
|
|
|
|
TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
|
|
|
|
TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
|
Permit buiding kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
#ifdef VIMAGE
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
|
|
|
|
__func__, __LINE__, so));
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
so->so_vnet = vnet;
|
Permit buiding kernels with options VIMAGE, restricted to only a single
active network stack instance. Turning on options VIMAGE at compile
time yields the following changes relative to default kernel build:
1) V_ accessor macros for virtualized variables resolve to structure
fields via base pointers, instead of being resolved as fields in global
structs or plain global variables. As an example, V_ifnet becomes:
options VIMAGE: ((struct vnet_net *) vnet_net)->_ifnet
default build: vnet_net_0._ifnet
options VIMAGE_GLOBALS: ifnet
2) INIT_VNET_* macros will declare and set up base pointers to be used
by V_ accessor macros, instead of resolving to whitespace:
INIT_VNET_NET(ifp->if_vnet); becomes
struct vnet_net *vnet_net = (ifp->if_vnet)->mod_data[VNET_MOD_NET];
3) Memory for vnet modules registered via vnet_mod_register() is now
allocated at run time in sys/kern/kern_vimage.c, instead of per vnet
module structs being declared as globals. If required, vnet modules
can now request the framework to provide them with allocated bzeroed
memory by filling in the vmi_size field in their vmi_modinfo structures.
4) structs socket, ifnet, inpcbinfo, tcpcb and syncache_head are
extended to hold a pointer to the parent vnet. options VIMAGE builds
will fill in those fields as required.
5) curvnet is introduced as a new global variable in options VIMAGE
builds, always pointing to the default and only struct vnet.
6) struct sysctl_oid has been extended with additional two fields to
store major and minor virtualization module identifiers, oid_v_subs and
oid_v_mod. SYSCTL_V_* family of macros will fill in those fields
accordingly, and store the offset in the appropriate vnet container
struct in oid_arg1.
In sysctl handlers dealing with virtualized sysctls, the
SYSCTL_RESOLVE_V_ARG1() macro will compute the address of the target
variable and make it available in arg1 variable for further processing.
Unused fields in structs vnet_inet, vnet_inet6 and vnet_ipfw have
been deleted.
Reviewed by: bz, rwatson
Approved by: julian (mentor)
2009-04-30 13:36:26 +00:00
|
|
|
#endif
|
2014-08-18 23:45:40 +00:00
|
|
|
/* We shouldn't need the so_global_mtx */
|
2014-09-08 09:04:22 +00:00
|
|
|
if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
|
|
|
|
/* Do we need more comprehensive error returns? */
|
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
return (NULL);
|
2014-08-18 23:45:40 +00:00
|
|
|
}
|
2014-09-08 09:04:22 +00:00
|
|
|
mtx_lock(&so_global_mtx);
|
|
|
|
so->so_gencnt = ++so_gencnt;
|
|
|
|
++numopensockets;
|
|
|
|
#ifdef VIMAGE
|
|
|
|
vnet->vnet_sockcnt++;
|
|
|
|
#endif
|
|
|
|
mtx_unlock(&so_global_mtx);
|
2014-08-22 05:03:30 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
return (so);
|
1998-05-15 20:11:40 +00:00
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Free the storage associated with a socket at the socket layer, tear down
|
|
|
|
* locks, labels, etc. All protocol state is assumed already to have been
|
|
|
|
* torn down (and possibly never set up) by the caller.
|
|
|
|
*/
|
2006-06-10 14:34:07 +00:00
|
|
|
static void
|
|
|
|
sodealloc(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
|
|
|
|
KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
|
|
|
|
|
|
|
|
mtx_lock(&so_global_mtx);
|
|
|
|
so->so_gencnt = ++so_gencnt;
|
2006-08-02 00:45:27 +00:00
|
|
|
--numopensockets; /* Could be below, but faster here. */
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
#ifdef VIMAGE
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
|
|
|
|
__func__, __LINE__, so));
|
2009-07-19 17:40:45 +00:00
|
|
|
so->so_vnet->vnet_sockcnt--;
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
#endif
|
2006-06-10 14:34:07 +00:00
|
|
|
mtx_unlock(&so_global_mtx);
|
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_destroy(so);
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
2014-09-08 09:04:22 +00:00
|
|
|
hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
|
2014-08-18 23:45:40 +00:00
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
crfree(so->so_cred);
|
2014-08-18 23:45:40 +00:00
|
|
|
khelp_destroy_osd(&so->osd);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
if (so->sol_accept_filter != NULL)
|
|
|
|
accept_filt_setopt(so, NULL);
|
|
|
|
} else {
|
2017-06-25 01:41:07 +00:00
|
|
|
if (so->so_rcv.sb_hiwat)
|
|
|
|
(void)chgsbsize(so->so_cred->cr_uidinfo,
|
|
|
|
&so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
|
|
|
|
if (so->so_snd.sb_hiwat)
|
|
|
|
(void)chgsbsize(so->so_cred->cr_uidinfo,
|
|
|
|
&so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
sx_destroy(&so->so_snd.sb_sx);
|
|
|
|
sx_destroy(&so->so_rcv.sb_sx);
|
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_snd);
|
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_rcv);
|
|
|
|
}
|
|
|
|
mtx_destroy(&so->so_lock);
|
2006-06-10 14:34:07 +00:00
|
|
|
uma_zfree(socket_zone, so);
|
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* socreate returns a socket with a ref count of 1. The socket should be
|
|
|
|
* closed with soclose().
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
socreate(int dom, struct socket **aso, int type, int proto,
|
|
|
|
struct ucred *cred, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct protosw *prp;
|
|
|
|
struct socket *so;
|
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
if (proto)
|
|
|
|
prp = pffindproto(dom, proto, type);
|
|
|
|
else
|
|
|
|
prp = pffindtype(dom, type);
|
2000-06-04 04:28:31 +00:00
|
|
|
|
2012-12-07 02:22:48 +00:00
|
|
|
if (prp == NULL) {
|
|
|
|
/* No support for domain. */
|
|
|
|
if (pffinddomain(dom) == NULL)
|
|
|
|
return (EAFNOSUPPORT);
|
|
|
|
/* No support for socket type. */
|
|
|
|
if (proto == 0 && type != 0)
|
|
|
|
return (EPROTOTYPE);
|
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
}
|
|
|
|
if (prp->pr_usrreqs->pru_attach == NULL ||
|
2005-06-09 19:59:09 +00:00
|
|
|
prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
|
2000-06-13 15:44:04 +00:00
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
|
2009-02-05 14:15:18 +00:00
|
|
|
if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
|
2000-06-04 04:28:31 +00:00
|
|
|
return (EPROTONOSUPPORT);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
if (prp->pr_type != type)
|
|
|
|
return (EPROTOTYPE);
|
2009-06-15 19:01:53 +00:00
|
|
|
so = soalloc(CRED_TO_VNET(cred));
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so == NULL)
|
1998-05-15 20:11:40 +00:00
|
|
|
return (ENOBUFS);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_type = type;
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_cred = crhold(cred);
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
if ((prp->pr_domain->dom_family == PF_INET) ||
|
2012-02-03 11:00:53 +00:00
|
|
|
(prp->pr_domain->dom_family == PF_INET6) ||
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as an EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in the timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing to the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
(prp->pr_domain->dom_family == PF_ROUTE))
|
|
|
|
so->so_fibnum = td->td_proc->p_fibnum;
|
|
|
|
else
|
|
|
|
so->so_fibnum = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_proto = prp;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_create(cred, so);
|
2005-06-09 19:59:09 +00:00
|
|
|
#endif
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
|
|
|
|
so_rdknl_assert_locked, so_rdknl_assert_unlocked);
|
|
|
|
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
|
|
|
|
so_wrknl_assert_locked, so_wrknl_assert_unlocked);
|
2007-02-01 17:53:41 +00:00
|
|
|
/*
|
|
|
|
* Auto-sizing of socket buffers is managed by the protocols and
|
|
|
|
* the appropriate flags must be set in the pru_attach function.
|
|
|
|
*/
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error) {
|
2006-07-11 21:56:58 +00:00
|
|
|
sodealloc(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
soref(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
*aso = so;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
#ifdef REGRESSION
|
|
|
|
/*
 * Regression-test knob: in REGRESSION builds sonewconn() only performs its
 * early listen-queue overflow check while this value is non-zero (it is
 * initialised to 1).  Exported read-write under the _regression sysctl node
 * as "sonewconn_earlytest".
 */
static int regression_sonewconn_earlytest = 1;
|
|
|
|
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
|
|
|
|
&regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
|
|
|
|
#endif
|
|
|
|
|
2020-04-14 15:38:18 +00:00
|
|
|
/*
 * Minimum spacing between listen-queue overflow warnings.  sonewconn()
 * hands this to ratecheck() together with the listen socket's sol_lastover
 * timestamp to decide whether the current overflow should be logged.
 * Initialised to 60 seconds and exported read-write under the _kern_ipc
 * sysctl node as "sooverinterval".
 */
static struct timeval overinterval = { 60, 0 };
|
|
|
|
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
|
|
|
|
&overinterval,
|
|
|
|
"Delay in seconds between warnings for listen socket overflows");
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* When an attempt at a new connection is noted on a socket which accepts
|
|
|
|
* connections, sonewconn is called. If the connection is possible (subject
|
2016-05-22 13:04:45 +00:00
|
|
|
* to space constraints, etc.) then we allocate a new structure, properly
|
2006-07-23 20:36:04 +00:00
|
|
|
* linked into the data structure of the original socket, and return this.
|
2013-11-08 20:11:15 +00:00
|
|
|
* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
|
2006-06-10 14:34:07 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Note: the ref count on the socket is 0 on return.
|
2006-06-10 14:34:07 +00:00
|
|
|
*/
|
|
|
|
struct socket *
|
2007-05-16 20:41:08 +00:00
|
|
|
sonewconn(struct socket *head, int connstatus)
|
2006-06-10 14:34:07 +00:00
|
|
|
{
|
2020-04-14 15:30:34 +00:00
|
|
|
struct sbuf descrsb;
|
2007-05-16 20:41:08 +00:00
|
|
|
struct socket *so;
|
2020-04-14 15:38:18 +00:00
|
|
|
int len, overcount;
|
|
|
|
u_int qlen;
|
2020-04-14 15:30:34 +00:00
|
|
|
const char localprefix[] = "local:";
|
|
|
|
char descrbuf[SUNPATHLEN + sizeof(localprefix)];
|
|
|
|
#if defined(INET6)
|
|
|
|
char addrbuf[INET6_ADDRSTRLEN];
|
|
|
|
#elif defined(INET)
|
|
|
|
char addrbuf[INET_ADDRSTRLEN];
|
|
|
|
#endif
|
2020-04-14 15:38:18 +00:00
|
|
|
bool dolog, over;
|
2006-06-10 14:34:07 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOLISTEN_LOCK(head);
|
|
|
|
over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
|
2006-06-10 14:34:07 +00:00
|
|
|
#ifdef REGRESSION
|
2012-10-29 12:14:57 +00:00
|
|
|
if (regression_sonewconn_earlytest && over) {
|
2006-06-10 14:34:07 +00:00
|
|
|
#else
|
2012-10-29 12:14:57 +00:00
|
|
|
if (over) {
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
2020-04-14 15:38:18 +00:00
|
|
|
head->sol_overcount++;
|
|
|
|
dolog = !!ratecheck(&head->sol_lastover, &overinterval);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're going to log, copy the overflow count and queue
|
|
|
|
* length from the listen socket before dropping the lock.
|
|
|
|
* Also, reset the overflow count.
|
|
|
|
*/
|
|
|
|
if (dolog) {
|
|
|
|
overcount = head->sol_overcount;
|
|
|
|
head->sol_overcount = 0;
|
|
|
|
qlen = head->sol_qlen;
|
|
|
|
}
|
|
|
|
SOLISTEN_UNLOCK(head);
|
2013-10-31 20:33:21 +00:00
|
|
|
|
2020-04-14 15:38:18 +00:00
|
|
|
if (dolog) {
|
2020-04-14 15:30:34 +00:00
|
|
|
/*
|
|
|
|
* Try to print something descriptive about the
|
|
|
|
* socket for the error message.
|
|
|
|
*/
|
|
|
|
sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
|
|
|
|
SBUF_FIXEDLEN);
|
|
|
|
switch (head->so_proto->pr_domain->dom_family) {
|
|
|
|
#if defined(INET) || defined(INET6)
|
|
|
|
#ifdef INET
|
|
|
|
case AF_INET:
|
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
|
|
|
if (head->so_proto->pr_domain->dom_family ==
|
|
|
|
AF_INET6 ||
|
|
|
|
(sotoinpcb(head)->inp_inc.inc_flags &
|
|
|
|
INC_ISIPV6)) {
|
|
|
|
ip6_sprintf(addrbuf,
|
|
|
|
&sotoinpcb(head)->inp_inc.inc6_laddr);
|
|
|
|
sbuf_printf(&descrsb, "[%s]", addrbuf);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
#ifdef INET
|
|
|
|
inet_ntoa_r(
|
|
|
|
sotoinpcb(head)->inp_inc.inc_laddr,
|
|
|
|
addrbuf);
|
|
|
|
sbuf_cat(&descrsb, addrbuf);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
sbuf_printf(&descrsb, ":%hu (proto %u)",
|
|
|
|
ntohs(sotoinpcb(head)->inp_inc.inc_lport),
|
|
|
|
head->so_proto->pr_protocol);
|
|
|
|
break;
|
|
|
|
#endif /* INET || INET6 */
|
|
|
|
case AF_UNIX:
|
|
|
|
sbuf_cat(&descrsb, localprefix);
|
|
|
|
if (sotounpcb(head)->unp_addr != NULL)
|
|
|
|
len =
|
|
|
|
sotounpcb(head)->unp_addr->sun_len -
|
|
|
|
offsetof(struct sockaddr_un,
|
|
|
|
sun_path);
|
|
|
|
else
|
|
|
|
len = 0;
|
|
|
|
if (len > 0)
|
|
|
|
sbuf_bcat(&descrsb,
|
|
|
|
sotounpcb(head)->unp_addr->sun_path,
|
|
|
|
len);
|
|
|
|
else
|
|
|
|
sbuf_cat(&descrsb, "(unknown)");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we can't print something more specific, at least
|
|
|
|
* print the domain name.
|
|
|
|
*/
|
|
|
|
if (sbuf_finish(&descrsb) != 0 ||
|
|
|
|
sbuf_len(&descrsb) <= 0) {
|
|
|
|
sbuf_clear(&descrsb);
|
|
|
|
sbuf_cat(&descrsb,
|
|
|
|
head->so_proto->pr_domain->dom_name ?:
|
|
|
|
"unknown");
|
|
|
|
sbuf_finish(&descrsb);
|
|
|
|
}
|
|
|
|
KASSERT(sbuf_len(&descrsb) > 0,
|
|
|
|
("%s: sbuf creation failed", __func__));
|
|
|
|
log(LOG_DEBUG,
|
|
|
|
"%s: pcb %p (%s): Listen queue overflow: "
|
2013-10-31 20:33:21 +00:00
|
|
|
"%i already in queue awaiting acceptance "
|
|
|
|
"(%d occurrences)\n",
|
2020-04-14 15:30:34 +00:00
|
|
|
__func__, head->so_pcb, sbuf_data(&descrsb),
|
2020-04-14 15:38:18 +00:00
|
|
|
qlen, overcount);
|
2020-04-14 15:30:34 +00:00
|
|
|
sbuf_delete(&descrsb);
|
2013-10-31 20:33:21 +00:00
|
|
|
|
|
|
|
overcount = 0;
|
|
|
|
}
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
2012-10-29 12:14:57 +00:00
|
|
|
}
|
2020-04-14 15:38:18 +00:00
|
|
|
SOLISTEN_UNLOCK(head);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
|
|
|
|
__func__, head));
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
so = soalloc(head->so_vnet);
|
2012-10-29 12:14:57 +00:00
|
|
|
if (so == NULL) {
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
|
|
|
|
"limit reached or out of memory\n",
|
|
|
|
__func__, head->so_pcb);
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
2012-10-29 12:14:57 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_listen = head;
|
2006-06-10 14:34:07 +00:00
|
|
|
so->so_type = head->so_type;
|
|
|
|
so->so_linger = head->so_linger;
|
|
|
|
so->so_state = head->so_state | SS_NOFDREF;
|
2009-07-28 19:43:27 +00:00
|
|
|
so->so_fibnum = head->so_fibnum;
|
2006-06-10 14:34:07 +00:00
|
|
|
so->so_proto = head->so_proto;
|
|
|
|
so->so_cred = crhold(head->so_cred);
|
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_socket_newconn(head, so);
|
2006-06-10 14:34:07 +00:00
|
|
|
#endif
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
|
|
|
|
so_rdknl_assert_locked, so_rdknl_assert_unlocked);
|
|
|
|
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
|
|
|
|
so_wrknl_assert_locked, so_wrknl_assert_unlocked);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(head);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
|
2012-10-29 12:14:57 +00:00
|
|
|
sodealloc(so);
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
|
2006-06-10 14:34:07 +00:00
|
|
|
sodealloc(so);
|
2012-10-29 12:14:57 +00:00
|
|
|
log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
2006-06-10 14:34:07 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
|
|
|
|
so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
|
|
|
|
so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
|
|
|
|
so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
|
|
|
|
so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
|
|
|
|
so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
|
|
|
|
|
|
|
|
SOLISTEN_LOCK(head);
|
|
|
|
if (head->sol_accept_filter != NULL)
|
|
|
|
connstatus = 0;
|
2006-06-10 14:34:07 +00:00
|
|
|
so->so_state |= connstatus;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_options = head->so_options & ~SO_ACCEPTCONN;
|
|
|
|
soref(head); /* A socket on (in)complete queue refs head. */
|
2006-06-10 14:34:07 +00:00
|
|
|
if (connstatus) {
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
|
|
|
|
so->so_qstate = SQ_COMP;
|
|
|
|
head->sol_qlen++;
|
|
|
|
solisten_wakeup(head); /* unlocks */
|
2006-06-10 14:34:07 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Keep removing sockets from the head until there's room for
|
|
|
|
* us to insert on the tail. In pre-locking revisions, this
|
|
|
|
* was a simple if(), but as we could be racing with other
|
|
|
|
* threads and soabort() requires dropping locks, we must
|
|
|
|
* loop waiting for the condition to be true.
|
|
|
|
*/
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
while (head->sol_incqlen > head->sol_qlimit) {
|
2006-06-10 14:34:07 +00:00
|
|
|
struct socket *sp;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
sp = TAILQ_FIRST(&head->sol_incomp);
|
|
|
|
TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
|
|
|
|
head->sol_incqlen--;
|
|
|
|
SOCK_LOCK(sp);
|
|
|
|
sp->so_qstate = SQ_NONE;
|
|
|
|
sp->so_listen = NULL;
|
|
|
|
SOCK_UNLOCK(sp);
|
|
|
|
sorele(head); /* does SOLISTEN_UNLOCK, head stays */
|
2006-06-10 14:34:07 +00:00
|
|
|
soabort(sp);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOLISTEN_LOCK(head);
|
2006-06-10 14:34:07 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
|
|
|
|
so->so_qstate = SQ_INCOMP;
|
|
|
|
head->sol_incqlen++;
|
|
|
|
SOLISTEN_UNLOCK(head);
|
2006-06-10 14:34:07 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
return (so);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef SCTP
|
|
|
|
/*
|
|
|
|
* Socket part of sctp_peeloff(). Detach a new socket from an
|
|
|
|
* association. The new socket is returned with a reference.
|
|
|
|
*/
|
|
|
|
struct socket *
|
|
|
|
sopeeloff(struct socket *head)
|
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
|
|
|
|
VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
|
|
|
|
__func__, __LINE__, head));
|
|
|
|
so = soalloc(head->so_vnet);
|
|
|
|
if (so == NULL) {
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
|
|
|
|
"limit reached or out of memory\n",
|
|
|
|
__func__, head->so_pcb);
|
|
|
|
return (NULL);
|
2006-06-10 14:34:07 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_type = head->so_type;
|
|
|
|
so->so_options = head->so_options;
|
|
|
|
so->so_linger = head->so_linger;
|
|
|
|
so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
|
|
|
|
so->so_fibnum = head->so_fibnum;
|
|
|
|
so->so_proto = head->so_proto;
|
|
|
|
so->so_cred = crhold(head->so_cred);
|
|
|
|
#ifdef MAC
|
|
|
|
mac_socket_newconn(head, so);
|
|
|
|
#endif
|
|
|
|
knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
|
|
|
|
so_rdknl_assert_locked, so_rdknl_assert_unlocked);
|
|
|
|
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
|
|
|
|
so_wrknl_assert_locked, so_wrknl_assert_unlocked);
|
|
|
|
VNET_SO_ASSERT(head);
|
|
|
|
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
|
|
|
|
sodealloc(so);
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
|
|
|
|
sodealloc(so);
|
|
|
|
log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
|
|
|
|
__func__, head->so_pcb);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
|
|
|
|
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
|
|
|
|
so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
|
|
|
|
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
|
|
|
|
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
|
|
|
|
so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
|
|
|
|
|
|
|
|
soref(so);
|
|
|
|
|
2006-06-10 14:34:07 +00:00
|
|
|
return (so);
|
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
#endif /* SCTP */
|
2006-06-10 14:34:07 +00:00
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
|
|
|
|
CURVNET_RESTORE();
|
2013-03-02 21:11:30 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* solisten() transitions a socket from a non-listening state to a listening
|
|
|
|
* state, but can also be used to update the listen queue depth on an
|
|
|
|
* existing listen socket. The protocol will call back into the sockets
|
|
|
|
* layer using solisten_proto_check() and solisten_proto() to check and set
|
|
|
|
* socket-layer listen state. Call backs are used so that the protocol can
|
2005-09-18 10:46:34 +00:00
|
|
|
* acquire both protocol and socket layer locks in whatever order is required
|
2005-06-09 19:59:09 +00:00
|
|
|
* by the protocol.
|
|
|
|
*
|
|
|
|
* Protocol implementors are advised to hold the socket lock across the
|
|
|
|
* socket-layer test and set to avoid races at the socket layer.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten(struct socket *so, int backlog, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
|
|
|
|
CURVNET_RESTORE();
|
2013-03-02 21:11:30 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten_proto_check(struct socket *so)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
|
|
|
if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
|
|
|
|
SS_ISDISCONNECTING))
|
|
|
|
return (EINVAL);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
solisten_proto(struct socket *so, int backlog)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, in case we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
int sbrcv_lowat, sbsnd_lowat;
|
|
|
|
u_int sbrcv_hiwat, sbsnd_hiwat;
|
|
|
|
short sbrcv_flags, sbsnd_flags;
|
|
|
|
sbintime_t sbrcv_timeo, sbsnd_timeo;
|
2005-06-09 19:59:09 +00:00
|
|
|
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (SOLISTENING(so))
|
|
|
|
goto listening;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Change this socket to listening state.
|
|
|
|
*/
|
|
|
|
sbrcv_lowat = so->so_rcv.sb_lowat;
|
|
|
|
sbsnd_lowat = so->so_snd.sb_lowat;
|
|
|
|
sbrcv_hiwat = so->so_rcv.sb_hiwat;
|
|
|
|
sbsnd_hiwat = so->so_snd.sb_hiwat;
|
|
|
|
sbrcv_flags = so->so_rcv.sb_flags;
|
|
|
|
sbsnd_flags = so->so_snd.sb_flags;
|
|
|
|
sbrcv_timeo = so->so_rcv.sb_timeo;
|
|
|
|
sbsnd_timeo = so->so_snd.sb_timeo;
|
|
|
|
|
|
|
|
sbdestroy(&so->so_snd, so);
|
|
|
|
sbdestroy(&so->so_rcv, so);
|
|
|
|
sx_destroy(&so->so_snd.sb_sx);
|
|
|
|
sx_destroy(&so->so_rcv.sb_sx);
|
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_snd);
|
|
|
|
SOCKBUF_LOCK_DESTROY(&so->so_rcv);
|
|
|
|
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
bzero(&so->so_rcv,
|
|
|
|
sizeof(struct socket) - offsetof(struct socket, so_rcv));
|
|
|
|
#endif
|
|
|
|
|
|
|
|
so->sol_sbrcv_lowat = sbrcv_lowat;
|
|
|
|
so->sol_sbsnd_lowat = sbsnd_lowat;
|
|
|
|
so->sol_sbrcv_hiwat = sbrcv_hiwat;
|
|
|
|
so->sol_sbsnd_hiwat = sbsnd_hiwat;
|
|
|
|
so->sol_sbrcv_flags = sbrcv_flags;
|
|
|
|
so->sol_sbsnd_flags = sbsnd_flags;
|
|
|
|
so->sol_sbrcv_timeo = sbrcv_timeo;
|
|
|
|
so->sol_sbsnd_timeo = sbsnd_timeo;
|
|
|
|
|
|
|
|
so->sol_qlen = so->sol_incqlen = 0;
|
|
|
|
TAILQ_INIT(&so->sol_incomp);
|
|
|
|
TAILQ_INIT(&so->sol_comp);
|
|
|
|
|
|
|
|
so->sol_accept_filter = NULL;
|
|
|
|
so->sol_accept_filter_arg = NULL;
|
|
|
|
so->sol_accept_filter_str = NULL;
|
|
|
|
|
2017-07-04 18:23:17 +00:00
|
|
|
so->sol_upcall = NULL;
|
|
|
|
so->sol_upcallarg = NULL;
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->so_options |= SO_ACCEPTCONN;
|
|
|
|
|
|
|
|
listening:
|
2005-10-30 19:44:40 +00:00
|
|
|
if (backlog < 0 || backlog > somaxconn)
|
|
|
|
backlog = somaxconn;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so->sol_qlimit = backlog;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wakeup listeners/subsystems once we have a complete connection.
|
|
|
|
* Enters with lock, returns unlocked.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
solisten_wakeup(struct socket *sol)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (sol->sol_upcall != NULL)
|
|
|
|
(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
|
|
|
|
else {
|
|
|
|
selwakeuppri(&sol->so_rdsel, PSOCK);
|
|
|
|
KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
|
|
|
|
}
|
|
|
|
SOLISTEN_UNLOCK(sol);
|
|
|
|
wakeup_one(&sol->sol_comp);
|
2019-01-13 20:33:54 +00:00
|
|
|
if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
|
|
|
|
pgsigio(&sol->so_sigio, SIGIO, 0);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return single connection off a listening socket queue. Main consumer of
|
|
|
|
* the function is kern_accept4(). Some modules, that do their own accept
|
|
|
|
* management also use the function.
|
|
|
|
*
|
|
|
|
* Listening socket must be locked on entry and is returned unlocked on
|
|
|
|
* return.
|
|
|
|
* The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
|
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
SOLISTEN_LOCK_ASSERT(head);
|
|
|
|
|
|
|
|
while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
|
|
|
|
head->so_error == 0) {
|
|
|
|
error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
|
|
|
|
"accept", 0);
|
|
|
|
if (error != 0) {
|
|
|
|
SOLISTEN_UNLOCK(head);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (head->so_error) {
|
|
|
|
error = head->so_error;
|
|
|
|
head->so_error = 0;
|
2018-10-03 17:40:04 +00:00
|
|
|
} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
|
|
|
|
error = EWOULDBLOCK;
|
|
|
|
else
|
|
|
|
error = 0;
|
|
|
|
if (error) {
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOLISTEN_UNLOCK(head);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
so = TAILQ_FIRST(&head->sol_comp);
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
KASSERT(so->so_qstate == SQ_COMP,
|
|
|
|
("%s: so %p not SQ_COMP", __func__, so));
|
|
|
|
soref(so);
|
|
|
|
head->sol_qlen--;
|
|
|
|
so->so_qstate = SQ_NONE;
|
|
|
|
so->so_listen = NULL;
|
|
|
|
TAILQ_REMOVE(&head->sol_comp, so, so_list);
|
|
|
|
if (flags & ACCEPT4_INHERIT)
|
|
|
|
so->so_state |= (head->so_state & SS_NBIO);
|
|
|
|
else
|
|
|
|
so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
sorele(head);
|
|
|
|
|
|
|
|
*ret = so;
|
|
|
|
return (0);
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-09-18 11:18:42 +00:00
|
|
|
* Evaluate the reference count and named references on a socket; if no
|
|
|
|
* references remain, free it. This should be called whenever a reference is
|
|
|
|
* released, such as in sorele(), but also when named reference flags are
|
|
|
|
* cleared in socket or protocol code.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
2010-09-18 11:18:42 +00:00
|
|
|
* sofree() will free the socket if:
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
*
|
|
|
|
* - There are no outstanding file descriptor references or related consumers
|
|
|
|
* (so_count == 0).
|
|
|
|
*
|
|
|
|
* - The socket has been closed by user space, if ever open (SS_NOFDREF).
|
|
|
|
*
|
|
|
|
* - The protocol does not have an outstanding strong reference on the socket
|
|
|
|
* (SS_PROTOREF).
|
|
|
|
*
|
2006-04-23 15:37:23 +00:00
|
|
|
* - The socket is not in a completed connection queue, so a process has been
|
2006-04-23 15:33:38 +00:00
|
|
|
* notified that it is present. If it is removed, the user process may
|
|
|
|
* block in accept() despite select() saying the socket was ready.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sofree(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2006-08-01 10:30:26 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
(so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
|
|
|
|
struct socket *sol;
|
|
|
|
|
|
|
|
sol = so->so_listen;
|
|
|
|
KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* To solve race between close of a listening socket and
|
|
|
|
* a socket on its incomplete queue, we need to lock both.
|
|
|
|
* The order is first listening socket, then regular.
|
|
|
|
* Since we don't have SS_NOFDREF neither SS_PROTOREF, this
|
|
|
|
* function and the listening socket are the only pointers
|
|
|
|
* to so. To preserve so and sol, we reference both and then
|
|
|
|
* relock.
|
|
|
|
* After relock the socket may not move to so_comp since it
|
|
|
|
* doesn't have PCB already, but it may be removed from
|
|
|
|
* so_incomp. If that happens, we share responsibility on
|
|
|
|
* freeing the socket, but soclose() has already removed
|
|
|
|
* it from queue.
|
|
|
|
*/
|
|
|
|
soref(sol);
|
|
|
|
soref(so);
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
SOLISTEN_LOCK(sol);
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
if (so->so_qstate == SQ_INCOMP) {
|
|
|
|
KASSERT(so->so_listen == sol,
|
|
|
|
("%s: so %p migrated out of sol %p",
|
|
|
|
__func__, so, sol));
|
|
|
|
TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
|
|
|
|
sol->sol_incqlen--;
|
|
|
|
/* This is guaranteed not to be the last. */
|
|
|
|
refcount_release(&sol->so_count);
|
|
|
|
so->so_qstate = SQ_NONE;
|
|
|
|
so->so_listen = NULL;
|
|
|
|
} else
|
|
|
|
KASSERT(so->so_listen == NULL,
|
|
|
|
("%s: so %p not on (in)comp with so_listen",
|
|
|
|
__func__, so));
|
|
|
|
sorele(sol);
|
|
|
|
KASSERT(so->so_count == 1,
|
|
|
|
("%s: so %p count %u", __func__, so, so->so_count));
|
|
|
|
so->so_count = 0;
|
2006-11-22 23:54:29 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (SOLISTENING(so))
|
|
|
|
so->so_error = ECONNABORTED;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
Change protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they either occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
|
2018-10-18 14:20:15 +00:00
|
|
|
if (so->so_dtor != NULL)
|
|
|
|
so->so_dtor(so);
|
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2007-03-22 13:21:24 +00:00
|
|
|
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
|
2015-07-14 02:00:50 +00:00
|
|
|
(*pr->pr_domain->dom_dispose)(so);
|
2007-03-22 13:21:24 +00:00
|
|
|
if (pr->pr_usrreqs->pru_detach != NULL)
|
|
|
|
(*pr->pr_usrreqs->pru_detach)(so);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-08-01 10:30:26 +00:00
|
|
|
* From this point on, we assume that no other references to this
|
|
|
|
* socket exist anywhere else in the stack. Therefore, no locks need
|
|
|
|
* to be acquired or held.
|
|
|
|
*
|
|
|
|
* We used to do a lot of socket buffer and socket locking here, as
|
|
|
|
* well as invoke sorflush() and perform wakeups. The direct call to
|
2019-06-27 22:50:11 +00:00
|
|
|
* dom_dispose() and sbdestroy() are an inlining of what was
|
2006-08-01 10:30:26 +00:00
|
|
|
* necessary from sorflush().
|
|
|
|
*
|
|
|
|
* Notice that the socket buffer and kqueue state are torn down
|
|
|
|
* before calling pru_detach. This means that protocols should not
|
2007-05-03 14:42:42 +00:00
|
|
|
* assume they can perform socket wakeups, etc, in their detach code.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (!SOLISTENING(so)) {
|
|
|
|
sbdestroy(&so->so_snd, so);
|
|
|
|
sbdestroy(&so->so_rcv, so);
|
|
|
|
}
|
|
|
|
seldrain(&so->so_rdsel);
|
|
|
|
seldrain(&so->so_wrsel);
|
|
|
|
knlist_destroy(&so->so_rdsel.si_note);
|
|
|
|
knlist_destroy(&so->so_wrsel.si_note);
|
1998-05-15 20:11:40 +00:00
|
|
|
sodealloc(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Close a socket on last file table reference removal. Initiate disconnect
|
|
|
|
* if connected. Free socket when disconnect complete.
|
2005-06-09 19:59:09 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* This function will sorele() the socket. Note that soclose() may be called
|
|
|
|
* prior to the ref count reaching zero. The actual socket structure will
|
|
|
|
* not be freed until the ref count reaches zero.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soclose(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
struct accept_queue lqueue;
|
|
|
|
bool listening;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error = 0;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
|
2005-06-09 19:56:38 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2005-06-09 19:59:09 +00:00
|
|
|
funsetown(&so->so_sigio);
|
2006-11-22 23:54:29 +00:00
|
|
|
if (so->so_state & SS_ISCONNECTED) {
|
|
|
|
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
|
|
|
|
error = sodisconnect(so);
|
2010-05-27 15:27:31 +00:00
|
|
|
if (error) {
|
|
|
|
if (error == ENOTCONN)
|
|
|
|
error = 0;
|
2006-11-22 23:54:29 +00:00
|
|
|
goto drop;
|
2010-05-27 15:27:31 +00:00
|
|
|
}
|
2006-11-22 23:54:29 +00:00
|
|
|
}
|
|
|
|
if (so->so_options & SO_LINGER) {
|
|
|
|
if ((so->so_state & SS_ISDISCONNECTING) &&
|
|
|
|
(so->so_state & SS_NBIO))
|
|
|
|
goto drop;
|
|
|
|
while (so->so_state & SS_ISCONNECTED) {
|
|
|
|
error = tsleep(&so->so_timeo,
|
2012-12-07 22:13:33 +00:00
|
|
|
PSOCK | PCATCH, "soclos",
|
|
|
|
so->so_linger * hz);
|
2006-11-22 23:54:29 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
drop:
|
|
|
|
if (so->so_proto->pr_usrreqs->pru_close != NULL)
|
|
|
|
(*so->so_proto->pr_usrreqs->pru_close)(so);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
if ((listening = (so->so_options & SO_ACCEPTCONN))) {
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *sp;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
TAILQ_INIT(&lqueue);
|
|
|
|
TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
|
|
|
|
TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
|
|
|
|
|
|
|
|
so->sol_qlen = so->sol_incqlen = 0;
|
|
|
|
|
|
|
|
TAILQ_FOREACH(sp, &lqueue, so_list) {
|
|
|
|
SOCK_LOCK(sp);
|
|
|
|
sp->so_qstate = SQ_NONE;
|
|
|
|
sp->so_listen = NULL;
|
|
|
|
SOCK_UNLOCK(sp);
|
|
|
|
/* Guaranteed not to be the last. */
|
|
|
|
refcount_release(&so->so_count);
|
1996-04-16 03:50:08 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_state |= SS_NOFDREF;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
sorele(so);
|
|
|
|
if (listening) {
|
2019-08-19 12:42:03 +00:00
|
|
|
struct socket *sp, *tsp;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
2019-08-19 12:42:03 +00:00
|
|
|
TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_LOCK(sp);
|
|
|
|
if (sp->so_count == 0) {
|
|
|
|
SOCK_UNLOCK(sp);
|
|
|
|
soabort(sp);
|
|
|
|
} else
|
|
|
|
/* sp is now in sofree() */
|
|
|
|
SOCK_UNLOCK(sp);
|
|
|
|
}
|
|
|
|
}
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-07-16 23:09:39 +00:00
|
|
|
* soabort() is used to abruptly tear down a connection, such as when a
|
|
|
|
* resource limit is reached (listen queue depth exceeded), or if a listen
|
|
|
|
* socket is closed while there are sockets waiting to be accepted.
|
2006-04-01 15:15:05 +00:00
|
|
|
*
|
|
|
|
* This interface is tricky, because it is called on an unreferenced socket,
|
|
|
|
* and must be called only by a thread that has actually removed the socket
|
|
|
|
* from the listen queue it was on, or races with other threads are risked.
|
|
|
|
*
|
|
|
|
* This interface will call into the protocol code, so must not be called
|
|
|
|
* with any socket locks held. Protocols do call it while holding their own
|
|
|
|
* recursible protocol mutexes, but this is something that should be subject
|
|
|
|
* to review in the future.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2006-03-16 07:03:14 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soabort(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2006-04-01 15:15:05 +00:00
|
|
|
/*
|
|
|
|
* In as much as is possible, assert that no references to this
|
|
|
|
* socket are held. This is not quite the same as asserting that the
|
|
|
|
* current thread is responsible for arranging for no references, but
|
|
|
|
* is as close as we can get for now.
|
|
|
|
*/
|
|
|
|
KASSERT(so->so_count == 0, ("soabort: so_count"));
|
2006-04-23 18:15:54 +00:00
|
|
|
KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
|
2006-04-01 15:15:05 +00:00
|
|
|
KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2006-04-01 15:15:05 +00:00
|
|
|
|
2006-07-21 17:11:15 +00:00
|
|
|
if (so->so_proto->pr_usrreqs->pru_abort != NULL)
|
2006-07-11 23:18:28 +00:00
|
|
|
(*so->so_proto->pr_usrreqs->pru_abort)(so);
|
2006-04-01 15:15:05 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
sofree(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soaccept(struct socket *so, struct sockaddr **nam)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_state &= ~SS_NOFDREF;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
2011-02-16 21:29:13 +00:00
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
2001-03-09 08:16:40 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
|
2013-03-02 21:11:30 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
return (soconnectat(AT_FDCWD, so, nam, td));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2002-05-31 11:52:35 +00:00
|
|
|
if (so->so_options & SO_ACCEPTCONN)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EOPNOTSUPP);
|
2010-02-20 22:29:28 +00:00
|
|
|
|
|
|
|
CURVNET_SET(so->so_vnet);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If protocol is connection-based, can only connect once.
|
2006-07-23 20:36:04 +00:00
|
|
|
* Otherwise, if connected, try to disconnect first. This allows
|
|
|
|
* user to disconnect by connecting to, e.g., a null address.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-05-31 11:52:35 +00:00
|
|
|
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
|
|
|
|
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
|
2005-06-09 19:59:09 +00:00
|
|
|
(error = sodisconnect(so)))) {
|
2002-05-31 11:52:35 +00:00
|
|
|
error = EISCONN;
|
2005-06-09 19:59:09 +00:00
|
|
|
} else {
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Prevent accumulated error from previous connection from
|
|
|
|
* biting us.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
|
|
|
so->so_error = 0;
|
2013-03-02 21:11:30 +00:00
|
|
|
if (fd == AT_FDCWD) {
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
|
|
|
|
nam, td);
|
|
|
|
} else {
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
|
|
|
|
so, nam, td);
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
2010-02-20 22:29:28 +00:00
|
|
|
CURVNET_RESTORE();
|
2005-06-09 19:59:09 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soconnect2(struct socket *so1, struct socket *so2)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so1->so_vnet);
|
|
|
|
error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sodisconnect(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0)
|
|
|
|
return (ENOTCONN);
|
|
|
|
if (so->so_state & SS_ISDISCONNECTING)
|
|
|
|
return (EALREADY);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
Correct two problems relating to sorflush(), which is called to flush
read socket buffers in shutdown() and close():
- Call socantrcvmore() before sblock() to dislodge any threads that
might be sleeping (potentially indefinitely) while holding sblock(),
such as a thread blocked in recv().
- Flag the sblock() call as non-interruptible so that a signal
delivered to the thread calling sorflush() doesn't cause sblock() to
fail. The sblock() is required to ensure that all other socket
consumer threads have, in fact, left, and do not enter, the socket
buffer until we're done flushing it.
To implement the latter, change the 'flags' argument to sblock() to
accept two flags, SBL_WAIT and SBL_NOINTR, rather than one M_WAITOK
flag. When SBL_NOINTR is set, it forces a non-interruptible sx
acquisition, regardless of the setting of the disposition of SB_NOINTR
on the socket buffer; without this change it would be possible for
another thread to clear SB_NOINTR between when the socket buffer mutex
is released and sblock() is invoked.
Reviewed by: bz, kmacy
Reported by: Jos Backus <jos at catnook dot com>
2008-01-31 08:22:24 +00:00
|
|
|
/*
 * Translate send/receive MSG_* flags into the sblock() wait flag: SBL_WAIT
 * unless the caller asked for MSG_DONTWAIT, in which case 0.
 */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) == 0) ? SBL_WAIT : 0)
|
2006-01-13 10:22:01 +00:00
|
|
|
|
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
2006-01-13 10:22:01 +00:00
|
|
|
{
|
2012-02-21 01:05:12 +00:00
|
|
|
long space;
|
|
|
|
ssize_t resid;
|
2006-01-13 10:22:01 +00:00
|
|
|
int clen = 0, error, dontroute;
|
|
|
|
|
2012-10-02 18:38:05 +00:00
|
|
|
KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
|
2006-01-13 10:22:01 +00:00
|
|
|
KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
|
2012-10-02 18:38:05 +00:00
|
|
|
("sosend_dgram: !PR_ATOMIC"));
|
2006-01-13 10:22:01 +00:00
|
|
|
|
|
|
|
if (uio != NULL)
|
|
|
|
resid = uio->uio_resid;
|
|
|
|
else
|
|
|
|
resid = top->m_pkthdr.len;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* In theory resid should be unsigned. However, space must be
|
|
|
|
* signed, as it might be less than 0 if we over-committed, and we
|
|
|
|
* must use a signed comparison of space and resid. On the other
|
|
|
|
* hand, a negative resid causes us to loop sending 0-length
|
|
|
|
* segments to the protocol.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
if (resid < 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
dontroute =
|
|
|
|
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
|
|
|
|
if (td != NULL)
|
2007-06-01 01:12:45 +00:00
|
|
|
td->td_ru.ru_msgsnd++;
|
2006-01-13 10:22:01 +00:00
|
|
|
if (control != NULL)
|
|
|
|
clen = control->m_len;
|
|
|
|
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EPIPE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0) {
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* `sendto' and `sendmsg' is allowed on a connection-based
|
|
|
|
* socket if it supports implied connect. Return ENOTCONN if
|
|
|
|
* not connected and no address is supplied.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
|
|
|
|
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
|
|
|
|
!(resid == 0 && clen != 0)) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = ENOTCONN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
} else if (addr == NULL) {
|
|
|
|
if (so->so_proto->pr_flags & PR_CONNREQUIRED)
|
|
|
|
error = ENOTCONN;
|
|
|
|
else
|
|
|
|
error = EDESTADDRREQ;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
|
|
|
|
* problem and need fixing.
|
|
|
|
*/
|
|
|
|
space = sbspace(&so->so_snd);
|
|
|
|
if (flags & MSG_OOB)
|
|
|
|
space += 1024;
|
|
|
|
space -= clen;
|
2006-09-13 06:58:40 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
2006-01-13 10:22:01 +00:00
|
|
|
if (resid > space) {
|
|
|
|
error = EMSGSIZE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (uio == NULL) {
|
|
|
|
resid = 0;
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
top->m_flags |= M_EOR;
|
|
|
|
} else {
|
2007-01-22 14:50:28 +00:00
|
|
|
/*
|
|
|
|
* Copy the data from userland into a mbuf chain.
|
|
|
|
* If no data is to be copied in, a single empty mbuf
|
|
|
|
* is returned.
|
|
|
|
*/
|
2006-11-02 17:45:28 +00:00
|
|
|
top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
|
|
|
|
(M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
|
|
|
|
if (top == NULL) {
|
|
|
|
error = EFAULT; /* only possible error */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
space -= resid - uio->uio_resid;
|
2006-01-13 10:22:01 +00:00
|
|
|
resid = uio->uio_resid;
|
|
|
|
}
|
|
|
|
KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
|
|
|
|
/*
|
|
|
|
* XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
|
|
|
|
* than with.
|
|
|
|
*/
|
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options |= SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* XXX all the SBS_CANTSENDMORE checks previously done could be out
|
2016-05-22 13:10:48 +00:00
|
|
|
* of date. We could have received a reset packet in an interrupt or
|
2006-07-23 20:36:04 +00:00
|
|
|
* maybe we slept while doing page faults in uiomove() etc. We could
|
|
|
|
* probably recheck again inside the locking protection here, but
|
|
|
|
* there are probably other places that this also happens. We must
|
|
|
|
* rethink this.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2006-01-13 10:22:01 +00:00
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_send)(so,
|
|
|
|
(flags & MSG_OOB) ? PRUS_OOB :
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user set MSG_EOF, the protocol understands this flag and
|
|
|
|
* nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
|
2006-01-13 10:22:01 +00:00
|
|
|
*/
|
|
|
|
((flags & MSG_EOF) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) &&
|
|
|
|
(resid <= 0)) ?
|
|
|
|
PRUS_EOF :
|
|
|
|
/* If there is more to send set PRUS_MORETOCOME */
|
2017-01-06 23:41:45 +00:00
|
|
|
(flags & MSG_MORETOCOME) ||
|
2006-01-13 10:22:01 +00:00
|
|
|
(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
|
|
|
|
top, addr, control, td);
|
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options &= ~SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
clen = 0;
|
|
|
|
control = NULL;
|
|
|
|
top = NULL;
|
|
|
|
out:
|
|
|
|
if (top != NULL)
|
|
|
|
m_freem(top);
|
|
|
|
if (control != NULL)
|
|
|
|
m_freem(control);
|
|
|
|
return (error);
|
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Send on a socket. If send must go all at once and message is larger than
|
|
|
|
* send buffering, then hard error. Lock against other senders. If must go
|
|
|
|
* all at once and not enough room now, then inform user that this would
|
|
|
|
* block and do nothing. Otherwise, if nonblocking, send as much as
|
|
|
|
* possible. The data to be sent is described by "uio" if nonzero, otherwise
|
|
|
|
* by the mbuf chain "top" (which must be null if uio is not). Data provided
|
|
|
|
* in mbuf chain must be small enough to send all at once.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Returns nonzero on error, timeout or signal; callers must check for short
|
|
|
|
* counts if EINTR/ERESTART are returned. Data and control buffers are freed
|
|
|
|
* on return.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2012-02-21 01:05:12 +00:00
|
|
|
long space;
|
|
|
|
ssize_t resid;
|
2005-06-09 19:59:09 +00:00
|
|
|
int clen = 0, error, dontroute;
|
1994-05-24 10:09:53 +00:00
|
|
|
int atomic = sosendallatonce(so) || top;
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile_iodone() calls ktls_enqueue() instead of pru_ready() leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
int pru_flag;
|
|
|
|
#ifdef KERN_TLS
|
|
|
|
struct ktls_session *tls;
|
|
|
|
int tls_enq_cnt, tls_pruflag;
|
|
|
|
uint8_t tls_rtype;
|
|
|
|
|
|
|
|
tls = NULL;
|
|
|
|
tls_rtype = TLS_RLTYPE_APP;
|
|
|
|
#endif
|
2005-06-09 19:59:09 +00:00
|
|
|
if (uio != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
resid = uio->uio_resid;
|
2020-04-27 23:55:09 +00:00
|
|
|
else if ((top->m_flags & M_PKTHDR) != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
resid = top->m_pkthdr.len;
|
2020-04-27 23:55:09 +00:00
|
|
|
else
|
|
|
|
resid = m_length(top, NULL);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* In theory resid should be unsigned. However, space must be
|
|
|
|
* signed, as it might be less than 0 if we over-committed, and we
|
|
|
|
* must use a signed comparison of space and resid. On the other
|
|
|
|
* hand, a negative resid causes us to loop sending 0-length
|
|
|
|
* segments to the protocol.
|
1997-11-09 05:07:40 +00:00
|
|
|
*
|
|
|
|
* Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
|
|
|
|
* type sockets since that's an error.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1999-01-27 21:50:00 +00:00
|
|
|
if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
|
1997-11-09 05:07:40 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
dontroute =
|
|
|
|
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
|
|
|
|
(so->so_proto->pr_flags & PR_ATOMIC);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (td != NULL)
|
2007-06-01 01:12:45 +00:00
|
|
|
td->td_ru.ru_msgsnd++;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (control != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
clen = control->m_len;
|
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
error = sblock(&so->so_snd, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto out;
|
|
|
|
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile_iodone() calls ktls_enqueue() instead of pru_ready() leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
tls_pruflag = 0;
|
|
|
|
tls = ktls_hold(so->so_snd.sb_tls_info);
|
|
|
|
if (tls != NULL) {
|
2019-10-08 21:34:06 +00:00
|
|
|
if (tls->mode == TCP_TLS_MODE_SW)
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile(2), sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
tls_pruflag = PRUS_NOTREADY;
|
|
|
|
|
|
|
|
if (control != NULL) {
|
|
|
|
struct cmsghdr *cm = mtod(control, struct cmsghdr *);
|
|
|
|
|
|
|
|
if (clen >= sizeof(*cm) &&
|
|
|
|
cm->cmsg_type == TLS_SET_RECORD_TYPE) {
|
|
|
|
tls_rtype = *((uint8_t *)CMSG_DATA(cm));
|
|
|
|
clen = 0;
|
|
|
|
m_freem(control);
|
|
|
|
control = NULL;
|
|
|
|
atomic = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2007-05-03 14:42:42 +00:00
|
|
|
restart:
|
1994-05-24 10:09:53 +00:00
|
|
|
do {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EPIPE;
|
|
|
|
goto release;
|
|
|
|
}
|
1998-02-19 19:38:20 +00:00
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1998-02-19 19:38:20 +00:00
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((so->so_state & SS_ISCONNECTED) == 0) {
|
1995-02-07 02:01:16 +00:00
|
|
|
/*
|
|
|
|
* `sendto' and `sendmsg' are allowed on a connection-
|
|
|
|
* based socket if it supports implied connect.
|
|
|
|
* Return ENOTCONN if not connected and no address is
|
|
|
|
* supplied.
|
|
|
|
*/
|
|
|
|
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
|
2007-05-03 14:42:42 +00:00
|
|
|
!(resid == 0 && clen != 0)) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = ENOTCONN;
|
|
|
|
goto release;
|
|
|
|
}
|
|
|
|
} else if (addr == NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
if (so->so_proto->pr_flags & PR_CONNREQUIRED)
|
|
|
|
error = ENOTCONN;
|
|
|
|
else
|
|
|
|
error = EDESTADDRREQ;
|
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
space = sbspace(&so->so_snd);
|
|
|
|
if (flags & MSG_OOB)
|
|
|
|
space += 1024;
|
1994-10-02 17:35:40 +00:00
|
|
|
if ((atomic && resid > so->so_snd.sb_hiwat) ||
|
2007-05-03 14:42:42 +00:00
|
|
|
clen > so->so_snd.sb_hiwat) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EMSGSIZE;
|
|
|
|
goto release;
|
|
|
|
}
|
2002-02-28 11:22:40 +00:00
|
|
|
if (space < resid + clen &&
|
1994-05-24 10:09:53 +00:00
|
|
|
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
|
2019-01-04 17:31:50 +00:00
|
|
|
if ((so->so_state & SS_NBIO) ||
|
|
|
|
(flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
error = EWOULDBLOCK;
|
|
|
|
goto release;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
error = sbwait(&so->so_snd);
|
2007-05-08 12:34:14 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1994-05-24 10:09:53 +00:00
|
|
|
space -= clen;
|
|
|
|
do {
|
2005-11-28 18:09:03 +00:00
|
|
|
if (uio == NULL) {
|
|
|
|
resid = 0;
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
top->m_flags |= M_EOR;
|
2020-05-27 23:20:35 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
if (tls != NULL) {
|
|
|
|
ktls_frame(top, tls, &tls_enq_cnt,
|
|
|
|
tls_rtype);
|
|
|
|
tls_rtype = TLS_RLTYPE_APP;
|
|
|
|
}
|
|
|
|
#endif
|
2014-12-20 22:12:04 +00:00
|
|
|
} else {
|
2007-01-22 14:50:28 +00:00
|
|
|
/*
|
|
|
|
* Copy the data from userland into a mbuf
|
2014-12-20 22:12:04 +00:00
|
|
|
* chain. If resid is 0, which can happen
|
|
|
|
* only if we have control to send, then
|
|
|
|
* a single empty mbuf is returned. This
|
|
|
|
* is a workaround to prevent protocol send
|
|
|
|
* methods from panicking.
|
2007-01-22 14:50:28 +00:00
|
|
|
*/
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile(2), sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
if (tls != NULL) {
|
|
|
|
top = m_uiotombuf(uio, M_WAITOK, space,
|
|
|
|
tls->params.max_frame_len,
|
2020-05-03 00:21:11 +00:00
|
|
|
M_EXTPG |
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile(2), sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
((flags & MSG_EOR) ? M_EOR : 0));
|
|
|
|
if (top != NULL) {
|
2020-02-25 19:26:40 +00:00
|
|
|
ktls_frame(top, tls,
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile(2), sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
&tls_enq_cnt, tls_rtype);
|
|
|
|
}
|
|
|
|
tls_rtype = TLS_RLTYPE_APP;
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
top = m_uiotombuf(uio, M_WAITOK, space,
|
|
|
|
(atomic ? max_hdr : 0),
|
|
|
|
(atomic ? M_PKTHDR : 0) |
|
|
|
|
((flags & MSG_EOR) ? M_EOR : 0));
|
2006-11-02 17:45:28 +00:00
|
|
|
if (top == NULL) {
|
|
|
|
error = EFAULT; /* only possible error */
|
|
|
|
goto release;
|
|
|
|
}
|
|
|
|
space -= resid - uio->uio_resid;
|
2005-11-28 18:09:03 +00:00
|
|
|
resid = uio->uio_resid;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options |= SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-11-28 18:09:03 +00:00
|
|
|
/*
|
|
|
|
* XXX all the SBS_CANTSENDMORE checks previously
|
2016-04-29 22:15:33 +00:00
|
|
|
* done could be out of date. We could have received
|
2005-11-28 18:09:03 +00:00
|
|
|
* a reset packet in an interrupt or maybe we slept
|
2006-07-23 20:36:04 +00:00
|
|
|
* while doing page faults in uiomove() etc. We
|
|
|
|
* could probably recheck again inside the locking
|
|
|
|
* protection here, but there are probably other
|
|
|
|
* places that this also happens. We must rethink
|
|
|
|
* this.
|
2005-11-28 18:09:03 +00:00
|
|
|
*/
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of
sendfile(2), sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
|
|
|
|
pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
|
1995-02-07 02:01:16 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user set MSG_EOF, the protocol understands
|
|
|
|
* this flag and nothing left to send then use
|
|
|
|
* PRU_SEND_EOF instead of PRU_SEND.
|
1995-02-07 02:01:16 +00:00
|
|
|
*/
|
2005-11-28 18:09:03 +00:00
|
|
|
((flags & MSG_EOF) &&
|
|
|
|
(so->so_proto->pr_flags & PR_IMPLOPCL) &&
|
|
|
|
(resid <= 0)) ?
|
1999-01-20 17:32:01 +00:00
|
|
|
PRUS_EOF :
|
2006-07-23 20:36:04 +00:00
|
|
|
/* If there is more to send set PRUS_MORETOCOME. */
|
2017-01-06 23:41:45 +00:00
|
|
|
(flags & MSG_MORETOCOME) ||
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of sendfile(2),
sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
|
|
|
|
|
|
|
|
#ifdef KERN_TLS
|
|
|
|
pru_flag |= tls_pruflag;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
error = (*so->so_proto->pr_usrreqs->pru_send)(so,
|
|
|
|
pru_flag, top, addr, control, td);
|
|
|
|
|
2005-11-28 18:09:03 +00:00
|
|
|
if (dontroute) {
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_options &= ~SO_DONTROUTE;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of sendfile(2),
sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
|
|
|
|
#ifdef KERN_TLS
|
2019-10-08 21:34:06 +00:00
|
|
|
if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of sendfile(2),
sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
/*
|
|
|
|
* Note that error is intentionally
|
|
|
|
* ignored.
|
|
|
|
*
|
|
|
|
* Like sendfile(), we rely on the
|
|
|
|
* completion routine (pru_ready())
|
|
|
|
* to free the mbufs in the event that
|
|
|
|
* pru_send() encountered an error and
|
|
|
|
* did not append them to the sockbuf.
|
|
|
|
*/
|
|
|
|
soref(so);
|
|
|
|
ktls_enqueue(top, so, tls_enq_cnt);
|
|
|
|
}
|
|
|
|
#endif
|
2005-11-28 18:09:03 +00:00
|
|
|
clen = 0;
|
|
|
|
control = NULL;
|
|
|
|
top = NULL;
|
2007-05-03 14:42:42 +00:00
|
|
|
if (error)
|
2005-11-28 18:09:03 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
} while (resid && space > 0);
|
|
|
|
} while (resid);
|
|
|
|
|
|
|
|
release:
|
|
|
|
sbunlock(&so->so_snd);
|
|
|
|
out:
|
Add kernel-side support for in-kernel TLS.
KTLS adds support for in-kernel framing and encryption of Transport
Layer Security (1.0-1.2) data on TCP sockets. KTLS only supports
offload of TLS for transmitted data. Key negotiation must still be
performed in userland. Once completed, transmit session keys for a
connection are provided to the kernel via a new TCP_TXTLS_ENABLE
socket option. All subsequent data transmitted on the socket is
placed into TLS frames and encrypted using the supplied keys.
Any data written to a KTLS-enabled socket via write(2), aio_write(2),
or sendfile(2) is assumed to be application data and is encoded in TLS
frames with an application data type. Individual records can be sent
with a custom type (e.g. handshake messages) via sendmsg(2) with a new
control message (TLS_SET_RECORD_TYPE) specifying the record type.
At present, rekeying is not supported though the in-kernel framework
should support rekeying.
KTLS makes use of the recently added unmapped mbufs to store TLS
frames in the socket buffer. Each TLS frame is described by a single
ext_pgs mbuf. The ext_pgs structure contains the header of the TLS
record (and trailer for encrypted records) as well as references to
the associated TLS session.
KTLS supports two primary methods of encrypting TLS frames: software
TLS and ifnet TLS.
Software TLS marks mbufs holding socket data as not ready via
M_NOTREADY similar to sendfile(2) when TLS framing information is
added to an unmapped mbuf in ktls_frame(). ktls_enqueue() is then
called to schedule TLS frames for encryption. In the case of sendfile(2),
sendfile_iodone() calls ktls_enqueue() instead of pru_ready(), leaving
the mbufs marked M_NOTREADY until encryption is completed. For other
writes (vn_sendfile when pages are available, write(2), etc.), the
PRUS_NOTREADY is set when invoking pru_send() along with invoking
ktls_enqueue().
A pool of worker threads (the "KTLS" kernel process) encrypts TLS
frames queued via ktls_enqueue(). Each TLS frame is temporarily
mapped using the direct map and passed to a software encryption
backend to perform the actual encryption.
(Note: The use of PHYS_TO_DMAP could be replaced with sf_bufs if
someone wished to make this work on architectures without a direct
map.)
KTLS supports pluggable software encryption backends. Internally,
Netflix uses proprietary pure-software backends. This commit includes
a simple backend in a new ktls_ocf.ko module that uses the kernel's
OpenCrypto framework to provide AES-GCM encryption of TLS frames. As
a result, software TLS is now a bit of a misnomer as it can make use
of hardware crypto accelerators.
Once software encryption has finished, the TLS frame mbufs are marked
ready via pru_ready(). At this point, the encrypted data appears as
regular payload to the TCP stack stored in unmapped mbufs.
ifnet TLS permits a NIC to offload the TLS encryption and TCP
segmentation. In this mode, a new send tag type (IF_SND_TAG_TYPE_TLS)
is allocated on the interface a socket is routed over and associated
with a TLS session. TLS records for a TLS session using ifnet TLS are
not marked M_NOTREADY but are passed down the stack unencrypted. The
ip_output_send() and ip6_output_send() helper functions that apply
send tags to outbound IP packets verify that the send tag of the TLS
record matches the outbound interface. If so, the packet is tagged
with the TLS send tag and sent to the interface. The NIC device
driver must recognize packets with the TLS send tag and schedule them
for TLS encryption and TCP segmentation. If the outbound
interface does not match the interface in the TLS send tag, the packet
is dropped. In addition, a task is scheduled to refresh the TLS send
tag for the TLS session. If a new TLS send tag cannot be allocated,
the connection is dropped. If a new TLS send tag is allocated,
however, subsequent packets will be tagged with the correct TLS send
tag. (This latter case has been tested by configuring both ports of a
Chelsio T6 in a lagg and failing over from one port to another. As
the connections migrated to the new port, new TLS send tags were
allocated for the new port and connections resumed without being
dropped.)
ifnet TLS can be enabled and disabled on supported network interfaces
via new '[-]txtls[46]' options to ifconfig(8). ifnet TLS is supported
across both vlan devices and lagg interfaces using failover, lacp with
flowid enabled, or lacp with flowid enabled.
Applications may request the current KTLS mode of a connection via a
new TCP_TXTLS_MODE socket option. They can also use this socket
option to toggle between software and ifnet TLS modes.
In addition, a testing tool is available in tools/tools/switch_tls.
This is modeled on tcpdrop and uses similar syntax. However, instead
of dropping connections, -s is used to force KTLS connections to
switch to software TLS and -i is used to switch to ifnet TLS.
Various sysctls and counters are available under the kern.ipc.tls
sysctl node. The kern.ipc.tls.enable node must be set to true to
enable KTLS (it is off by default). The use of unmapped mbufs must
also be enabled via kern.ipc.mb_use_ext_pgs to enable KTLS.
KTLS is enabled via the KERN_TLS kernel option.
This patch is the culmination of years of work by several folks
including Scott Long and Randall Stewart for the original design and
implementation; Drew Gallatin for several optimizations including the
use of ext_pgs mbufs, the M_NOTREADY mechanism for TLS records
awaiting software encryption, and pluggable software crypto backends;
and John Baldwin for modifications to support hardware TLS offload.
Reviewed by: gallatin, hselasky, rrs
Obtained from: Netflix
Sponsored by: Netflix, Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D21277
2019-08-27 00:01:56 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
if (tls != NULL)
|
|
|
|
ktls_free(tls);
|
|
|
|
#endif
|
2005-06-09 19:59:09 +00:00
|
|
|
if (top != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(top);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (control != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
m_freem(control);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
|
|
|
|
struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
{
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, socket's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
int error;
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2017-06-15 20:11:29 +00:00
|
|
|
if (!SOLISTENING(so))
|
|
|
|
error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
|
|
|
|
top, control, flags, td);
|
|
|
|
else {
|
|
|
|
m_freem(top);
|
|
|
|
m_freem(control);
|
|
|
|
error = ENOTCONN;
|
|
|
|
}
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The curvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, with an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typical cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* The part of soreceive() that implements reading non-inline out-of-band
|
|
|
|
* data from a socket. For more complete comments, see soreceive(), from
|
|
|
|
* which this code originated.
|
|
|
|
*
|
|
|
|
* Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
|
|
|
|
* unable to return an mbuf chain to the caller.
|
|
|
|
*/
|
|
|
|
static int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
|
2005-06-09 19:59:09 +00:00
|
|
|
{
|
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *m;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
m = m_get(M_WAITOK, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
do {
|
|
|
|
error = uiomove(mtod(m, void *),
|
|
|
|
(int) min(uio->uio_resid, m->m_len), uio);
|
|
|
|
m = m_free(m);
|
|
|
|
} while (uio->uio_resid && error == 0 && m);
|
|
|
|
bad:
|
|
|
|
if (m != NULL)
|
|
|
|
m_freem(m);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Following replacement or removal of the first mbuf on the first mbuf chain
|
|
|
|
* of a socket buffer, push necessary state changes back into the socket
|
|
|
|
* buffer so that other consumers see the values consistently. 'nextrecord'
|
|
|
|
* is the callers locally stored value of the original value of
|
|
|
|
* sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
|
|
|
|
* NOTE: 'nextrecord' may be NULL.
|
|
|
|
*/
|
|
|
|
static __inline void
|
|
|
|
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
|
|
|
|
{
|
|
|
|
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
/*
|
|
|
|
* First, update for the new value of nextrecord. If necessary, make
|
|
|
|
* it the first record.
|
|
|
|
*/
|
|
|
|
if (sb->sb_mb != NULL)
|
|
|
|
sb->sb_mb->m_nextpkt = nextrecord;
|
|
|
|
else
|
|
|
|
sb->sb_mb = nextrecord;
|
|
|
|
|
2012-12-07 22:13:33 +00:00
|
|
|
/*
|
|
|
|
* Now update any dependent socket buffer fields to reflect the new
|
|
|
|
* state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
|
2005-06-09 19:59:09 +00:00
|
|
|
* addition of a second clause that takes care of the case where
|
|
|
|
* sb_mb has been updated, but remains the last record.
|
2012-12-07 22:13:33 +00:00
|
|
|
*/
|
|
|
|
if (sb->sb_mb == NULL) {
|
|
|
|
sb->sb_mbtail = NULL;
|
|
|
|
sb->sb_lastrecord = NULL;
|
|
|
|
} else if (sb->sb_mb->m_nextpkt == NULL)
|
|
|
|
sb->sb_lastrecord = sb->sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Implement receive operations on a socket. We depend on the way that
|
|
|
|
* records are added to the sockbuf by sbappend. In particular, each record
|
|
|
|
* (mbufs linked through m_next) must begin with an address if the protocol
|
|
|
|
* so specifies, followed by an optional mbuf or mbufs containing ancillary
|
|
|
|
* data, and then zero or more mbufs of data. In order to allow parallelism
|
|
|
|
* between network receive and copying to user space, as well as avoid
|
|
|
|
* sleeping with a mutex held, we release the socket buffer mutex during the
|
|
|
|
* user space copy. Although the sockbuf is locked, new data may still be
|
|
|
|
* appended, and thus we must maintain consistency of the sockbuf during that
|
|
|
|
* time.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
2006-07-23 20:36:04 +00:00
|
|
|
* The caller may receive the data as a single mbuf chain by supplying an
|
|
|
|
* mbuf **mp0 for use in returning the chain. The uio is then used only for
|
|
|
|
* the count in uio_resid.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct mbuf *m, **mp;
|
2012-02-21 01:05:12 +00:00
|
|
|
int flags, error, offset;
|
|
|
|
ssize_t len;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *nextrecord;
|
1994-05-25 09:21:21 +00:00
|
|
|
int moff, type = 0;
|
2012-02-21 01:05:12 +00:00
|
|
|
ssize_t orig_resid = uio->uio_resid;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
mp = mp0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
|
|
|
if (flagsp != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (flags & MSG_OOB)
|
|
|
|
return (soreceive_rcvoob(so, uio, flags));
|
|
|
|
if (mp != NULL)
|
|
|
|
*mp = NULL;
|
|
|
|
if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
|
2011-02-16 21:29:13 +00:00
|
|
|
&& uio->uio_resid) {
|
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, 0);
|
2011-02-16 21:29:13 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1994-10-02 17:35:40 +00:00
|
|
|
error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2007-05-03 14:42:42 +00:00
|
|
|
restart:
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If we have less data than requested, block awaiting more (subject
|
|
|
|
* to any timeout) if:
|
1994-05-24 10:09:53 +00:00
|
|
|
* 1. the current count is less than the low water mark, or
|
2012-09-02 07:33:52 +00:00
|
|
|
* 2. MSG_DONTWAIT is not set
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
|
2014-11-12 09:57:15 +00:00
|
|
|
sbavail(&so->so_rcv) < uio->uio_resid) &&
|
|
|
|
sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
|
2005-06-09 19:59:09 +00:00
|
|
|
m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
|
2014-11-12 09:57:15 +00:00
|
|
|
KASSERT(m != NULL || !sbavail(&so->so_rcv),
|
|
|
|
("receive: m == %p sbavail == %u",
|
|
|
|
m, sbavail(&so->so_rcv)));
|
1994-05-24 10:09:53 +00:00
|
|
|
if (so->so_error) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto dontblock;
|
|
|
|
error = so->so_error;
|
|
|
|
if ((flags & MSG_PEEK) == 0)
|
|
|
|
so->so_error = 0;
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
2007-05-03 14:42:42 +00:00
|
|
|
if (m == NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
2007-05-03 14:42:42 +00:00
|
|
|
} else
|
|
|
|
goto dontblock;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
for (; m != NULL; m = m->m_next)
|
1994-05-24 10:09:53 +00:00
|
|
|
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
|
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
goto dontblock;
|
|
|
|
}
|
2020-05-14 17:54:08 +00:00
|
|
|
if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
|
|
|
|
SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
|
|
|
|
(so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = ENOTCONN;
|
|
|
|
goto release;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
if (uio->uio_resid == 0) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto release;
|
2007-05-03 14:42:42 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
if ((so->so_state & SS_NBIO) ||
|
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EWOULDBLOCK;
|
|
|
|
goto release;
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
error = sbwait(&so->so_rcv);
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error)
|
2007-05-03 14:42:42 +00:00
|
|
|
goto release;
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
dontblock:
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* From this point onward, we maintain 'nextrecord' as a cache of the
|
|
|
|
* pointer to the next record in the socket buffer. We must keep the
|
|
|
|
* various socket buffer pointers and local stack versions of the
|
|
|
|
* pointers in sync, pushing out modifications before dropping the
|
|
|
|
* socket buffer mutex, and re-reading them when picking it up.
|
|
|
|
*
|
|
|
|
* Otherwise, we will race with the network stack appending new data
|
|
|
|
* or records onto the socket buffer by using inconsistent/stale
|
|
|
|
* versions of the field, possibly resulting in socket buffer
|
|
|
|
* corruption.
|
|
|
|
*
|
|
|
|
* By holding the high-level sblock(), we prevent simultaneous
|
|
|
|
* readers from pulling off the front of the socket buffer.
|
|
|
|
*/
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (uio->uio_td)
|
2007-06-01 01:12:45 +00:00
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
if (pr->pr_flags & PR_ADDR) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m->m_type == MT_SONAME,
|
|
|
|
("m->m_type == %d", m->m_type));
|
1994-05-24 10:09:53 +00:00
|
|
|
orig_resid = 0;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (psa != NULL)
|
|
|
|
*psa = sodupsockaddr(mtod(m, struct sockaddr *),
|
|
|
|
M_NOWAIT);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
m = m->m_next;
|
|
|
|
} else {
|
|
|
|
sbfree(&so->so_rcv, m);
|
2002-02-05 02:00:56 +00:00
|
|
|
so->so_rcv.sb_mb = m_free(m);
|
1997-08-16 19:16:27 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Process one or more MT_CONTROL mbufs present before any data mbufs
|
|
|
|
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
|
|
|
|
* just copy the data; if !MSG_PEEK, we call into the protocol to
|
|
|
|
* perform externalization (or freeing if controlp == NULL).
|
|
|
|
*/
|
|
|
|
if (m != NULL && m->m_type == MT_CONTROL) {
|
|
|
|
struct mbuf *cm = NULL, *cmn;
|
|
|
|
struct mbuf **cme = &cm;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
if (controlp != NULL) {
|
2016-09-15 07:41:48 +00:00
|
|
|
*controlp = m_copym(m, 0, m->m_len,
|
|
|
|
M_NOWAIT);
|
2005-06-09 19:59:09 +00:00
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
m = m->m_next;
|
2005-06-09 19:56:38 +00:00
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
sbfree(&so->so_rcv, m);
|
|
|
|
so->so_rcv.sb_mb = m->m_next;
|
|
|
|
m->m_next = NULL;
|
|
|
|
*cme = m;
|
|
|
|
cme = &(*cme)->m_next;
|
2004-07-11 23:13:14 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
} while (m != NULL && m->m_type == MT_CONTROL);
|
|
|
|
if ((flags & MSG_PEEK) == 0)
|
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
|
|
|
while (cm != NULL) {
|
|
|
|
cmn = cm->m_next;
|
|
|
|
cm->m_next = NULL;
|
|
|
|
if (pr->pr_domain->dom_externalize != NULL) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2005-06-09 19:59:09 +00:00
|
|
|
error = (*pr->pr_domain->dom_externalize)
|
2013-03-19 20:58:17 +00:00
|
|
|
(cm, controlp, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
} else if (controlp != NULL)
|
|
|
|
*controlp = cm;
|
|
|
|
else
|
|
|
|
m_freem(cm);
|
|
|
|
if (controlp != NULL) {
|
|
|
|
orig_resid = 0;
|
|
|
|
while (*controlp != NULL)
|
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
cm = cmn;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2006-08-18 14:05:13 +00:00
|
|
|
if (m != NULL)
|
2005-07-28 10:10:01 +00:00
|
|
|
nextrecord = so->so_rcv.sb_mb->m_nextpkt;
|
|
|
|
else
|
2006-08-18 14:05:13 +00:00
|
|
|
nextrecord = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
orig_resid = 0;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL) {
|
2003-10-28 05:47:40 +00:00
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(m->m_nextpkt == nextrecord,
|
|
|
|
("soreceive: post-control, nextrecord !sync"));
|
2003-10-28 05:47:40 +00:00
|
|
|
if (nextrecord == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_mb == m,
|
2005-06-09 19:59:09 +00:00
|
|
|
("soreceive: post-control, sb_mb!=m"));
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == m,
|
|
|
|
("soreceive: post-control, lastrecord!=m"));
|
2003-10-28 05:47:40 +00:00
|
|
|
}
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
type = m->m_type;
|
|
|
|
if (type == MT_OOBDATA)
|
|
|
|
flags |= MSG_OOB;
|
2003-10-28 05:47:40 +00:00
|
|
|
} else {
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
KASSERT(so->so_rcv.sb_mb == nextrecord,
|
|
|
|
("soreceive: sb_mb != nextrecord"));
|
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == NULL,
|
|
|
|
("soreceive: sb_lastercord != NULL"));
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* Now continue to read any data mbufs off of the head of the socket
|
|
|
|
* buffer until the read request is satisfied. Note that 'type' is
|
|
|
|
* used to store the type of any mbuf reads that have happened so far
|
|
|
|
* such that soreceive() can stop reading if the type changes, which
|
|
|
|
* causes soreceive() to return only one of regular data and inline
|
|
|
|
* out-of-band data in a single socket receive operation.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
moff = 0;
|
|
|
|
offset = 0;
|
Merge from projects/sendfile:
o Introduce a notion of "not ready" mbufs in socket buffers. These
mbufs are now being populated by some I/O in background and are
referenced outside. This forces following implications:
- An mbuf which is "not ready" can't be taken out of the buffer.
- An mbuf that is behind a "not ready" in the queue neither.
- If sockbet buffer is flushed, then "not ready" mbufs shouln't be
freed.
o In struct sockbuf the sb_cc field is split into sb_ccc and sb_acc.
The sb_ccc stands for ""claimed character count", or "committed
character count". And the sb_acc is "available character count".
Consumers of socket buffer API shouldn't already access them directly,
but use sbused() and sbavail() respectively.
o Not ready mbufs are marked with M_NOTREADY, and ready but blocked ones
with M_BLOCKED.
o New field sb_fnrdy points to the first not ready mbuf, to avoid linear
search.
o New function sbready() is provided to activate certain amount of mbufs
in a socket buffer.
A special note on SCTP:
SCTP has its own sockbufs. Unfortunately, FreeBSD stack doesn't yet
allow protocol specific sockbufs. Thus, SCTP does some hacks to make
itself compatible with FreeBSD: it manages sockbufs on its own, but keeps
sb_cc updated to inform the stack of amount of data in them. The new
notion of "not ready" data isn't supported by SCTP. Instead, only a
mechanical substitute is done: s/sb_cc/sb_ccc/.
A proper solution would be to take away struct sockbuf from struct
socket and allow protocols to implement their own socket buffers, like
SCTP already does. This was discussed with rrs@.
Sponsored by: Netflix
Sponsored by: Nginx, Inc.
2014-11-30 12:52:33 +00:00
|
|
|
while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
|
|
|
|
&& error == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
|
|
|
* If the type of mbuf has changed since the last mbuf
|
|
|
|
* examined ('type'), end the receive operation.
|
2012-12-07 22:13:33 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2012-09-02 07:29:37 +00:00
|
|
|
if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
|
|
|
|
if (type != m->m_type)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
} else if (type == MT_OOBDATA)
|
|
|
|
break;
|
1999-01-08 17:31:30 +00:00
|
|
|
else
|
2005-11-02 13:46:32 +00:00
|
|
|
KASSERT(m->m_type == MT_DATA,
|
2005-06-09 19:59:09 +00:00
|
|
|
("m->m_type == %d", m->m_type));
|
|
|
|
so->so_rcv.sb_state &= ~SBS_RCVATMARK;
|
1994-05-24 10:09:53 +00:00
|
|
|
len = uio->uio_resid;
|
|
|
|
if (so->so_oobmark && len > so->so_oobmark - offset)
|
|
|
|
len = so->so_oobmark - offset;
|
|
|
|
if (len > m->m_len - moff)
|
|
|
|
len = m->m_len - moff;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If mp is set, just pass back the mbufs. Otherwise copy
|
|
|
|
* them out via the uio, then free. Sockbuf must be
|
|
|
|
* consistent here (points to current mbuf, it points to next
|
|
|
|
* record) when we drop priority; we must note any additions
|
|
|
|
* to the sockbuf when we block interrupts again.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp == NULL) {
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2020-05-03 00:21:11 +00:00
|
|
|
if ((m->m_flags & M_EXTPG) != 0)
|
Add an external mbuf buffer type that holds multiple unmapped pages.
Unmapped mbufs allow sendfile to carry multiple pages of data in a
single mbuf, without mapping those pages. It is a requirement for
Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
serving workloads when used by sendfile, due to effectively
compressing socket buffers by an order of magnitude, and hence
reducing cache misses.
For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
now points to a struct mbuf_ext_pgs structure instead of a data
buffer. This structure contains an array of physical addresses (this
reduces cache misses compared to an earlier version that stored an
array of vm_page_t pointers). It also stores additional fields needed
for in-kernel TLS such as the TLS header and trailer data that are
currently unused. To more easily detect these mbufs, the M_NOMAP flag
is set in m_flags in addition to M_EXT.
Various functions like m_copydata() have been updated to safely access
packet contents (using uiomove_fromphys()), to make things like BPF
safe.
NIC drivers advertise support for unmapped mbufs on transmit via a new
IFCAP_NOMAP capability. This capability can be toggled via the new
'nomap' and '-nomap' ifconfig(8) commands. For NIC drivers that only
transmit packet contents via DMA and use bus_dma, adding the
capability to if_capabilities and if_capenable should be all that is
required.
If a NIC does not support unmapped mbufs, they are converted to a
chain of mapped mbufs (using sf_bufs to provide the mapping) in
ip_output or ip6_output. If an unmapped mbuf requires software
checksums, it is also converted to a chain of mapped mbufs before
computing the checksum.
Submitted by: gallatin (earlier version)
Reviewed by: gallatin, hselasky, rrs
Discussed with: ae, kp (firewalls)
Relnotes: yes
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D20616
2019-06-29 00:48:33 +00:00
|
|
|
error = m_unmappedtouio(m, moff, uio, (int)len);
|
|
|
|
else
|
|
|
|
error = uiomove(mtod(m, char *) + moff,
|
|
|
|
(int)len, uio);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2006-09-22 15:34:16 +00:00
|
|
|
if (error) {
|
|
|
|
/*
|
2006-12-23 21:07:07 +00:00
|
|
|
* The MT_SONAME mbuf has already been removed
|
|
|
|
* from the record, so it is necessary to
|
|
|
|
* remove the data mbufs, if any, to preserve
|
|
|
|
* the invariant in the case of PR_ADDR that
|
|
|
|
* requires MT_SONAME mbufs at the head of
|
|
|
|
* each record.
|
2006-09-22 15:34:16 +00:00
|
|
|
*/
|
2017-04-25 19:54:34 +00:00
|
|
|
if (pr->pr_flags & PR_ATOMIC &&
|
2007-02-03 03:57:45 +00:00
|
|
|
((flags & MSG_PEEK) == 0))
|
2006-09-22 15:34:16 +00:00
|
|
|
(void)sbdroprecord_locked(&so->so_rcv);
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1996-11-29 19:03:42 +00:00
|
|
|
goto release;
|
2006-09-22 15:34:16 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
} else
|
|
|
|
uio->uio_resid -= len;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (len == m->m_len - moff) {
|
|
|
|
if (m->m_flags & M_EOR)
|
|
|
|
flags |= MSG_EOR;
|
|
|
|
if (flags & MSG_PEEK) {
|
|
|
|
m = m->m_next;
|
|
|
|
moff = 0;
|
|
|
|
} else {
|
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
sbfree(&so->so_rcv, m);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp != NULL) {
|
2013-03-29 13:57:55 +00:00
|
|
|
m->m_nextpkt = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
*mp = m;
|
|
|
|
mp = &m->m_next;
|
|
|
|
so->so_rcv.sb_mb = m = m->m_next;
|
2005-06-09 19:59:09 +00:00
|
|
|
*mp = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_rcv.sb_mb = m_free(m);
|
|
|
|
m = so->so_rcv.sb_mb;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-09-06 17:05:11 +00:00
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (flags & MSG_PEEK)
|
|
|
|
moff += len;
|
|
|
|
else {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (mp != NULL) {
|
2014-01-16 13:45:41 +00:00
|
|
|
if (flags & MSG_DONTWAIT) {
|
|
|
|
*mp = m_copym(m, 0, len,
|
|
|
|
M_NOWAIT);
|
|
|
|
if (*mp == NULL) {
|
|
|
|
/*
|
|
|
|
* m_copym() couldn't
|
|
|
|
* allocate an mbuf.
|
|
|
|
* Adjust uio_resid back
|
|
|
|
* (it was adjusted
|
|
|
|
* down by len bytes,
|
|
|
|
* which we didn't end
|
|
|
|
* up "copying" over).
|
|
|
|
*/
|
|
|
|
uio->uio_resid += len;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2014-01-16 13:45:41 +00:00
|
|
|
*mp = m_copym(m, 0, len,
|
|
|
|
M_WAITOK);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2012-12-07 22:13:33 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
}
|
2014-11-14 15:33:40 +00:00
|
|
|
sbcut_locked(&so->so_rcv, len);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (so->so_oobmark) {
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
|
|
|
so->so_oobmark -= len;
|
|
|
|
if (so->so_oobmark == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
so->so_rcv.sb_state |= SBS_RCVATMARK;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
offset += len;
|
|
|
|
if (offset == so->so_oobmark)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (flags & MSG_EOR)
|
|
|
|
break;
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the MSG_WAITALL flag is set (for non-atomic socket), we
|
|
|
|
* must not quit until "uio->uio_resid == 0" or an error
|
|
|
|
* termination. If a signal/timeout occurs, return with a
|
|
|
|
* short count but without error. Keep sockbuf locked
|
|
|
|
* against other readers.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
|
|
|
|
!sosendallatonce(so) && nextrecord == NULL) {
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2012-12-07 22:13:33 +00:00
|
|
|
if (so->so_error ||
|
|
|
|
so->so_rcv.sb_state & SBS_CANTRCVMORE)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
2001-03-16 22:37:06 +00:00
|
|
|
/*
|
2005-06-09 19:59:09 +00:00
|
|
|
* Notify the protocol that some data has been
|
|
|
|
* drained before blocking.
|
2001-03-16 22:37:06 +00:00
|
|
|
*/
|
Chance protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they other occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
if (pr->pr_flags & PR_WANTRCVD) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2001-03-16 22:37:06 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
}
|
2003-10-28 05:47:40 +00:00
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2011-05-29 18:00:50 +00:00
|
|
|
/*
|
|
|
|
* We could receive some data while was notifying
|
|
|
|
* the protocol. Skip blocking in this case.
|
|
|
|
*/
|
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
error = sbwait(&so->so_rcv);
|
|
|
|
if (error) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
goto release;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
}
|
1994-10-02 17:35:40 +00:00
|
|
|
m = so->so_rcv.sb_mb;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
if (m != NULL && pr->pr_flags & PR_ATOMIC) {
|
1994-05-24 10:09:53 +00:00
|
|
|
flags |= MSG_TRUNC;
|
2004-07-10 04:38:06 +00:00
|
|
|
if ((flags & MSG_PEEK) == 0)
|
2005-06-09 19:59:09 +00:00
|
|
|
(void) sbdroprecord_locked(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
if ((flags & MSG_PEEK) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL) {
|
2003-10-28 05:47:40 +00:00
|
|
|
/*
|
|
|
|
* First part is an inline SB_EMPTY_FIXUP(). Second
|
|
|
|
* part makes sure sb_lastrecord is up-to-date if
|
|
|
|
* there is still data in the socket buffer.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_rcv.sb_mb = nextrecord;
|
2003-10-28 05:47:40 +00:00
|
|
|
if (so->so_rcv.sb_mb == NULL) {
|
|
|
|
so->so_rcv.sb_mbtail = NULL;
|
|
|
|
so->so_rcv.sb_lastrecord = NULL;
|
|
|
|
} else if (nextrecord->m_nextpkt == NULL)
|
|
|
|
so->so_rcv.sb_lastrecord = nextrecord;
|
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If soreceive() is being done from the socket callback,
|
|
|
|
* then don't need to generate ACK to peer to update window,
|
|
|
|
* since ACK will be generated on return to TCP.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
2006-07-16 23:09:39 +00:00
|
|
|
if (!(flags & MSG_SOCALLBCK) &&
|
Chance protocol switch method pru_detach() so that it returns void
rather than an error. Detaches do not "fail", they other occur or
the protocol flags SS_PROTOREF to take ownership of the socket.
soclose() no longer looks at so_pcb to see if it's NULL, relying
entirely on the protocol to decide whether it's time to free the
socket or not using SS_PROTOREF. so_pcb is now entirely owned and
managed by the protocol code. Likewise, no longer test so_pcb in
other socket functions, such as soreceive(), which have no business
digging into protocol internals.
Protocol detach routines no longer try to free the socket on detach,
this is performed in the socket code if the protocol permits it.
In rts_detach(), no longer test for rp != NULL in detach, and
likewise in other protocols that don't permit a NULL so_pcb, reduce
the incidence of testing for it during detach.
netinet and netinet6 are not fully updated to this change, which
will be in an upcoming commit. In their current state they may leak
memory or panic.
MFC after: 3 months
2006-04-01 15:42:02 +00:00
|
|
|
(pr->pr_flags & PR_WANTRCVD)) {
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
1996-07-11 16:32:50 +00:00
|
|
|
(*pr->pr_usrreqs->pru_rcvd)(so, flags);
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (orig_resid == uio->uio_resid && orig_resid &&
|
2005-06-09 19:59:09 +00:00
|
|
|
(flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
1995-05-30 08:16:23 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (flagsp != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
*flagsp |= flags;
|
|
|
|
release:
|
|
|
|
sbunlock(&so->so_rcv);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/*
|
|
|
|
* Optimized version of soreceive() for stream (TCP) sockets.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
|
|
|
{
|
|
|
|
int len = 0, error = 0, flags, oresid;
|
|
|
|
struct sockbuf *sb;
|
|
|
|
struct mbuf *m, *n = NULL;
|
|
|
|
|
|
|
|
/* We only do stream sockets. */
|
|
|
|
if (so->so_type != SOCK_STREAM)
|
|
|
|
return (EINVAL);
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (flagsp != NULL)
|
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
2018-06-11 16:31:42 +00:00
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
2009-06-22 23:08:05 +00:00
|
|
|
if (flags & MSG_OOB)
|
|
|
|
return (soreceive_rcvoob(so, uio, flags));
|
|
|
|
if (mp0 != NULL)
|
|
|
|
*mp0 = NULL;
|
|
|
|
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
|
2020-04-27 23:17:19 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
/*
|
|
|
|
* KTLS store TLS records as records with a control message to
|
|
|
|
* describe the framing.
|
|
|
|
*
|
|
|
|
* We check once here before acquiring locks to optimize the
|
|
|
|
* common case.
|
|
|
|
*/
|
|
|
|
if (sb->sb_tls_info != NULL)
|
|
|
|
return (soreceive_generic(so, psa, uio, mp0, controlp,
|
|
|
|
flagsp));
|
|
|
|
#endif
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/* Prevent other readers from entering the socket. */
|
|
|
|
error = sblock(sb, SBLOCKWAIT(flags));
|
|
|
|
if (error)
|
2019-07-02 14:24:42 +00:00
|
|
|
return (error);
|
2009-06-22 23:08:05 +00:00
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
|
2020-04-27 23:17:19 +00:00
|
|
|
#ifdef KERN_TLS
|
|
|
|
if (sb->sb_tls_info != NULL) {
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
sbunlock(sb);
|
|
|
|
return (soreceive_generic(so, psa, uio, mp0, controlp,
|
|
|
|
flagsp));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/* Easy one, no space to copyout anything. */
|
|
|
|
if (uio->uio_resid == 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
oresid = uio->uio_resid;
|
|
|
|
|
2011-07-08 10:50:13 +00:00
|
|
|
/* We will never ever get anything unless we are or were connected. */
|
2009-06-22 23:08:05 +00:00
|
|
|
if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
|
2011-07-08 10:50:13 +00:00
|
|
|
error = ENOTCONN;
|
2009-06-22 23:08:05 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
restart:
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
|
|
|
|
/* Abort if socket has reported problems. */
|
|
|
|
if (so->so_error) {
|
2014-11-12 09:57:15 +00:00
|
|
|
if (sbavail(sb) > 0)
|
2009-06-22 23:08:05 +00:00
|
|
|
goto deliver;
|
|
|
|
if (oresid > uio->uio_resid)
|
|
|
|
goto out;
|
|
|
|
error = so->so_error;
|
|
|
|
if (!(flags & MSG_PEEK))
|
|
|
|
so->so_error = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Door is closed. Deliver what is left, if any. */
|
|
|
|
if (sb->sb_state & SBS_CANTRCVMORE) {
|
2014-11-12 09:57:15 +00:00
|
|
|
if (sbavail(sb) > 0)
|
2009-06-22 23:08:05 +00:00
|
|
|
goto deliver;
|
|
|
|
else
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2011-07-08 10:50:13 +00:00
|
|
|
/* Socket buffer is empty and we shall not block. */
|
2014-11-12 09:57:15 +00:00
|
|
|
if (sbavail(sb) == 0 &&
|
2011-07-08 10:50:13 +00:00
|
|
|
((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
|
|
|
|
error = EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2009-06-22 23:08:05 +00:00
|
|
|
/* Socket buffer got some data that we shall deliver now. */
|
2014-11-12 09:57:15 +00:00
|
|
|
if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
|
2014-12-15 17:52:08 +00:00
|
|
|
((so->so_state & SS_NBIO) ||
|
2009-06-22 23:08:05 +00:00
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO)) ||
|
2014-11-12 09:57:15 +00:00
|
|
|
sbavail(sb) >= sb->sb_lowat ||
|
|
|
|
sbavail(sb) >= uio->uio_resid ||
|
|
|
|
sbavail(sb) >= sb->sb_hiwat) ) {
|
2009-06-22 23:08:05 +00:00
|
|
|
goto deliver;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* On MSG_WAITALL we must wait until all data or error arrives. */
|
|
|
|
if ((flags & MSG_WAITALL) &&
|
2014-11-12 09:57:15 +00:00
|
|
|
(sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
|
2009-06-22 23:08:05 +00:00
|
|
|
goto deliver;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait and block until (more) data comes in.
|
|
|
|
* NB: Drops the sockbuf lock during wait.
|
|
|
|
*/
|
|
|
|
error = sbwait(sb);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
goto restart;
|
|
|
|
|
|
|
|
deliver:
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2014-11-12 09:57:15 +00:00
|
|
|
KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
|
2009-06-22 23:08:05 +00:00
|
|
|
KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
|
|
|
|
|
|
|
|
/* Statistics. */
|
|
|
|
if (uio->uio_td)
|
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
|
|
|
|
|
|
|
/* Fill uio until full or current end of socket buffer is reached. */
|
2014-11-12 09:57:15 +00:00
|
|
|
len = min(uio->uio_resid, sbavail(sb));
|
2009-06-22 23:08:05 +00:00
|
|
|
if (mp0 != NULL) {
|
|
|
|
/* Dequeue as many mbufs as possible. */
|
|
|
|
if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
|
2012-10-29 12:31:12 +00:00
|
|
|
if (*mp0 == NULL)
|
|
|
|
*mp0 = sb->sb_mb;
|
|
|
|
else
|
|
|
|
m_cat(*mp0, sb->sb_mb);
|
|
|
|
for (m = sb->sb_mb;
|
2009-06-22 23:08:05 +00:00
|
|
|
m != NULL && m->m_len <= len;
|
|
|
|
m = m->m_next) {
|
Merge from projects/sendfile:
o Introduce a notion of "not ready" mbufs in socket buffers. These
mbufs are now being populated by some I/O in background and are
referenced outside. This forces following implications:
- An mbuf which is "not ready" can't be taken out of the buffer.
- An mbuf that is behind a "not ready" in the queue neither.
- If sockbet buffer is flushed, then "not ready" mbufs shouln't be
freed.
o In struct sockbuf the sb_cc field is split into sb_ccc and sb_acc.
The sb_ccc stands for ""claimed character count", or "committed
character count". And the sb_acc is "available character count".
Consumers of socket buffer API shouldn't already access them directly,
but use sbused() and sbavail() respectively.
o Not ready mbufs are marked with M_NOTREADY, and ready but blocked ones
with M_BLOCKED.
o New field sb_fnrdy points to the first not ready mbuf, to avoid linear
search.
o New function sbready() is provided to activate certain amount of mbufs
in a socket buffer.
A special note on SCTP:
SCTP has its own sockbufs. Unfortunately, FreeBSD stack doesn't yet
allow protocol specific sockbufs. Thus, SCTP does some hacks to make
itself compatible with FreeBSD: it manages sockbufs on its own, but keeps
sb_cc updated to inform the stack of amount of data in them. The new
notion of "not ready" data isn't supported by SCTP. Instead, only a
mechanical substitute is done: s/sb_cc/sb_ccc/.
A proper solution would be to take away struct sockbuf from struct
socket and allow protocols to implement their own socket buffers, like
SCTP already does. This was discussed with rrs@.
Sponsored by: Netflix
Sponsored by: Nginx, Inc.
2014-11-30 12:52:33 +00:00
|
|
|
KASSERT(!(m->m_flags & M_NOTAVAIL),
|
|
|
|
("%s: m %p not available", __func__, m));
|
2009-06-22 23:08:05 +00:00
|
|
|
len -= m->m_len;
|
|
|
|
uio->uio_resid -= m->m_len;
|
|
|
|
sbfree(sb, m);
|
|
|
|
n = m;
|
|
|
|
}
|
2012-10-29 12:31:12 +00:00
|
|
|
n->m_next = NULL;
|
2009-06-22 23:08:05 +00:00
|
|
|
sb->sb_mb = m;
|
2012-10-29 12:31:12 +00:00
|
|
|
sb->sb_lastrecord = sb->sb_mb;
|
2009-06-22 23:08:05 +00:00
|
|
|
if (sb->sb_mb == NULL)
|
|
|
|
SB_EMPTY_FIXUP(sb);
|
|
|
|
}
|
|
|
|
/* Copy the remainder. */
|
|
|
|
if (len > 0) {
|
|
|
|
KASSERT(sb->sb_mb != NULL,
|
|
|
|
("%s: len > 0 && sb->sb_mb empty", __func__));
|
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
|
2009-06-22 23:08:05 +00:00
|
|
|
if (m == NULL)
|
|
|
|
len = 0; /* Don't flush data from sockbuf. */
|
|
|
|
else
|
2012-10-29 12:31:12 +00:00
|
|
|
uio->uio_resid -= len;
|
2009-06-22 23:08:05 +00:00
|
|
|
if (*mp0 != NULL)
|
2012-10-29 12:31:12 +00:00
|
|
|
m_cat(*mp0, m);
|
2009-06-22 23:08:05 +00:00
|
|
|
else
|
|
|
|
*mp0 = m;
|
|
|
|
if (*mp0 == NULL) {
|
|
|
|
error = ENOBUFS;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* NB: Must unlock socket buffer as uiomove may sleep. */
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
error = m_mbuftouio(uio, sb->sb_mb, len);
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(sb);
|
|
|
|
SBLASTMBUFCHK(sb);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the delivered data from the socket buffer unless we
|
|
|
|
* were only peeking.
|
|
|
|
*/
|
|
|
|
if (!(flags & MSG_PEEK)) {
|
|
|
|
if (len > 0)
|
|
|
|
sbdrop_locked(sb, len);
|
|
|
|
|
|
|
|
/* Notify protocol that we drained some data. */
|
|
|
|
if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
|
|
|
|
(((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
|
|
|
|
!(flags & MSG_SOCALLBCK))) {
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
2009-06-22 23:08:05 +00:00
|
|
|
(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For MSG_WAITALL we may have to loop again and wait for
|
|
|
|
* more data to come in.
|
|
|
|
*/
|
|
|
|
if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
|
|
|
|
goto restart;
|
|
|
|
out:
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
SBLASTRECORDCHK(sb);
|
|
|
|
SBLASTMBUFCHK(sb);
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
sbunlock(sb);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
/*
|
2008-10-01 13:26:52 +00:00
|
|
|
* Optimized version of soreceive() for simple datagram cases from userspace.
|
|
|
|
* Unlike in the stream case, we're able to drop a datagram if copyout()
|
|
|
|
* fails, and because we handle datagrams atomically, we don't need to use a
|
|
|
|
* sleep lock to prevent I/O interlacing.
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
|
|
|
{
|
|
|
|
struct mbuf *m, *m2;
|
2012-02-21 01:05:12 +00:00
|
|
|
int flags, error;
|
|
|
|
ssize_t len;
|
2008-07-02 23:23:27 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
|
|
|
struct mbuf *nextrecord;
|
|
|
|
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = NULL;
|
|
|
|
if (controlp != NULL)
|
|
|
|
*controlp = NULL;
|
|
|
|
if (flagsp != NULL)
|
|
|
|
flags = *flagsp &~ MSG_EOR;
|
|
|
|
else
|
|
|
|
flags = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For any complicated cases, fall back to the full
|
|
|
|
* soreceive_generic().
|
|
|
|
*/
|
|
|
|
if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
|
|
|
|
return (soreceive_generic(so, psa, uio, mp0, controlp,
|
|
|
|
flagsp));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enforce restrictions on use.
|
|
|
|
*/
|
|
|
|
KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
|
|
|
|
("soreceive_dgram: wantrcvd"));
|
|
|
|
KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
|
|
|
|
KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
|
|
|
|
("soreceive_dgram: SBS_RCVATMARK"));
|
|
|
|
KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
|
|
|
|
("soreceive_dgram: P_CONNREQUIRED"));
|
|
|
|
|
|
|
|
/*
|
2008-10-01 13:26:52 +00:00
|
|
|
* Loop blocking while waiting for a datagram.
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
2008-10-01 13:26:52 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
while ((m = so->so_rcv.sb_mb) == NULL) {
|
2014-11-12 09:57:15 +00:00
|
|
|
KASSERT(sbavail(&so->so_rcv) == 0,
|
|
|
|
("soreceive_dgram: sb_mb NULL but sbavail %u",
|
|
|
|
sbavail(&so->so_rcv)));
|
2008-07-02 23:23:27 +00:00
|
|
|
if (so->so_error) {
|
|
|
|
error = so->so_error;
|
|
|
|
so->so_error = 0;
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
return (error);
|
|
|
|
}
|
2008-10-07 20:57:55 +00:00
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
|
|
|
|
uio->uio_resid == 0) {
|
2008-07-02 23:23:27 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if ((so->so_state & SS_NBIO) ||
|
|
|
|
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
return (EWOULDBLOCK);
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
error = sbwait(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
if (error) {
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2008-07-02 23:23:27 +00:00
|
|
|
return (error);
|
2008-10-01 13:26:52 +00:00
|
|
|
}
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
2008-10-01 13:26:52 +00:00
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
if (uio->uio_td)
|
|
|
|
uio->uio_td->td_ru.ru_msgrcv++;
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
nextrecord = m->m_nextpkt;
|
|
|
|
if (nextrecord == NULL) {
|
|
|
|
KASSERT(so->so_rcv.sb_lastrecord == m,
|
2008-09-30 18:44:26 +00:00
|
|
|
("soreceive_dgram: lastrecord != m"));
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
|
|
|
|
("soreceive_dgram: m_nextpkt != nextrecord"));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pull 'm' and its chain off the front of the packet queue.
|
|
|
|
*/
|
|
|
|
so->so_rcv.sb_mb = NULL;
|
|
|
|
sockbuf_pushsync(&so->so_rcv, nextrecord);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk 'm's chain and free that many bytes from the socket buffer.
|
|
|
|
*/
|
|
|
|
for (m2 = m; m2 != NULL; m2 = m2->m_next)
|
|
|
|
sbfree(&so->so_rcv, m2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do a few last checks before we let go of the lock.
|
|
|
|
*/
|
|
|
|
SBLASTRECORDCHK(&so->so_rcv);
|
|
|
|
SBLASTMBUFCHK(&so->so_rcv);
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
|
2008-10-01 19:14:05 +00:00
|
|
|
if (pr->pr_flags & PR_ADDR) {
|
|
|
|
KASSERT(m->m_type == MT_SONAME,
|
|
|
|
("m->m_type == %d", m->m_type));
|
|
|
|
if (psa != NULL)
|
|
|
|
*psa = sodupsockaddr(mtod(m, struct sockaddr *),
|
|
|
|
M_NOWAIT);
|
|
|
|
m = m_free(m);
|
|
|
|
}
|
|
|
|
if (m == NULL) {
|
|
|
|
/* XXXRW: Can this happen? */
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2008-07-02 23:23:27 +00:00
|
|
|
/*
|
|
|
|
* Packet to copyout() is now in 'm' and it is disconnected from the
|
|
|
|
* queue.
|
|
|
|
*
|
|
|
|
* Process one or more MT_CONTROL mbufs present before any data mbufs
|
2008-10-01 13:26:52 +00:00
|
|
|
* in the first mbuf chain on the socket buffer. We call into the
|
|
|
|
* protocol to perform externalization (or freeing if controlp ==
|
2015-02-23 13:41:35 +00:00
|
|
|
* NULL). In some cases there can be only MT_CONTROL mbufs without
|
|
|
|
* MT_DATA mbufs.
|
2008-07-02 23:23:27 +00:00
|
|
|
*/
|
|
|
|
if (m->m_type == MT_CONTROL) {
|
|
|
|
struct mbuf *cm = NULL, *cmn;
|
|
|
|
struct mbuf **cme = &cm;
|
|
|
|
|
|
|
|
do {
|
|
|
|
m2 = m->m_next;
|
|
|
|
m->m_next = NULL;
|
|
|
|
*cme = m;
|
|
|
|
cme = &(*cme)->m_next;
|
|
|
|
m = m2;
|
|
|
|
} while (m != NULL && m->m_type == MT_CONTROL);
|
|
|
|
while (cm != NULL) {
|
|
|
|
cmn = cm->m_next;
|
|
|
|
cm->m_next = NULL;
|
|
|
|
if (pr->pr_domain->dom_externalize != NULL) {
|
|
|
|
error = (*pr->pr_domain->dom_externalize)
|
2013-03-19 20:58:17 +00:00
|
|
|
(cm, controlp, flags);
|
2008-07-02 23:23:27 +00:00
|
|
|
} else if (controlp != NULL)
|
|
|
|
*controlp = cm;
|
|
|
|
else
|
|
|
|
m_freem(cm);
|
|
|
|
if (controlp != NULL) {
|
|
|
|
while (*controlp != NULL)
|
|
|
|
controlp = &(*controlp)->m_next;
|
|
|
|
}
|
|
|
|
cm = cmn;
|
|
|
|
}
|
|
|
|
}
|
2015-02-23 15:24:43 +00:00
|
|
|
KASSERT(m == NULL || m->m_type == MT_DATA,
|
|
|
|
("soreceive_dgram: !data"));
|
2008-07-02 23:23:27 +00:00
|
|
|
while (m != NULL && uio->uio_resid > 0) {
|
|
|
|
len = uio->uio_resid;
|
|
|
|
if (len > m->m_len)
|
|
|
|
len = m->m_len;
|
|
|
|
error = uiomove(mtod(m, char *), (int)len, uio);
|
|
|
|
if (error) {
|
|
|
|
m_freem(m);
|
|
|
|
return (error);
|
|
|
|
}
|
2010-08-07 17:57:58 +00:00
|
|
|
if (len == m->m_len)
|
|
|
|
m = m_free(m);
|
|
|
|
else {
|
|
|
|
m->m_data += len;
|
|
|
|
m->m_len -= len;
|
|
|
|
}
|
2008-07-02 23:23:27 +00:00
|
|
|
}
|
2015-02-23 13:41:35 +00:00
|
|
|
if (m != NULL) {
|
2008-07-02 23:23:27 +00:00
|
|
|
flags |= MSG_TRUNC;
|
2015-02-23 13:41:35 +00:00
|
|
|
m_freem(m);
|
|
|
|
}
|
2008-07-02 23:23:27 +00:00
|
|
|
if (flagsp != NULL)
|
|
|
|
*flagsp |= flags;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used universally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
|
|
|
|
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
{
|
2011-02-16 21:29:13 +00:00
|
|
|
int error;
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2017-06-15 20:11:29 +00:00
|
|
|
if (!SOLISTENING(so))
|
|
|
|
error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
|
|
|
|
mp0, controlp, flagsp));
|
|
|
|
else
|
|
|
|
error = ENOTCONN;
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
soshutdown(struct socket *so, int how)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct protosw *pr = so->so_proto;
|
2017-04-14 17:23:28 +00:00
|
|
|
int error, soerror_enotconn;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2001-02-27 13:48:07 +00:00
|
|
|
if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
|
|
|
|
return (EINVAL);
|
2017-04-14 17:23:28 +00:00
|
|
|
|
|
|
|
soerror_enotconn = 0;
|
Make shutdown() return ENOTCONN as required by POSIX, part deux.
Summary:
Back in 2005, maxim@ attempted to fix shutdown() to return ENOTCONN in case the socket was not connected (r150152). This had to be rolled back (r150155), as it broke some of the existing programs that depend on this behavior. I reapplied this change on my system and indeed, syslogd failed to start up. I fixed this back in February (279016) and MFC'ed it to the supported stable branches. Apart from that, things seem to work out all right.
Since at least Linux and Mac OS X do the right thing, I'd like to go ahead and give this another try. To keep old copies of syslogd working, only start returning ENOTCONN for recent binaries.
I took a look at the XNU sources and they seem to test against both SS_ISCONNECTED, SS_ISCONNECTING and SS_ISDISCONNECTING, instead of just SS_ISCONNECTED. That seams reasonable, so let's do the same.
Test Plan:
This issue was uncovered while writing tests for shutdown() in CloudABI:
https://github.com/NuxiNL/cloudlibc/blob/master/src/libc/sys/socket/shutdown_test.c#L26
Reviewers: glebius, rwatson, #manpages, gnn, #network
Reviewed By: gnn, #network
Subscribers: bms, mjg, imp
Differential Revision: https://reviews.freebsd.org/D3039
2015-07-27 13:17:57 +00:00
|
|
|
if ((so->so_state &
|
2017-04-14 17:23:28 +00:00
|
|
|
(SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
|
|
|
|
/*
|
|
|
|
* POSIX mandates us to return ENOTCONN when shutdown(2) is
|
|
|
|
* invoked on a datagram sockets, however historically we would
|
|
|
|
* actually tear socket down. This is known to be leveraged by
|
|
|
|
* some applications to unblock process waiting in recvXXX(2)
|
|
|
|
* by other process that it shares that socket with. Try to meet
|
|
|
|
* both backward-compatibility and POSIX requirements by forcing
|
|
|
|
* ENOTCONN but still asking protocol to perform pru_shutdown().
|
|
|
|
*/
|
2018-10-03 17:40:04 +00:00
|
|
|
if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
|
2017-04-14 17:23:28 +00:00
|
|
|
return (ENOTCONN);
|
|
|
|
soerror_enotconn = 1;
|
|
|
|
}
|
2011-02-16 21:29:13 +00:00
|
|
|
|
2018-10-03 17:40:04 +00:00
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
if (how != SHUT_WR) {
|
|
|
|
SOLISTEN_LOCK(so);
|
|
|
|
so->so_error = ECONNABORTED;
|
|
|
|
solisten_wakeup(so); /* unlocks so */
|
|
|
|
}
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
2012-12-07 22:13:33 +00:00
|
|
|
if (pr->pr_usrreqs->pru_flush != NULL)
|
|
|
|
(*pr->pr_usrreqs->pru_flush)(so, how);
|
2001-02-27 13:48:07 +00:00
|
|
|
if (how != SHUT_WR)
|
1994-05-24 10:09:53 +00:00
|
|
|
sorflush(so);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
if (how != SHUT_RD) {
|
|
|
|
error = (*pr->pr_usrreqs->pru_shutdown)(so);
|
2013-04-30 15:06:30 +00:00
|
|
|
wakeup(&so->so_timeo);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
CURVNET_RESTORE();
|
2017-04-14 17:23:28 +00:00
|
|
|
return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
|
Change the curvnet variable from a global const struct vnet *,
previously always pointing to the default vnet context, to a
dynamically changing thread-local one. The currvnet context
should be set on entry to networking code via CURVNET_SET() macros,
and reverted to previous state via CURVNET_RESTORE(). Recursions
on curvnet are permitted, though strongly discuouraged.
This change should have no functional impact on nooptions VIMAGE
kernel builds, where CURVNET_* macros expand to whitespace.
The curthread->td_vnet (aka curvnet) variable's purpose is to be an
indicator of the vnet context in which the current network-related
operation takes place, in case we cannot deduce the current vnet
context from any other source, such as by looking at mbuf's
m->m_pkthdr.rcvif->if_vnet, sockets's so->so_vnet etc. Moreover, so
far curvnet has turned out to be an invaluable consistency checking
aid: it helps to catch cases when sockets, ifnets or any other
vnet-aware structures may have leaked from one vnet to another.
The exact placement of the CURVNET_SET() / CURVNET_RESTORE() macros
was a result of an empirical iterative process, whith an aim to
reduce recursions on CURVNET_SET() to a minimum, while still reducing
the scope of CURVNET_SET() to networking only operations - the
alternative would be calling CURVNET_SET() on each system call entry.
In general, curvnet has to be set in three typicall cases: when
processing socket-related requests from userspace or from within the
kernel; when processing inbound traffic flowing from device drivers
to upper layers of the networking stack, and when executing
timer-driven networking functions.
This change also introduces a DDB subcommand to show the list of all
vnet instances.
Approved by: julian (mentor)
2009-05-05 10:56:12 +00:00
|
|
|
}
|
2013-04-30 15:06:30 +00:00
|
|
|
wakeup(&so->so_timeo);
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
2017-04-14 17:23:28 +00:00
|
|
|
|
2018-10-03 17:40:04 +00:00
|
|
|
done:
|
2017-04-14 17:23:28 +00:00
|
|
|
return (soerror_enotconn ? ENOTCONN : 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sorflush(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct sockbuf *sb = &so->so_rcv;
|
|
|
|
struct protosw *pr = so->so_proto;
|
2015-07-14 02:00:50 +00:00
|
|
|
struct socket aso;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
VNET_SO_ASSERT(so);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2008-02-04 12:25:13 +00:00
|
|
|
* In order to avoid calling dom_dispose with the socket buffer mutex
|
|
|
|
* held, and in order to generally avoid holding the lock for a long
|
|
|
|
* time, we make a copy of the socket buffer and clear the original
|
|
|
|
* (except locks, state). The new socket buffer copy won't have
|
|
|
|
* initialized locks so we can only call routines that won't use or
|
|
|
|
* assert those locks.
|
|
|
|
*
|
Correct two problems relating to sorflush(), which is called to flush
read socket buffers in shutdown() and close():
- Call socantrcvmore() before sblock() to dislodge any threads that
might be sleeping (potentially indefinitely) while holding sblock(),
such as a thread blocked in recv().
- Flag the sblock() call as non-interruptible so that a signal
delivered to the thread calling sorflush() doesn't cause sblock() to
fail. The sblock() is required to ensure that all other socket
consumer threads have, in fact, left, and do not enter, the socket
buffer until we're done flushin it.
To implement the latter, change the 'flags' argument to sblock() to
accept two flags, SBL_WAIT and SBL_NOINTR, rather than one M_WAITOK
flag. When SBL_NOINTR is set, it forces a non-interruptible sx
acquisition, regardless of the setting of the disposition of SB_NOINTR
on the socket buffer; without this change it would be possible for
another thread to clear SB_NOINTR between when the socket buffer mutex
is released and sblock() is invoked.
Reviewed by: bz, kmacy
Reported by: Jos Backus <jos at catnook dot com>
2008-01-31 08:22:24 +00:00
|
|
|
* Dislodge threads currently blocked in receive and wait to acquire
|
|
|
|
* a lock against other simultaneous readers before clearing the
|
|
|
|
* socket buffer. Don't let our acquire be interrupted by a signal
|
|
|
|
* despite any existing socket disposition on interruptable waiting.
|
|
|
|
*/
|
|
|
|
socantrcvmore(so);
|
|
|
|
(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Invalidate/clear most of the sockbuf structure, but leave selinfo
|
|
|
|
* and mutex data unchanged.
|
2005-06-09 19:59:09 +00:00
|
|
|
*/
|
2007-05-03 14:42:42 +00:00
|
|
|
SOCKBUF_LOCK(sb);
|
2015-07-14 02:00:50 +00:00
|
|
|
bzero(&aso, sizeof(aso));
|
|
|
|
aso.so_pcb = so->so_pcb;
|
|
|
|
bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
|
2005-06-09 19:59:09 +00:00
|
|
|
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
|
|
|
|
bzero(&sb->sb_startzero,
|
|
|
|
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
2007-05-03 14:42:42 +00:00
|
|
|
sbunlock(sb);
|
2005-06-09 19:59:09 +00:00
|
|
|
|
2008-02-04 12:25:13 +00:00
|
|
|
/*
|
2015-07-14 02:00:50 +00:00
|
|
|
* Dispose of special rights and flush the copied socket. Don't call
|
|
|
|
* any unsafe routines (that rely on locks being initialized) on aso.
|
2008-02-04 12:25:13 +00:00
|
|
|
*/
|
2005-06-09 19:59:09 +00:00
|
|
|
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
|
2015-07-14 02:00:50 +00:00
|
|
|
(*pr->pr_domain->dom_dispose)(&aso);
|
|
|
|
sbrelease_internal(&aso.so_rcv, so);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2014-08-18 23:45:40 +00:00
|
|
|
/*
|
|
|
|
* Wrapper for Socket established helper hook.
|
|
|
|
* Parameters: socket, context of the hook point, hook id.
|
|
|
|
*/
|
|
|
|
static int inline
|
|
|
|
hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
|
|
|
|
{
|
|
|
|
struct socket_hhook_data hhook_data = {
|
|
|
|
.so = so,
|
|
|
|
.hctx = hctx,
|
2014-09-08 09:04:22 +00:00
|
|
|
.m = NULL,
|
|
|
|
.status = 0
|
2014-08-18 23:45:40 +00:00
|
|
|
};
|
|
|
|
|
2014-09-08 09:04:22 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
|
|
|
HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
|
|
|
|
CURVNET_RESTORE();
|
2014-08-18 23:45:40 +00:00
|
|
|
|
|
|
|
/* Ugly but needed, since hhooks return void for now */
|
|
|
|
return (hhook_data.status);
|
|
|
|
}
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Perhaps this routine, and sooptcopyout(), below, ought to come in an
|
|
|
|
* additional variant to handle the case where the option value needs to be
|
|
|
|
* some kind of integer, but not a specific size. In addition to their use
|
|
|
|
* here, these functions are also called by the protocol-level pr_ctloutput()
|
|
|
|
* routines.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-08-23 03:07:17 +00:00
|
|
|
size_t valsize;
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* If the user gives us more than we wanted, we ignore it, but if we
|
|
|
|
* don't get the minimum length the caller wants, we return EINVAL.
|
|
|
|
* On success, sopt->sopt_valsize is set to however much we actually
|
|
|
|
* retrieved.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
|
|
|
if ((valsize = sopt->sopt_valsize) < minlen)
|
|
|
|
return EINVAL;
|
2016-11-22 18:31:43 +00:00
|
|
|
if (valsize > len)
|
1998-08-23 03:07:17 +00:00
|
|
|
sopt->sopt_valsize = valsize = len;
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL)
|
1998-08-23 03:07:17 +00:00
|
|
|
return (copyin(sopt->sopt_val, buf, valsize));
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
bcopy(sopt->sopt_val, buf, valsize);
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1998-08-23 03:07:17 +00:00
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Kernel version of setsockopt(2).
|
|
|
|
*
|
2005-06-09 19:59:09 +00:00
|
|
|
* XXX: optlen is size_t, not socklen_t
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
so_setsockopt(struct socket *so, int level, int optname, void *optval,
|
|
|
|
size_t optlen)
|
|
|
|
{
|
|
|
|
struct sockopt sopt;
|
|
|
|
|
|
|
|
sopt.sopt_level = level;
|
|
|
|
sopt.sopt_name = optname;
|
|
|
|
sopt.sopt_dir = SOPT_SET;
|
|
|
|
sopt.sopt_val = optval;
|
|
|
|
sopt.sopt_valsize = optlen;
|
|
|
|
sopt.sopt_td = NULL;
|
|
|
|
return (sosetopt(so, &sopt));
|
|
|
|
}
|
|
|
|
|
1998-08-23 03:07:17 +00:00
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sosetopt(struct socket *so, struct sockopt *sopt)
|
1998-08-23 03:07:17 +00:00
|
|
|
{
|
|
|
|
int error, optval;
|
|
|
|
struct linger l;
|
|
|
|
struct timeval tv;
|
2013-09-01 23:34:53 +00:00
|
|
|
sbintime_t val;
|
2010-11-12 13:02:26 +00:00
|
|
|
uint32_t val32;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
|
|
|
struct mac extmac;
|
|
|
|
#endif
|
1998-08-23 03:07:17 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = 0;
|
|
|
|
if (sopt->sopt_level != SOL_SOCKET) {
|
2019-01-10 00:25:12 +00:00
|
|
|
if (so->so_proto->pr_ctloutput != NULL)
|
2011-02-16 21:29:13 +00:00
|
|
|
error = (*so->so_proto->pr_ctloutput)(so, sopt);
|
2019-01-10 00:25:12 +00:00
|
|
|
else
|
|
|
|
error = ENOPROTOOPT;
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
2000-11-20 01:35:25 +00:00
|
|
|
case SO_ACCEPTFILTER:
|
2017-06-02 17:49:21 +00:00
|
|
|
error = accept_filt_setopt(so, sopt);
|
2000-11-20 01:35:25 +00:00
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
break;
|
2014-07-26 19:27:34 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_LINGER:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
|
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2019-07-14 21:44:18 +00:00
|
|
|
if (l.l_linger < 0 ||
|
|
|
|
l.l_linger > USHRT_MAX ||
|
|
|
|
l.l_linger > (INT_MAX / hz)) {
|
|
|
|
error = EDOM;
|
|
|
|
goto bad;
|
|
|
|
}
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
so->so_linger = l.l_linger;
|
|
|
|
if (l.l_onoff)
|
|
|
|
so->so_options |= SO_LINGER;
|
|
|
|
else
|
|
|
|
so->so_options &= ~SO_LINGER;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_DEBUG:
|
|
|
|
case SO_KEEPALIVE:
|
|
|
|
case SO_DONTROUTE:
|
|
|
|
case SO_USELOOPBACK:
|
|
|
|
case SO_BROADCAST:
|
|
|
|
case SO_REUSEADDR:
|
|
|
|
case SO_REUSEPORT:
|
2018-06-06 15:45:57 +00:00
|
|
|
case SO_REUSEPORT_LB:
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_OOBINLINE:
|
1996-05-09 20:15:26 +00:00
|
|
|
case SO_TIMESTAMP:
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_BINTIME:
|
|
|
|
case SO_NOSIGPIPE:
|
2008-10-17 01:25:45 +00:00
|
|
|
case SO_NO_DDP:
|
|
|
|
case SO_NO_OFFLOAD:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (optval)
|
|
|
|
so->so_options |= sopt->sopt_name;
|
1994-05-24 10:09:53 +00:00
|
|
|
else
|
1998-08-23 03:07:17 +00:00
|
|
|
so->so_options &= ~sopt->sopt_name;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
case SO_SETFIB:
|
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
2012-04-03 18:38:00 +00:00
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
|
2012-02-03 11:00:53 +00:00
|
|
|
if (optval < 0 || optval >= rt_numfibs) {
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
2012-02-26 13:51:05 +00:00
|
|
|
if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
|
2012-02-03 11:00:53 +00:00
|
|
|
(so->so_proto->pr_domain->dom_family == PF_INET6) ||
|
2012-04-03 18:38:00 +00:00
|
|
|
(so->so_proto->pr_domain->dom_family == PF_ROUTE)))
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as a EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
so->so_fibnum = optval;
|
2012-04-03 18:38:00 +00:00
|
|
|
else
|
Add code to allow the system to handle multiple routing tables.
This particular implementation is designed to be fully backwards compatible
and to be MFC-able to 7.x (and 6.x)
Currently the only protocol that can make use of the multiple tables is IPv4
Similar functionality exists in OpenBSD and Linux.
From my notes:
-----
One thing where FreeBSD has been falling behind, and which by chance I
have some time to work on is "policy based routing", which allows
different
packet streams to be routed by more than just the destination address.
Constraints:
------------
I want to make some form of this available in the 6.x tree
(and by extension 7.x) , but FreeBSD in general needs it so I might as
well do it in -current and back port the portions I need.
One of the ways that this can be done is to have the ability to
instantiate multiple kernel routing tables (which I will now
refer to as "Forwarding Information Bases" or "FIBs" for political
correctness reasons). Which FIB a particular packet uses to make
the next hop decision can be decided by a number of mechanisms.
The policies these mechanisms implement are the "Policies" referred
to in "Policy based routing".
One of the constraints I have if I try to back port this work to
6.x is that it must be implemented as an EXTENSION to the existing
ABIs in 6.x so that third party applications do not need to be
recompiled in timespan of the branch.
This first version will not have some of the bells and whistles that
will come with later versions. It will, for example, be limited to 16
tables in the first commit.
Implementation method, Compatible version. (part 1)
-------------------------------
For this reason I have implemented a "sufficient subset" of a
multiple routing table solution in Perforce, and back-ported it
to 6.x. (also in Perforce though not always caught up with what I
have done in -current/P4). The subset allows a number of FIBs
to be defined at compile time (8 is sufficient for my purposes in 6.x)
and implements the changes needed to allow IPV4 to use them. I have not
done the changes for ipv6 simply because I do not need it, and I do not
have enough knowledge of ipv6 (e.g. neighbor discovery) needed to do it.
Other protocol families are left untouched and should there be
users with proprietary protocol families, they should continue to work
and be oblivious to the existence of the extra FIBs.
To understand how this is done, one must know that the current FIB
code starts everything off with a single dimensional array of
pointers to FIB head structures (One per protocol family), each of
which in turn points to the trie of routes available to that family.
The basic change in the ABI compatible version of the change is to
extend that array to be a 2 dimensional array, so that
instead of protocol family X looking at rt_tables[X] for the
table it needs, it looks at rt_tables[Y][X] when for all
protocol families except ipv4 Y is always 0.
Code that is unaware of the change always just sees the first row
of the table, which of course looks just like the one dimensional
array that existed before.
The entry points rtrequest(), rtalloc(), rtalloc1(), rtalloc_ign()
are all maintained, but refer only to the first row of the array,
so that existing callers in proprietary protocols can continue to
do the "right thing".
Some new entry points are added, for the exclusive use of ipv4 code
called in_rtrequest(), in_rtalloc(), in_rtalloc1() and in_rtalloc_ign(),
which have an extra argument which refers the code to the correct row.
In addition, there are some new entry points (currently called
rtalloc_fib() and friends) that check the Address family being
looked up and call either rtalloc() (and friends) if the protocol
is not IPv4 forcing the action to row 0 or to the appropriate row
if it IS IPv4 (and that info is available). These are for calling
from code that is not specific to any particular protocol. The way
these are implemented would change in the non ABI preserving code
to be added later.
One feature of the first version of the code is that for ipv4,
the interface routes show up automatically on all the FIBs, so
that no matter what FIB you select you always have the basic
direct attached hosts available to you. (rtinit() does this
automatically).
You CAN delete an interface route from one FIB should you want
to but by default it's there. ARP information is also available
in each FIB. It's assumed that the same machine would have the
same MAC address, regardless of which FIB you are using to get
to it.
This brings us as to how the correct FIB is selected for an outgoing
IPV4 packet.
Firstly, all packets have a FIB associated with them. if nothing
has been done to change it, it will be FIB 0. The FIB is changed
in the following ways.
Packets fall into one of a number of classes.
1/ locally generated packets, coming from a socket/PCB.
Such packets select a FIB from a number associated with the
socket/PCB. This in turn is inherited from the process,
but can be changed by a socket option. The process in turn
inherits it on fork. I have written a utility called setfib
that acts a bit like nice..
setfib -3 ping target.example.com # will use fib 3 for ping.
It is an obvious extension to make it a property of a jail
but I have not done so. It can be achieved by combining the setfib and
jail commands.
2/ packets received on an interface for forwarding.
By default these packets would use table 0,
(or possibly a number settable in a sysctl(not yet)).
but prior to routing the firewall can inspect them (see below).
(possibly in the future you may be able to associate a FIB
with packets received on an interface.. An ifconfig arg, but not yet.)
3/ packets inspected by a packet classifier, which can arbitrarily
associate a fib with it on a packet by packet basis.
A fib assigned to a packet by a packet classifier
(such as ipfw) would over-ride a fib associated by
a more default source. (such as cases 1 or 2).
4/ a tcp listen socket associated with a fib will generate
accept sockets that are associated with that same fib.
5/ Packets generated in response to some other packet (e.g. reset
or icmp packets). These should use the FIB associated with the
packet being responded to.
6/ Packets generated during encapsulation.
gif, tun and other tunnel interfaces will encapsulate using the FIB
that was in effect with the process that set up the tunnel.
thus setfib 1 ifconfig gif0 [tunnel instructions]
will set the fib for the tunnel to use to be fib 1.
Routing messages would be associated with their
process, and thus select one FIB or another.
messages from the kernel would be associated with the fib they
refer to and would only be received by a routing socket associated
with that fib. (not yet implemented)
In addition Netstat has been edited to be able to cope with the
fact that the array is now 2 dimensional. (It looks in system
memory using libkvm (!)). Old versions of netstat see only the first FIB.
In addition two sysctls are added to give:
a) the number of FIBs compiled in (active)
b) the default FIB of the calling process.
Early testing experience:
-------------------------
Basically our (IronPort's) appliance does this functionality already
using ipfw fwd but that method has some drawbacks.
For example,
It can't fully simulate a routing table because it can't influence the
socket's choice of local address when a connect() is done.
Testing during the generating of these changes has been
remarkably smooth so far. Multiple tables have co-existed
with no notable side effects, and packets have been routed
accordingly.
ipfw has grown 2 new keywords:
setfib N ip from any to any
count ip from any to any fib N
In pf there seems to be a requirement to be able to give symbolic names to the
fibs but I do not have that capacity. I am not sure if it is required.
SCTP has interestingly enough built in support for this, called VRFs
in Cisco parlance. it will be interesting to see how that handles it
when it suddenly actually does something.
Where to next:
--------------------
After committing the ABI compatible version and MFCing it, I'd
like to proceed in a forward direction in -current. this will
result in some roto-tilling in the routing code.
Firstly: the current code's idea of having a separate tree per
protocol family, all of the same format, and pointed to by the
1 dimensional array is a bit silly. Especially when one considers that
there is code that makes assumptions about every protocol having the
same internal structures there. Some protocols don't WANT that
sort of structure. (for example the whole idea of a netmask is foreign
to appletalk). This needs to be made opaque to the external code.
My suggested first change is to add routing method pointers to the
'domain' structure, along with information pointing the data.
instead of having an array of pointers to uniform structures,
there would be an array pointing to the 'domain' structures
for each protocol address domain (protocol family),
and the methods this reached would be called. The methods would have
an argument that gives FIB number, but the protocol would be free
to ignore it.
When the ABI can be changed it raises the possibility of the
addition of a fib entry into the "struct route". Currently,
the structure contains the sockaddr of the destination, and the resulting
fib entry. To make this work fully, one could add a fib number
so that given an address and a fib, one can find the third element, the
fib entry.
Interaction with the ARP layer/ LL layer would need to be
revisited as well. Qing Li has been working on this already.
This work was sponsored by Ironport Systems/Cisco
Reviewed by: several including rwatson, bz and mlair (parts each)
Obtained from: Ironport systems/Cisco
2008-05-09 23:03:00 +00:00
|
|
|
so->so_fibnum = 0;
|
|
|
|
break;
|
2010-11-12 13:02:26 +00:00
|
|
|
|
|
|
|
case SO_USER_COOKIE:
|
|
|
|
error = sooptcopyin(sopt, &val32, sizeof val32,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof val32);
|
2010-11-12 13:02:26 +00:00
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
so->so_user_cookie = val32;
|
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDBUF:
|
|
|
|
case SO_RCVBUF:
|
|
|
|
case SO_SNDLOWAT:
|
|
|
|
case SO_RCVLOWAT:
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
2012-12-07 22:13:33 +00:00
|
|
|
sizeof optval);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
1997-06-27 15:28:54 +00:00
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Values < 1 make no sense for any of these options,
|
|
|
|
* so disallow them.
|
1997-06-27 15:28:54 +00:00
|
|
|
*/
|
|
|
|
if (optval < 1) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
|
2017-06-25 01:41:07 +00:00
|
|
|
error = sbsetopt(so, sopt->sopt_name, optval);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case SO_SNDTIMEO:
|
|
|
|
case SO_RCVTIMEO:
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2008-11-22 12:36:15 +00:00
|
|
|
if (SV_CURPROC_FLAG(SV_ILP32)) {
|
2005-10-27 04:26:35 +00:00
|
|
|
struct timeval32 tv32;
|
|
|
|
|
|
|
|
error = sooptcopyin(sopt, &tv32, sizeof tv32,
|
|
|
|
sizeof tv32);
|
|
|
|
CP(tv32, tv, tv_sec);
|
|
|
|
CP(tv32, tv, tv_usec);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
error = sooptcopyin(sopt, &tv, sizeof tv,
|
|
|
|
sizeof tv);
|
1998-08-23 03:07:17 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
goto bad;
|
2013-08-29 15:59:05 +00:00
|
|
|
if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
|
|
|
|
tv.tv_usec >= 1000000) {
|
1999-05-21 15:54:40 +00:00
|
|
|
error = EDOM;
|
|
|
|
goto bad;
|
|
|
|
}
|
2014-08-04 05:40:51 +00:00
|
|
|
if (tv.tv_sec > INT32_MAX)
|
|
|
|
val = SBT_MAX;
|
|
|
|
else
|
|
|
|
val = tvtosbt(tv);
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_SNDTIMEO:
|
|
|
|
so->so_snd.sb_timeo = val;
|
|
|
|
break;
|
|
|
|
case SO_RCVTIMEO:
|
|
|
|
so->so_rcv.sb_timeo = val;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_LABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof extmac,
|
|
|
|
sizeof extmac);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
|
|
|
|
so, &extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2017-01-16 17:46:38 +00:00
|
|
|
case SO_TS_CLOCK:
|
|
|
|
error = sooptcopyin(sopt, &optval, sizeof optval,
|
|
|
|
sizeof optval);
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
so->so_ts_clock = optval;
|
|
|
|
break;
|
|
|
|
|
Implement kernel support for hardware rate limited sockets.
- Add RATELIMIT kernel configuration keyword which must be set to
enable the new functionality.
- Add support for hardware driven, Receive Side Scaling, RSS aware, rate
limited sendqueues and expose the functionality through the already
established SO_MAX_PACING_RATE setsockopt(). The API supports rates in
the range from 1 to 4Gbytes/s which are suitable for regular TCP and
UDP streams. The setsockopt(2) manual page has been updated.
- Add rate limit function callback API to "struct ifnet" which supports
the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
if_snd_tag_query() and if_snd_tag_free().
- Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
flag, which tells if a network driver supports rate limiting or not.
- This patch also adds support for rate limiting through VLAN and LAGG
intermediate network devices.
- How rate limiting works:
1) The userspace application calls setsockopt() after accepting or
making a new connection to set the rate which is then stored in the
socket structure in the kernel. Later on when packets are transmitted
a check is made in the transmit path for rate changes. A rate change
implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
destination network interface, which then sets up a custom sendqueue
with the given rate limitation parameter. A "struct m_snd_tag" pointer is
returned which serves as a "snd_tag" hint in the m_pkthdr for the
subsequently transmitted mbufs.
2) When the network driver sees the "m->m_pkthdr.snd_tag" different
from NULL, it will move the packets into a designated rate limited sendqueue
given by the snd_tag pointer. It is up to the individual drivers how the rate
limited traffic will be rate limited.
3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
routine when the ifnet pointer in the incoming snd_tag mismatches the
one of the network interface. The network adapter frees the mbuf and
returns EAGAIN which causes the ip_output() to release and clear the send
tag. Upon the next ip_output(), allocation of a new "snd_tag" will be attempted.
4) When the PCB is detached the custom sendqueue will be released by a
non-blocking ifp->if_snd_tag_free() call to the currently bound network
interface.
Reviewed by: wblock (manpages), adrian, gallatin, scottl (network)
Differential Revision: https://reviews.freebsd.org/D3687
Sponsored by: Mellanox Technologies
MFC after: 3 months
2017-01-18 13:31:17 +00:00
|
|
|
case SO_MAX_PACING_RATE:
|
|
|
|
error = sooptcopyin(sopt, &val32, sizeof(val32),
|
|
|
|
sizeof(val32));
|
|
|
|
if (error)
|
|
|
|
goto bad;
|
|
|
|
so->so_max_pacing_rate = val32;
|
|
|
|
break;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
2014-08-18 23:45:40 +00:00
|
|
|
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
|
|
|
|
error = hhook_run_socket(so, sopt,
|
|
|
|
HHOOK_SOCKET_OPT);
|
|
|
|
else
|
|
|
|
error = ENOPROTOOPT;
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
2012-02-26 13:51:05 +00:00
|
|
|
if (error == 0 && so->so_proto->pr_ctloutput != NULL)
|
|
|
|
(void)(*so->so_proto->pr_ctloutput)(so, sopt);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
bad:
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_RESTORE();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* Helper routine for getsockopt: hand an option value back to the caller.
*
* Copies at most min(len, sopt->sopt_valsize) bytes from buf into
* sopt->sopt_val and stores that clamped byte count back into
* sopt->sopt_valsize. When sopt->sopt_val is NULL nothing is copied,
* but sopt_valsize is still updated.
*
* Returns 0 on success, or the error reported by copyout().
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2003-08-05 00:27:54 +00:00
|
|
|
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-08-23 03:07:17 +00:00
|
|
|
int error;
|
|
|
|
size_t valsize;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
|
|
|
|
/*
|
2006-07-23 20:36:04 +00:00
|
|
|
* Documented get behavior is that we always return a value, possibly
|
|
|
|
* truncated to fit in the user's buffer. Traditional behavior is
|
|
|
|
* that we always tell the user precisely how much we copied, rather
|
|
|
|
* than something useful like the total amount we had available for
|
|
|
|
* her. Note that this interface is not idempotent; the entire
|
2016-05-22 13:04:45 +00:00
|
|
|
* answer must be generated ahead of time.
|
1998-08-23 03:07:17 +00:00
|
|
|
*/
|
1998-08-31 15:34:55 +00:00
|
|
|
valsize = min(len, sopt->sopt_valsize);
|
1998-08-31 18:07:23 +00:00
|
|
|
sopt->sopt_valsize = valsize;
|
2005-06-09 19:59:09 +00:00
|
|
|
/* With no destination buffer, skip the copy and return 0. */
if (sopt->sopt_val != NULL) {
|
|
|
|
/*
* sopt_td selects the copy primitive: copyout() when it is set,
* plain bcopy() otherwise. Presumably a NULL sopt_td marks a
* request whose buffer lives in kernel memory -- TODO confirm
* against callers.
*/
if (sopt->sopt_td != NULL)
|
1998-08-23 03:07:17 +00:00
|
|
|
error = copyout(buf, sopt->sopt_val, valsize);
|
|
|
|
else
|
|
|
|
bcopy(buf, sopt->sopt_val, valsize);
|
|
|
|
}
|
2006-02-12 15:00:27 +00:00
|
|
|
return (error);
|
1998-08-23 03:07:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2007-05-16 20:41:08 +00:00
|
|
|
sogetopt(struct socket *so, struct sockopt *sopt)
|
1998-08-23 03:07:17 +00:00
|
|
|
{
|
|
|
|
int error, optval;
|
|
|
|
struct linger l;
|
|
|
|
struct timeval tv;
|
2005-06-09 19:59:09 +00:00
|
|
|
#ifdef MAC
|
|
|
|
struct mac extmac;
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
CURVNET_SET(so->so_vnet);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = 0;
|
|
|
|
if (sopt->sopt_level != SOL_SOCKET) {
|
2012-02-26 13:51:05 +00:00
|
|
|
if (so->so_proto->pr_ctloutput != NULL)
|
2011-02-16 21:29:13 +00:00
|
|
|
error = (*so->so_proto->pr_ctloutput)(so, sopt);
|
|
|
|
else
|
|
|
|
error = ENOPROTOOPT;
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
1998-08-23 03:07:17 +00:00
|
|
|
switch (sopt->sopt_name) {
|
2000-06-20 01:09:23 +00:00
|
|
|
case SO_ACCEPTFILTER:
|
2017-06-02 17:49:21 +00:00
|
|
|
error = accept_filt_getopt(so, sopt);
|
2000-06-20 01:09:23 +00:00
|
|
|
break;
|
2014-07-26 19:27:34 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_LINGER:
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
l.l_onoff = so->so_options & SO_LINGER;
|
|
|
|
l.l_linger = so->so_linger;
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
error = sooptcopyout(sopt, &l, sizeof l);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case SO_USELOOPBACK:
|
|
|
|
case SO_DONTROUTE:
|
|
|
|
case SO_DEBUG:
|
|
|
|
case SO_KEEPALIVE:
|
|
|
|
case SO_REUSEADDR:
|
|
|
|
case SO_REUSEPORT:
|
2018-06-06 15:45:57 +00:00
|
|
|
case SO_REUSEPORT_LB:
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_BROADCAST:
|
|
|
|
case SO_OOBINLINE:
|
2005-08-01 21:15:09 +00:00
|
|
|
case SO_ACCEPTCONN:
|
1996-05-09 20:15:26 +00:00
|
|
|
case SO_TIMESTAMP:
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_BINTIME:
|
|
|
|
case SO_NOSIGPIPE:
|
2020-05-29 00:09:12 +00:00
|
|
|
case SO_NO_DDP:
|
|
|
|
case SO_NO_OFFLOAD:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_options & sopt->sopt_name;
|
|
|
|
integer:
|
|
|
|
error = sooptcopyout(sopt, &optval, sizeof optval);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
|
2018-08-21 14:04:30 +00:00
|
|
|
case SO_DOMAIN:
|
|
|
|
optval = so->so_proto->pr_domain->dom_family;
|
|
|
|
goto integer;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_TYPE:
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_type;
|
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2012-02-26 13:55:43 +00:00
|
|
|
case SO_PROTOCOL:
|
|
|
|
optval = so->so_proto->pr_protocol;
|
|
|
|
goto integer;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
case SO_ERROR:
|
2006-06-18 19:02:49 +00:00
|
|
|
SOCK_LOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
optval = so->so_error;
|
1994-05-24 10:09:53 +00:00
|
|
|
so->so_error = 0;
|
2006-06-18 19:02:49 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDBUF:
|
2017-07-21 07:44:43 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
|
|
|
|
so->so_snd.sb_hiwat;
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_RCVBUF:
|
2017-07-21 07:44:43 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
|
|
|
|
so->so_rcv.sb_hiwat;
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDLOWAT:
|
2017-07-21 07:44:43 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
|
|
|
|
so->so_snd.sb_lowat;
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_RCVLOWAT:
|
2017-07-21 07:44:43 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
|
|
|
|
so->so_rcv.sb_lowat;
|
1998-08-23 03:07:17 +00:00
|
|
|
goto integer;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
case SO_SNDTIMEO:
|
|
|
|
case SO_RCVTIMEO:
|
2014-08-04 05:40:51 +00:00
|
|
|
tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
|
|
|
|
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
|
2010-03-11 14:49:06 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
2008-11-22 12:36:15 +00:00
|
|
|
if (SV_CURPROC_FLAG(SV_ILP32)) {
|
2005-10-27 04:26:35 +00:00
|
|
|
struct timeval32 tv32;
|
|
|
|
|
|
|
|
CP(tv, tv32, tv_sec);
|
|
|
|
CP(tv, tv32, tv_usec);
|
|
|
|
error = sooptcopyout(sopt, &tv32, sizeof tv32);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
error = sooptcopyout(sopt, &tv, sizeof tv);
|
2005-06-09 19:59:09 +00:00
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_LABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof(extmac),
|
|
|
|
sizeof(extmac));
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
|
|
|
|
so, &extmac);
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = sooptcopyout(sopt, &extmac, sizeof extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
case SO_PEERLABEL:
|
|
|
|
#ifdef MAC
|
|
|
|
error = sooptcopyin(sopt, &extmac, sizeof(extmac),
|
|
|
|
sizeof(extmac));
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = mac_getsockopt_peerlabel(
|
|
|
|
sopt->sopt_td->td_ucred, so, &extmac);
|
|
|
|
if (error)
|
2011-02-16 21:29:13 +00:00
|
|
|
goto bad;
|
2005-06-09 19:59:09 +00:00
|
|
|
error = sooptcopyout(sopt, &extmac, sizeof extmac);
|
|
|
|
#else
|
|
|
|
error = EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
break;
|
2005-09-18 21:08:03 +00:00
|
|
|
|
|
|
|
case SO_LISTENQLIMIT:
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_qlimit : 0;
|
2005-09-18 21:08:03 +00:00
|
|
|
goto integer;
|
|
|
|
|
|
|
|
case SO_LISTENQLEN:
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_qlen : 0;
|
2005-09-18 21:08:03 +00:00
|
|
|
goto integer;
|
|
|
|
|
|
|
|
case SO_LISTENINCQLEN:
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
optval = SOLISTENING(so) ? so->sol_incqlen : 0;
|
2005-09-18 21:08:03 +00:00
|
|
|
goto integer;
|
|
|
|
|
2017-01-16 17:46:38 +00:00
|
|
|
case SO_TS_CLOCK:
|
|
|
|
optval = so->so_ts_clock;
|
|
|
|
goto integer;
|
|
|
|
|
Implement kernel support for hardware rate limited sockets.
- Add RATELIMIT kernel configuration keyword which must be set to
enable the new functionality.
- Add support for hardware driven, Receive Side Scaling, RSS aware, rate
limited sendqueues and expose the functionality through the already
established SO_MAX_PACING_RATE setsockopt(). The API support rates in
the range from 1 to 4Gbytes/s which are suitable for regular TCP and
UDP streams. The setsockopt(2) manual page has been updated.
- Add rate limit function callback API to "struct ifnet" which supports
the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
if_snd_tag_query() and if_snd_tag_free().
- Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
flag, which tells if a network driver supports rate limiting or not.
- This patch also adds support for rate limiting through VLAN and LAGG
intermediate network devices.
- How rate limiting works:
1) The userspace application calls setsockopt() after accepting or
making a new connection to set the rate which is then stored in the
socket structure in the kernel. Later on when packets are transmitted
a check is made in the transmit path for rate changes. A rate change
implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
destination network interface, which then sets up a custom sendqueue
with the given rate limitation parameter. A "struct m_snd_tag" pointer is
returned which serves as a "snd_tag" hint in the m_pkthdr for the
subsequently transmitted mbufs.
2) When the network driver sees the "m->m_pkthdr.snd_tag" different
from NULL, it will move the packets into a designated rate limited sendqueue
given by the snd_tag pointer. It is up to the individual drivers how the rate
limited traffic will be rate limited.
3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
routine when the ifnet pointer in the incoming snd_tag mismatches the
one of the network interface. The network adapter frees the mbuf and
returns EAGAIN which causes the ip_output() to release and clear the send
tag. Upon next ip_output() a new "snd_tag" will be tried allocated.
4) When the PCB is detached the custom sendqueue will be released by a
non-blocking ifp->if_snd_tag_free() call to the currently bound network
interface.
Reviewed by: wblock (manpages), adrian, gallatin, scottl (network)
Differential Revision: https://reviews.freebsd.org/D3687
Sponsored by: Mellanox Technologies
MFC after: 3 months
2017-01-18 13:31:17 +00:00
|
|
|
case SO_MAX_PACING_RATE:
|
|
|
|
optval = so->so_max_pacing_rate;
|
|
|
|
goto integer;
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
default:
|
2014-08-18 23:45:40 +00:00
|
|
|
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
|
|
|
|
error = hhook_run_socket(so, sopt,
|
|
|
|
HHOOK_SOCKET_OPT);
|
|
|
|
else
|
|
|
|
error = ENOPROTOOPT;
|
1998-08-23 03:07:17 +00:00
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2011-02-16 21:29:13 +00:00
|
|
|
#ifdef MAC
|
|
|
|
bad:
|
|
|
|
#endif
|
|
|
|
CURVNET_RESTORE();
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1999-11-22 02:45:11 +00:00
|
|
|
int
|
|
|
|
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
|
|
|
|
{
|
|
|
|
struct mbuf *m, *m_prev;
|
|
|
|
int sopt_size = sopt->sopt_valsize;
|
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL)
|
1999-11-22 02:45:11 +00:00
|
|
|
return ENOBUFS;
|
|
|
|
if (sopt_size > MLEN) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
|
1999-11-22 02:45:11 +00:00
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
|
|
|
m_free(m);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
m->m_len = min(MCLBYTES, sopt_size);
|
|
|
|
} else {
|
|
|
|
m->m_len = min(MLEN, sopt_size);
|
|
|
|
}
|
|
|
|
sopt_size -= m->m_len;
|
|
|
|
*mp = m;
|
|
|
|
m_prev = m;
|
|
|
|
|
|
|
|
while (sopt_size) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
|
2005-06-09 19:59:09 +00:00
|
|
|
if (m == NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
m_freem(*mp);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
if (sopt_size > MLEN) {
|
2012-12-05 08:04:20 +00:00
|
|
|
MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
|
|
|
|
M_NOWAIT);
|
1999-11-22 02:45:11 +00:00
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
2005-06-09 19:59:09 +00:00
|
|
|
m_freem(m);
|
1999-11-22 02:45:11 +00:00
|
|
|
m_freem(*mp);
|
|
|
|
return ENOBUFS;
|
|
|
|
}
|
|
|
|
m->m_len = min(MCLBYTES, sopt_size);
|
|
|
|
} else {
|
|
|
|
m->m_len = min(MLEN, sopt_size);
|
|
|
|
}
|
|
|
|
sopt_size -= m->m_len;
|
|
|
|
m_prev->m_next = m;
|
|
|
|
m_prev = m;
|
|
|
|
}
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct mbuf *m0 = m;
|
|
|
|
|
|
|
|
if (sopt->sopt_val == NULL)
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyin(sopt->sopt_val, mtod(m, char *),
|
2012-12-07 22:13:33 +00:00
|
|
|
m->m_len);
|
1999-11-22 02:45:11 +00:00
|
|
|
if (error != 0) {
|
|
|
|
m_freem(m0);
|
|
|
|
return(error);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
|
|
|
|
sopt->sopt_valsize -= m->m_len;
|
2005-06-09 19:59:09 +00:00
|
|
|
sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
|
1999-11-22 02:45:11 +00:00
|
|
|
m = m->m_next;
|
|
|
|
}
|
|
|
|
if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
|
|
|
|
panic("ip6_sooptmcopyin");
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
|
|
|
|
{
|
|
|
|
struct mbuf *m0 = m;
|
|
|
|
size_t valsize = 0;
|
|
|
|
|
|
|
|
if (sopt->sopt_val == NULL)
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
while (m != NULL && sopt->sopt_valsize >= m->m_len) {
|
2005-06-09 19:59:09 +00:00
|
|
|
if (sopt->sopt_td != NULL) {
|
1999-11-22 02:45:11 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = copyout(mtod(m, char *), sopt->sopt_val,
|
2012-12-07 22:13:33 +00:00
|
|
|
m->m_len);
|
1999-11-22 02:45:11 +00:00
|
|
|
if (error != 0) {
|
|
|
|
m_freem(m0);
|
|
|
|
return(error);
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
|
2012-12-07 22:13:33 +00:00
|
|
|
sopt->sopt_valsize -= m->m_len;
|
|
|
|
sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
|
|
|
|
valsize += m->m_len;
|
|
|
|
m = m->m_next;
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
if (m != NULL) {
|
|
|
|
/* enough soopt buffer should be given from user-land */
|
|
|
|
m_freem(m0);
|
|
|
|
return(EINVAL);
|
|
|
|
}
|
|
|
|
sopt->sopt_valsize = valsize;
|
2006-02-12 15:00:27 +00:00
|
|
|
return (0);
|
1999-11-22 02:45:11 +00:00
|
|
|
}
|
|
|
|
|
2006-07-23 20:36:04 +00:00
|
|
|
/*
|
|
|
|
* sohasoutofband(): protocol notifies socket layer of the arrival of new
|
|
|
|
* out-of-band data, which will then notify socket consumers.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
sohasoutofband(struct socket *so)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
Installed the second patch attached to kern/7899 with some changes suggested
by bde, a few other tweaks to get the patch to apply cleanly again and
some improvements to the comments.
This change closes some fairly minor security holes associated with
F_SETOWN, fixes a few bugs, and removes some limitations that F_SETOWN
had on tty devices. For more details, see the description on the PR.
Because this patch increases the size of the proc and pgrp structures,
it is necessary to re-install the includes and recompile libkvm,
the vinum lkm, fstat, gcore, gdb, ipfilter, ps, top, and w.
PR: kern/7899
Reviewed by: bde, elvind
1998-11-11 10:04:13 +00:00
|
|
|
if (so->so_sigio != NULL)
|
2005-06-09 19:59:09 +00:00
|
|
|
pgsigio(&so->so_sigio, SIGURG, 0);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
selwakeuppri(&so->so_rdsel, PSOCK);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-04-27 20:01:29 +00:00
|
|
|
|
|
|
|
int
|
2005-06-09 19:59:09 +00:00
|
|
|
sopoll(struct socket *so, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
1997-04-27 20:01:29 +00:00
|
|
|
{
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
|
2011-02-16 21:29:13 +00:00
|
|
|
/*
|
|
|
|
* We do not need to set or assert curvnet as long as everyone uses
|
|
|
|
* sopoll_generic().
|
|
|
|
*/
|
soreceive_generic(), and sopoll_generic(). Add new functions sosend(),
soreceive(), and sopoll(), which are wrappers for pru_sosend,
pru_soreceive, and pru_sopoll, and are now used univerally by socket
consumers rather than either directly invoking the old so*() functions
or directly invoking the protocol switch method (about an even split
prior to this commit).
This completes an architectural change that was begun in 1996 to permit
protocols to provide substitute implementations, as now used by UDP.
Consumers now uniformly invoke sosend(), soreceive(), and sopoll() to
perform these operations on sockets -- in particular, distributed file
systems and socket system calls.
Architectural head nod: sam, gnn, wollman
2006-07-24 15:20:08 +00:00
|
|
|
return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
|
|
|
|
td));
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
|
|
|
|
struct thread *td)
|
|
|
|
{
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
int revents;
|
1997-09-14 02:34:14 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
if (!(events & (POLLIN | POLLRDNORM)))
|
|
|
|
revents = 0;
|
|
|
|
else if (!TAILQ_EMPTY(&so->sol_comp))
|
|
|
|
revents = events & (POLLIN | POLLRDNORM);
|
2018-10-03 17:40:04 +00:00
|
|
|
else if ((events & POLLINIGNEOF) == 0 && so->so_error)
|
|
|
|
revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
else {
|
|
|
|
selrecord(td, &so->so_rdsel);
|
|
|
|
revents = 0;
|
2009-08-25 21:44:14 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
} else {
|
|
|
|
revents = 0;
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
if (events & (POLLIN | POLLRDNORM))
|
|
|
|
if (soreadabledata(so))
|
|
|
|
revents |= events & (POLLIN | POLLRDNORM);
|
|
|
|
if (events & (POLLOUT | POLLWRNORM))
|
|
|
|
if (sowriteable(so))
|
|
|
|
revents |= events & (POLLOUT | POLLWRNORM);
|
|
|
|
if (events & (POLLPRI | POLLRDBAND))
|
|
|
|
if (so->so_oobmark ||
|
|
|
|
(so->so_rcv.sb_state & SBS_RCVATMARK))
|
|
|
|
revents |= events & (POLLPRI | POLLRDBAND);
|
|
|
|
if ((events & POLLINIGNEOF) == 0) {
|
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
|
|
|
revents |= events & (POLLIN | POLLRDNORM);
|
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE)
|
|
|
|
revents |= POLLHUP;
|
|
|
|
}
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held if we
are accepting, or without unp_link_lock if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (revents == 0) {
|
|
|
|
if (events &
|
|
|
|
(POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
|
|
|
|
selrecord(td, &so->so_rdsel);
|
|
|
|
so->so_rcv.sb_flags |= SB_SEL;
|
|
|
|
}
|
|
|
|
if (events & (POLLOUT | POLLWRNORM)) {
|
|
|
|
selrecord(td, &so->so_wrsel);
|
|
|
|
so->so_snd.sb_flags |= SB_SEL;
|
|
|
|
}
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since the listening stuff no longer
affects struct socket size, just move its fields into the listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing the vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply moving the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since the listening stuff no longer
affects struct socket size, just move its fields into the listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing the vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply moving the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_UNLOCK(so);
|
1997-09-14 02:34:14 +00:00
|
|
|
return (revents);
|
1997-04-27 20:01:29 +00:00
|
|
|
}
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
int
|
2005-06-09 19:59:09 +00:00
|
|
|
soo_kqfilter(struct file *fp, struct knote *kn)
|
2000-04-16 18:53:38 +00:00
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2001-02-15 16:34:11 +00:00
|
|
|
struct sockbuf *sb;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
struct knlist *knl;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
switch (kn->kn_filter) {
|
|
|
|
case EVFILT_READ:
|
2017-02-01 13:12:07 +00:00
|
|
|
kn->kn_fop = &soread_filtops;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
knl = &so->so_rdsel.si_note;
|
2001-02-15 16:34:11 +00:00
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case EVFILT_WRITE:
|
|
|
|
kn->kn_fop = &sowrite_filtops;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
knl = &so->so_wrsel.si_note;
|
2001-02-15 16:34:11 +00:00
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
2017-01-16 08:25:33 +00:00
|
|
|
case EVFILT_EMPTY:
|
|
|
|
kn->kn_fop = &soempty_filtops;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
knl = &so->so_wrsel.si_note;
|
2017-01-16 08:25:33 +00:00
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
2001-02-15 16:34:11 +00:00
|
|
|
default:
|
2005-06-09 19:59:09 +00:00
|
|
|
return (EINVAL);
|
2001-02-15 16:34:11 +00:00
|
|
|
}
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
knlist_add(knl, kn, 1);
|
|
|
|
} else {
|
|
|
|
SOCKBUF_LOCK(sb);
|
|
|
|
knlist_add(knl, kn, 1);
|
|
|
|
sb->sb_flags |= SB_KNOTE;
|
|
|
|
SOCKBUF_UNLOCK(sb);
|
|
|
|
}
|
|
|
|
SOCK_UNLOCK(so);
|
2000-04-16 18:53:38 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
|
|
|
* Some routines that return EOPNOTSUPP for entry points that are not
|
|
|
|
* supported by a protocol. Fill in as needed.
|
|
|
|
*/
|
|
|
|
/*
 * Stub for the accept entry point of a protocol that does not support it.
 * Both arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2016-04-29 20:11:09 +00:00
|
|
|
/*
 * Stub for the aio_queue entry point of a protocol that does not support it.
 * Both arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
 * Stub for the attach entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the bind entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/*
 * Stub for the bindat entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
 * Stub for the connect entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2013-03-02 21:11:30 +00:00
|
|
|
/*
 * Stub for the connectat entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
 * Stub for the connect2 entry point of a protocol that does not support it.
 * Both sockets are ignored; always returns EOPNOTSUPP.
 */
int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
int
|
|
|
|
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
|
2007-05-16 20:41:08 +00:00
|
|
|
struct ifnet *ifp, struct thread *td)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
return EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Stub for the disconnect entry point of a protocol that does not support
 * it.  The socket is ignored; always returns EOPNOTSUPP.
 */
int
pru_disconnect_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the listen entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the peeraddr entry point of a protocol that does not support it.
 * Both arguments are ignored (*nam is not written); always returns
 * EOPNOTSUPP.
 */
int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the rcvd entry point of a protocol that does not support it.
 * Both arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_rcvd_notsupp(struct socket *so, int flags)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the rcvoob entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the send entry point of a protocol that does not support it.
 * All arguments are ignored -- in particular the mbufs m and control are
 * neither consumed nor freed here; always returns EOPNOTSUPP.
 */
int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2014-11-30 13:24:21 +00:00
|
|
|
/*
 * Stub for the ready entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP.
 */
int
pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* This isn't really a ``null'' operation, but it's the default one and
|
|
|
|
* doesn't do anything destructive.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
pru_sense_null(struct socket *so, struct stat *sb)
|
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
sb->st_blksize = so->so_snd.sb_hiwat;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Stub for the shutdown entry point of a protocol that does not support it.
 * The socket is ignored; always returns EOPNOTSUPP.
 */
int
pru_shutdown_notsupp(struct socket *so)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the sockaddr entry point of a protocol that does not support it.
 * Both arguments are ignored (*nam is not written); always returns
 * EOPNOTSUPP.
 */
int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the sosend entry point of a protocol that does not support it.
 * All arguments are ignored -- uio, top and control are neither consumed
 * nor freed here; always returns EOPNOTSUPP.
 */
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the soreceive entry point of a protocol that does not support
 * it.  All arguments are ignored -- none of the output pointers (paddr,
 * mp0, controlp, flagsp) is written; always returns EOPNOTSUPP.
 */
int
pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
|
|
|
/*
 * Stub for the sopoll entry point of a protocol that does not support it.
 * All arguments are ignored; always returns EOPNOTSUPP (an errno value, not
 * a mask built from the requested events).
 */
int
pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
|
|
|
|
|
2000-04-16 18:53:38 +00:00
|
|
|
static void
|
|
|
|
filt_sordetach(struct knote *kn)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so_rdknl_lock(so);
|
|
|
|
knlist_remove(&so->so_rdsel.si_note, kn, 1);
|
|
|
|
if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
|
2000-04-16 18:53:38 +00:00
|
|
|
so->so_rcv.sb_flags &= ~SB_KNOTE;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so_rdknl_unlock(so);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_soread(struct knote *kn, long hint)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so;
|
2004-08-24 05:28:18 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
so = kn->kn_fp->f_data;
|
2017-02-01 13:12:07 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
kn->kn_data = so->sol_qlen;
|
2018-10-03 17:40:04 +00:00
|
|
|
if (so->so_error) {
|
|
|
|
kn->kn_flags |= EV_EOF;
|
|
|
|
kn->kn_fflags = so->so_error;
|
|
|
|
return (1);
|
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
return (!TAILQ_EMPTY(&so->sol_comp));
|
2017-02-01 13:12:07 +00:00
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
|
2014-11-12 09:57:15 +00:00
|
|
|
kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
|
|
|
|
kn->kn_flags |= EV_EOF;
|
2001-02-24 01:33:12 +00:00
|
|
|
kn->kn_fflags = so->so_error;
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
} else if (so->so_error) /* temporary udp error */
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2014-08-18 23:45:40 +00:00
|
|
|
|
|
|
|
if (kn->kn_sfflags & NOTE_LOWAT) {
|
|
|
|
if (kn->kn_data >= kn->kn_sdata)
|
2017-02-01 13:12:07 +00:00
|
|
|
return (1);
|
|
|
|
} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
|
|
|
|
return (1);
|
2014-08-18 23:45:40 +00:00
|
|
|
|
2014-09-08 09:04:22 +00:00
|
|
|
/* This hook returning non-zero indicates an event, not error */
|
|
|
|
return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_sowdetach(struct knote *kn)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so = kn->kn_fp->f_data;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so_wrknl_lock(so);
|
|
|
|
knlist_remove(&so->so_wrsel.si_note, kn, 1);
|
|
|
|
if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
|
2000-04-16 18:53:38 +00:00
|
|
|
so->so_snd.sb_flags &= ~SB_KNOTE;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
so_wrknl_unlock(so);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_sowrite(struct knote *kn, long hint)
|
|
|
|
{
|
2005-06-09 19:59:09 +00:00
|
|
|
struct socket *so;
|
2000-04-16 18:53:38 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
so = kn->kn_fp->f_data;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
return (0);
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_snd);
|
2000-04-16 18:53:38 +00:00
|
|
|
kn->kn_data = sbspace(&so->so_snd);
|
2014-08-18 23:45:40 +00:00
|
|
|
|
2014-09-08 09:04:22 +00:00
|
|
|
hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
|
2014-08-18 23:45:40 +00:00
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
|
|
|
|
kn->kn_flags |= EV_EOF;
|
2001-02-24 01:33:12 +00:00
|
|
|
kn->kn_fflags = so->so_error;
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
} else if (so->so_error) /* temporary udp error */
|
2004-08-24 05:28:18 +00:00
|
|
|
return (1);
|
2005-06-09 19:59:09 +00:00
|
|
|
else if (((so->so_state & SS_ISCONNECTED) == 0) &&
|
2002-05-31 11:52:35 +00:00
|
|
|
(so->so_proto->pr_flags & PR_CONNREQUIRED))
|
2004-08-24 05:28:18 +00:00
|
|
|
return (0);
|
2005-06-09 19:59:09 +00:00
|
|
|
else if (kn->kn_sfflags & NOTE_LOWAT)
|
2004-08-24 05:28:18 +00:00
|
|
|
return (kn->kn_data >= kn->kn_sdata);
|
2005-06-09 19:59:09 +00:00
|
|
|
else
|
|
|
|
return (kn->kn_data >= so->so_snd.sb_lowat);
|
2000-04-16 18:53:38 +00:00
|
|
|
}
|
|
|
|
|
2017-01-16 08:25:33 +00:00
|
|
|
static int
|
|
|
|
filt_soempty(struct knote *kn, long hint)
|
|
|
|
{
|
|
|
|
struct socket *so;
|
|
|
|
|
|
|
|
so = kn->kn_fp->f_data;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
return (1);
|
|
|
|
|
2017-01-16 08:25:33 +00:00
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_snd);
|
|
|
|
kn->kn_data = sbused(&so->so_snd);
|
|
|
|
|
|
|
|
if (kn->kn_data == 0)
|
|
|
|
return (1);
|
|
|
|
else
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2005-06-09 19:59:09 +00:00
|
|
|
int
|
|
|
|
socheckuid(struct socket *so, uid_t uid)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (so == NULL)
|
|
|
|
return (EPERM);
|
|
|
|
if (so->so_cred->cr_uid != uid)
|
|
|
|
return (EPERM);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* These functions are used by protocols to notify the socket layer (and its
|
|
|
|
* consumers) of state changes in the sockets driven by protocol-side events.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Procedures to manipulate state flags of socket and do appropriate wakeups.
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Normal sequence from the active (originating) side is that
|
|
|
|
* soisconnecting() is called during processing of connect() call, resulting
|
|
|
|
* in an eventual call to soisconnected() if/when the connection is
|
|
|
|
* established. When the connection is torn down soisdisconnecting() is
|
|
|
|
* called during processing of disconnect() call, and soisdisconnected() is
|
|
|
|
* called when the connection to the peer is totally severed. The semantics
|
|
|
|
* of these routines are such that connectionless protocols can call
|
|
|
|
* soisconnected() and soisdisconnected() only, bypassing the in-progress
|
|
|
|
* calls when setting up a ``connection'' takes no time.
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* From the passive side, a socket is created with two queues of sockets:
|
|
|
|
* so_incomp for connections in progress and so_comp for connections already
|
|
|
|
* made and awaiting user acceptance. As a protocol is preparing incoming
|
|
|
|
* connections, it creates a socket structure queued on so_incomp by calling
|
|
|
|
* sonewconn(). When the connection is established, soisconnected() is
|
|
|
|
* called, and transfers the socket structure to so_comp, making it available
|
|
|
|
* to accept().
|
2007-03-26 08:59:03 +00:00
|
|
|
*
|
2007-03-26 17:05:09 +00:00
|
|
|
* If a socket is closed with sockets on either so_incomp or so_comp, these
|
|
|
|
* sockets are dropped.
|
|
|
|
*
|
|
|
|
* If higher-level protocols are implemented in the kernel, the wakeups done
|
|
|
|
* here will sometimes cause software-interrupt process scheduling.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisconnecting(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
|
|
|
|
so->so_state |= SS_ISCONNECTING;
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisconnected(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
|
|
|
|
so->so_state |= SS_ISCONNECTED;
|
2017-09-14 18:05:54 +00:00
|
|
|
|
|
|
|
if (so->so_qstate == SQ_INCOMP) {
|
|
|
|
struct socket *head = so->so_listen;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
|
|
|
|
/*
|
|
|
|
* Promoting a socket from incomplete queue to complete, we
|
|
|
|
* need to go through reverse order of locking. We first do
|
|
|
|
* trylock, and if that doesn't succeed, we go the hard way
|
|
|
|
* leaving a reference and rechecking consistency after proper
|
|
|
|
* locking.
|
|
|
|
*/
|
|
|
|
if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
|
|
|
|
soref(head);
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
SOLISTEN_LOCK(head);
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
if (__predict_false(head != so->so_listen)) {
|
|
|
|
/*
|
|
|
|
* The socket went off the listen queue,
|
|
|
|
* should be lost race to close(2) of sol.
|
|
|
|
* The socket is about to soabort().
|
|
|
|
*/
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
sorele(head);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Not the last one, as so holds a ref. */
|
|
|
|
refcount_release(&head->so_count);
|
|
|
|
}
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
again:
|
2007-03-26 08:59:03 +00:00
|
|
|
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
TAILQ_REMOVE(&head->sol_incomp, so, so_list);
|
|
|
|
head->sol_incqlen--;
|
|
|
|
TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
|
|
|
|
head->sol_qlen++;
|
|
|
|
so->so_qstate = SQ_COMP;
|
2007-03-26 08:59:03 +00:00
|
|
|
SOCK_UNLOCK(so);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
solisten_wakeup(head); /* unlocks */
|
2007-03-26 08:59:03 +00:00
|
|
|
} else {
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
2009-06-01 21:17:03 +00:00
|
|
|
soupcall_set(so, SO_RCV,
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
head->sol_accept_filter->accf_callback,
|
|
|
|
head->sol_accept_filter_arg);
|
2007-03-26 08:59:03 +00:00
|
|
|
so->so_options &= ~SO_ACCEPTFILTER;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
ret = head->sol_accept_filter->accf_callback(so,
|
|
|
|
head->sol_accept_filter_arg, M_NOWAIT);
|
|
|
|
if (ret == SU_ISCONNECTED) {
|
2009-06-01 21:17:03 +00:00
|
|
|
soupcall_clear(so, SO_RCV);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
2007-03-26 08:59:03 +00:00
|
|
|
SOCK_UNLOCK(so);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOLISTEN_UNLOCK(head);
|
2007-03-26 08:59:03 +00:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
sorwakeup(so);
|
|
|
|
sowwakeup(so);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisdisconnecting(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_LOCK(so);
|
2007-03-26 08:59:03 +00:00
|
|
|
so->so_state &= ~SS_ISCONNECTING;
|
|
|
|
so->so_state |= SS_ISDISCONNECTING;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
if (!SOLISTENING(so)) {
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
socantrcvmore_locked(so);
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
socantsendmore_locked(so);
|
|
|
|
}
|
|
|
|
SOCK_UNLOCK(so);
|
2007-03-26 08:59:03 +00:00
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2007-05-16 20:41:08 +00:00
|
|
|
soisdisconnected(struct socket *so)
|
2007-03-26 08:59:03 +00:00
|
|
|
{
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCK_LOCK(so);
|
2020-05-14 20:17:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* There is at least one reader of so_state that does not
|
|
|
|
* acquire socket lock, namely soreceive_generic(). Ensure
|
|
|
|
* that it never sees all flags that track connection status
|
|
|
|
* cleared, by ordering the update with a barrier semantic of
|
|
|
|
* our release thread fence.
|
|
|
|
*/
|
2007-03-26 08:59:03 +00:00
|
|
|
so->so_state |= SS_ISDISCONNECTED;
|
2020-05-14 20:17:09 +00:00
|
|
|
atomic_thread_fence_rel();
|
|
|
|
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
|
|
|
|
if (!SOLISTENING(so)) {
|
2017-08-24 20:49:19 +00:00
|
|
|
SOCK_UNLOCK(so);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
socantrcvmore_locked(so);
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
|
|
|
|
socantsendmore_locked(so);
|
2017-08-24 20:49:19 +00:00
|
|
|
} else
|
|
|
|
SOCK_UNLOCK(so);
|
2007-03-26 08:59:03 +00:00
|
|
|
wakeup(&so->so_timeo);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
|
|
|
|
*/
|
|
|
|
struct sockaddr *
|
|
|
|
sodupsockaddr(const struct sockaddr *sa, int mflags)
|
|
|
|
{
|
|
|
|
struct sockaddr *sa2;
|
|
|
|
|
|
|
|
sa2 = malloc(sa->sa_len, M_SONAME, mflags);
|
|
|
|
if (sa2)
|
|
|
|
bcopy(sa, sa2, sa->sa_len);
|
|
|
|
return sa2;
|
|
|
|
}
|
|
|
|
|
2018-06-08 19:35:24 +00:00
|
|
|
/*
|
|
|
|
* Register per-socket destructor.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
sodtor_set(struct socket *so, so_dtor_t *func)
|
|
|
|
{
|
|
|
|
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
so->so_dtor = func;
|
|
|
|
}
|
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
/*
|
|
|
|
* Register per-socket buffer upcalls.
|
|
|
|
*/
|
|
|
|
void
|
2017-06-07 01:48:11 +00:00
|
|
|
soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
|
2009-06-01 21:17:03 +00:00
|
|
|
{
|
|
|
|
struct sockbuf *sb;
|
2012-12-07 22:13:33 +00:00
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
|
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
switch (which) {
|
|
|
|
case SO_RCV:
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case SO_SND:
|
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("soupcall_set: bad which");
|
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
|
|
|
sb->sb_upcall = func;
|
|
|
|
sb->sb_upcallarg = arg;
|
|
|
|
sb->sb_flags |= SB_UPCALL;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
soupcall_clear(struct socket *so, int which)
|
|
|
|
{
|
|
|
|
struct sockbuf *sb;
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
|
|
|
|
|
2009-06-01 21:17:03 +00:00
|
|
|
switch (which) {
|
|
|
|
case SO_RCV:
|
|
|
|
sb = &so->so_rcv;
|
|
|
|
break;
|
|
|
|
case SO_SND:
|
|
|
|
sb = &so->so_snd;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
panic("soupcall_clear: bad which");
|
|
|
|
}
|
|
|
|
SOCKBUF_LOCK_ASSERT(sb);
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
KASSERT(sb->sb_upcall != NULL,
|
|
|
|
("%s: so %p no upcall to clear", __func__, so));
|
2009-06-01 21:17:03 +00:00
|
|
|
sb->sb_upcall = NULL;
|
|
|
|
sb->sb_upcallarg = NULL;
|
|
|
|
sb->sb_flags &= ~SB_UPCALL;
|
|
|
|
}
|
|
|
|
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
void
|
|
|
|
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
|
|
|
|
{
|
|
|
|
|
|
|
|
SOLISTEN_LOCK_ASSERT(so);
|
|
|
|
so->sol_upcall = func;
|
|
|
|
so->sol_upcallarg = arg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_rdknl_lock(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_LOCK(&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_rdknl_unlock(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_UNLOCK(&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_rdknl_assert_locked(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_rdknl_assert_unlocked(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_UNLOCK_ASSERT(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_wrknl_lock(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_LOCK(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_LOCK(&so->so_snd);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_wrknl_unlock(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_UNLOCK(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_UNLOCK(&so->so_snd);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_wrknl_assert_locked(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_LOCK_ASSERT(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_LOCK_ASSERT(&so->so_snd);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
so_wrknl_assert_unlocked(void *arg)
|
|
|
|
{
|
|
|
|
struct socket *so = arg;
|
|
|
|
|
|
|
|
if (SOLISTENING(so))
|
|
|
|
SOCK_UNLOCK_ASSERT(so);
|
|
|
|
else
|
|
|
|
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
|
|
|
|
}
|
|
|
|
|
2007-03-26 08:59:03 +00:00
|
|
|
/*
|
2007-03-26 17:05:09 +00:00
|
|
|
* Create an external-format (``xsocket'') structure using the information in
|
|
|
|
* the kernel-format socket structure pointed to by so. This is done to
|
|
|
|
* reduce the spew of irrelevant information over this interface, to isolate
|
|
|
|
* user code from changes in the kernel structure, and potentially to provide
|
|
|
|
* information-hiding if we decide that some of this information should be
|
|
|
|
* hidden from users.
|
2007-03-26 08:59:03 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
sotoxsocket(struct socket *so, struct xsocket *xso)
|
|
|
|
{
|
2007-05-16 20:41:08 +00:00
|
|
|
|
2018-11-22 20:49:41 +00:00
|
|
|
bzero(xso, sizeof(*xso));
|
2007-03-26 08:59:03 +00:00
|
|
|
xso->xso_len = sizeof *xso;
|
2018-07-10 13:03:06 +00:00
|
|
|
xso->xso_so = (uintptr_t)so;
|
2007-03-26 08:59:03 +00:00
|
|
|
xso->so_type = so->so_type;
|
|
|
|
xso->so_options = so->so_options;
|
|
|
|
xso->so_linger = so->so_linger;
|
|
|
|
xso->so_state = so->so_state;
|
2018-07-10 13:03:06 +00:00
|
|
|
xso->so_pcb = (uintptr_t)so->so_pcb;
|
2007-03-26 08:59:03 +00:00
|
|
|
xso->xso_protocol = so->so_proto->pr_protocol;
|
|
|
|
xso->xso_family = so->so_proto->pr_domain->dom_family;
|
|
|
|
xso->so_timeo = so->so_timeo;
|
|
|
|
xso->so_error = so->so_error;
|
|
|
|
xso->so_uid = so->so_cred->cr_uid;
|
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take out selinfo's from the socket buffers into the socket. The
first reason is to support braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is cast on it. The second
reason is that there is future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep same selinfos through the lifetime
of a socket.
- Remove struct struct so_accf. Since now listening stuff no longer
affects struct socket size, just move its fields into listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow to acquire two socket locks, but the first one must belong to a
listening socket.
- Make soref()/sorele() to use atomic(9). This allows in some situations
to do soref() without owning socket lock. There is place for improvement
here, it is possible to make sorele() also to lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parralelism in the UNIX sockets.
- Now call into uipc_attach() may happen with unp_link_lock hold if, we
are accepting, or without unp_link_lock in case if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basicly did nothing
for a listening socket. The vnode remained opened for connections. This
is fixed by removing vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00
|
|
|
xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
|
|
|
|
if (SOLISTENING(so)) {
|
|
|
|
xso->so_qlen = so->sol_qlen;
|
|
|
|
xso->so_incqlen = so->sol_incqlen;
|
|
|
|
xso->so_qlimit = so->sol_qlimit;
|
|
|
|
xso->so_oobmark = 0;
|
|
|
|
} else {
|
|
|
|
xso->so_state |= so->so_qstate;
|
|
|
|
xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
|
|
|
|
xso->so_oobmark = so->so_oobmark;
|
|
|
|
sbtoxsockbuf(&so->so_snd, &xso->so_snd);
|
|
|
|
sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
|
|
|
|
}
|
2008-07-21 00:49:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct sockbuf *
|
|
|
|
so_sockbuf_rcv(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (&so->so_rcv);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct sockbuf *
|
|
|
|
so_sockbuf_snd(struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (&so->so_snd);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_state_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_state);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_state_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_state = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_options_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_options_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_options = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_error_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_error);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_error_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_error = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
so_linger_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_linger);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_linger_set(struct socket *so, int val)
|
|
|
|
{
|
|
|
|
|
2019-07-14 21:44:18 +00:00
|
|
|
KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
|
|
|
|
("%s: val %d out of range", __func__, val));
|
|
|
|
|
2008-07-21 00:49:34 +00:00
|
|
|
so->so_linger = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct protosw *
|
|
|
|
so_protosw_get(const struct socket *so)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (so->so_proto);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
so_protosw_set(struct socket *so, struct protosw *val)
|
|
|
|
{
|
|
|
|
|
|
|
|
so->so_proto = val;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper that forwards to sorwakeup() for the given socket.
 */
void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper that forwards to sowwakeup() for the given socket.
 */
void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper that forwards to sorwakeup_locked() for the given
 * socket.  Locking requirements are those of sorwakeup_locked() itself.
 */
void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper that forwards to sowwakeup_locked() for the given
 * socket.  Locking requirements are those of sowwakeup_locked() itself.
 */
void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper around SOCK_LOCK(): acquire the socket lock.
 */
void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}
|
|
|
|
|
|
|
|
/*
 * Function wrapper around SOCK_UNLOCK(): release the socket lock.
 */
void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
|