2005-01-07 01:45:51 +00:00
|
|
|
/*-
|
2017-11-20 19:43:44 +00:00
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
1995-10-03 16:54:17 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
|
1994-05-24 10:09:53 +00:00
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2017-02-28 23:42:47 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1995-10-03 16:54:17 +00:00
|
|
|
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
2007-10-07 20:44:24 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
Initial import of RFC 2385 (TCP-MD5) digest support.
This is the first of two commits; bringing in the kernel support first.
This can be enabled by compiling a kernel with options TCP_SIGNATURE
and FAST_IPSEC.
For the uninitiated, this is a TCP option which provides for a means of
authenticating TCP sessions which came into being before IPSEC. It is
still relevant today, however, as it is used by many commercial router
vendors, particularly with BGP, and as such has become a requirement for
interconnect at many major Internet points of presence.
Several parts of the TCP and IP headers, including the segment payload,
are digested with MD5, including a shared secret. The PF_KEY interface
is used to manage the secrets using security associations in the SADB.
There is a limitation here in that as there is no way to map a TCP flow
per-port back to an SPI without polluting tcpcb or using the SPD; the
code to do the latter is unstable at this time. Therefore this code only
supports per-host keying granularity.
Whilst FAST_IPSEC is mutually exclusive with KAME IPSEC (and thus IPv6),
TCP_SIGNATURE applies only to IPv4. For the vast majority of prospective
users of this feature, this will not pose any problem.
This implementation is output-only; that is, the option is honoured when
responding to a host initiating a TCP session, but no effort is made
[yet] to authenticate inbound traffic. This is, however, sufficient to
interwork with Cisco equipment.
Tested with a Cisco 2501 running IOS 12.0(27), and Quagga 0.96.4 with
local patches. Patches for tcpdump to validate TCP-MD5 sessions are also
available from me upon request.
Sponsored by: sentex.net
2004-02-11 04:26:04 +00:00
|
|
|
#include "opt_inet.h"
|
1999-12-07 17:39:16 +00:00
|
|
|
#include "opt_inet6.h"
|
1997-09-16 18:36:06 +00:00
|
|
|
#include "opt_tcpdebug.h"
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
1999-08-30 21:17:07 +00:00
|
|
|
#include <sys/callout.h>
|
1995-11-09 20:23:09 +00:00
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/sysctl.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/malloc.h>
|
|
|
|
#include <sys/mbuf.h>
|
2006-11-06 13:42:10 +00:00
|
|
|
#include <sys/priv.h>
|
1999-07-11 18:32:46 +00:00
|
|
|
#include <sys/proc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
2017-10-02 09:45:17 +00:00
|
|
|
#ifndef INVARIANTS
|
|
|
|
#include <sys/syslog.h>
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/protosw.h>
|
2001-04-17 18:08:01 +00:00
|
|
|
#include <sys/random.h>
|
1998-03-28 10:18:26 +00:00
|
|
|
|
2002-03-20 05:48:55 +00:00
|
|
|
#include <vm/uma.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <net/route.h>
|
|
|
|
#include <net/if.h>
|
2013-10-26 17:58:36 +00:00
|
|
|
#include <net/if_var.h>
|
2009-08-01 19:26:27 +00:00
|
|
|
#include <net/vnet.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <netinet/in.h>
|
2018-07-30 20:13:38 +00:00
|
|
|
#include <netinet/in_kdtrace.h>
|
2011-04-30 11:21:29 +00:00
|
|
|
#include <netinet/in_pcb.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/in_systm.h>
|
2011-04-30 11:21:29 +00:00
|
|
|
#include <netinet/in_var.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/ip.h>
|
2011-04-30 11:21:29 +00:00
|
|
|
#include <netinet/ip_icmp.h>
|
|
|
|
#include <netinet/ip_var.h>
|
2000-01-09 19:17:30 +00:00
|
|
|
#ifdef INET6
|
|
|
|
#include <netinet/ip6.h>
|
|
|
|
#include <netinet6/in6_pcb.h>
|
|
|
|
#include <netinet6/ip6_var.h>
|
2005-07-25 12:31:43 +00:00
|
|
|
#include <netinet6/scope6_var.h>
|
2003-11-20 20:07:39 +00:00
|
|
|
#include <netinet6/nd6.h>
|
2000-01-09 19:17:30 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/tcp.h>
|
|
|
|
#include <netinet/tcp_fsm.h>
|
|
|
|
#include <netinet/tcp_seq.h>
|
|
|
|
#include <netinet/tcp_timer.h>
|
|
|
|
#include <netinet/tcp_var.h>
|
2000-01-09 19:17:30 +00:00
|
|
|
#ifdef INET6
|
|
|
|
#include <netinet6/tcp6_var.h>
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <netinet/tcpip.h>
|
1995-02-09 23:13:27 +00:00
|
|
|
#ifdef TCPDEBUG
|
|
|
|
#include <netinet/tcp_debug.h>
|
|
|
|
#endif
|
2011-04-30 11:21:29 +00:00
|
|
|
#ifdef INET6
|
2000-01-09 19:17:30 +00:00
|
|
|
#include <netinet6/ip6protosw.h>
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2000-01-09 19:17:30 +00:00
|
|
|
|
2000-03-27 19:14:27 +00:00
|
|
|
#include <machine/in_cksum.h>
|
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(uma_zone_t, tcptw_zone);
|
2014-04-11 21:00:59 +00:00
|
|
|
#define V_tcptw_zone VNET(tcptw_zone)
|
2007-05-11 21:17:53 +00:00
|
|
|
static int maxtcptw;
|
2004-04-20 06:33:39 +00:00
|
|
|
|
2007-05-16 17:14:25 +00:00
|
|
|
/*
|
|
|
|
* The timed wait queue contains references to each of the TCP sessions
|
|
|
|
* currently in the TIME_WAIT state. The queue pointers, including the
|
|
|
|
* queue pointers in each tcptw structure, are protected using the global
|
2014-04-10 18:15:35 +00:00
|
|
|
* timewait lock, which must be held over queue iteration and modification.
|
2014-10-30 08:53:56 +00:00
|
|
|
*
|
|
|
|
* Rules on tcptw usage:
|
|
|
|
* - an inpcb is always freed _after_ its tcptw
|
|
|
|
* - a tcptw relies on its inpcb reference counting for memory stability
|
|
|
|
* - a tcptw is dereferenceable only while its inpcb is locked
|
2007-05-16 17:14:25 +00:00
|
|
|
*/
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(TAILQ_HEAD(, tcptw), twq_2msl);
|
2014-04-11 21:00:59 +00:00
|
|
|
#define V_twq_2msl VNET(twq_2msl)
|
2007-05-16 17:14:25 +00:00
|
|
|
|
2014-04-10 18:15:35 +00:00
|
|
|
/* Global timewait lock */
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(struct rwlock, tw_lock);
|
2014-04-11 21:00:59 +00:00
|
|
|
#define V_tw_lock VNET(tw_lock)
|
|
|
|
|
|
|
|
/*
 * Thin wrappers over the rw_*() lock primitives for the timewait lock.
 * Every macro takes the lock object itself (not a pointer to it); the
 * address is taken inside the expansion.
 */

/* Lifecycle: initialise with a description string / tear down. */
#define	TW_LOCK_INIT(lk, name)	rw_init_flags(&(lk), (name), 0)
#define	TW_LOCK_DESTROY(lk)	rw_destroy(&(lk))

/* Acquire in read or write mode. */
#define	TW_RLOCK(lk)		rw_rlock(&(lk))
#define	TW_WLOCK(lk)		rw_wlock(&(lk))

/* Release a read or write hold. */
#define	TW_RUNLOCK(lk)		rw_runlock(&(lk))
#define	TW_WUNLOCK(lk)		rw_wunlock(&(lk))

/* Assertions on the current lock state. */
#define	TW_LOCK_ASSERT(lk)	rw_assert(&(lk), RA_LOCKED)
#define	TW_RLOCK_ASSERT(lk)	rw_assert(&(lk), RA_RLOCKED)
#define	TW_WLOCK_ASSERT(lk)	rw_assert(&(lk), RA_WLOCKED)
#define	TW_UNLOCK_ASSERT(lk)	rw_assert(&(lk), RA_UNLOCKED)
|
|
|
|
|
|
|
|
static void tcp_tw_2msl_reset(struct tcptw *, int);
|
|
|
|
static void tcp_tw_2msl_stop(struct tcptw *, int);
|
2014-05-24 14:01:18 +00:00
|
|
|
static int tcp_twrespond(struct tcptw *, int);
|
2014-04-10 18:15:35 +00:00
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
static int
|
|
|
|
tcptw_auto_size(void)
|
We currently does not react to ICMP administratively prohibited
messages send by routers when they deny our traffic, this causes
a timeout when trying to connect to TCP ports/services on a remote
host, which is blocked by routers or firewalls.
rfc1122 (Requirements for Internet Hosts) section 3.2.2.1 actually
requi re that we treat such a message for a TCP session, that we
treat it like if we had recieved a RST.
quote begin.
A Destination Unreachable message that is received MUST be
reported to the transport layer. The transport layer SHOULD
use the information appropriately; for example, see Sections
4.1.3.3, 4.2.3.9, and 4.2.4 below. A transport protocol
that has its own mechanism for notifying the sender that a
port is unreachable (e.g., TCP, which sends RST segments)
MUST nevertheless accept an ICMP Port Unreachable for the
same purpose.
quote end.
I've written a small extension that implement this, it also create
a sysctl "net.inet.tcp.icmp_admin_prohib_like_rst" to control if
this new behaviour is activated.
When it's activated (set to 1) we'll treat a ICMP administratively
prohibited message (icmp type 3 code 9, 10 and 13) for a TCP
sessions, as if we recived a TCP RST, but only if the TCP session
is in SYN_SENT state.
The reason for only reacting when in SYN_SENT state, is that this
will solve the problem, and at the same time minimize the risk of
this being abused.
I suggest that we enable this new behaviour by default, but it
would be a change of current behaviour, so if people prefer to
leave it disabled by default, at least for now, this would be ok
for me, the attached diff actually have the sysctl set to 0 by
default.
PR: 23086
Submitted by: Jesper Skriver <jesper@skriver.dk>
2000-12-16 19:42:06 +00:00
|
|
|
{
|
2007-05-11 21:17:53 +00:00
|
|
|
int halfrange;
|
2005-06-01 12:06:07 +00:00
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
/*
|
|
|
|
* Max out at half the ephemeral port range so that TIME_WAIT
|
|
|
|
* sockets don't tie up too many ephemeral ports.
|
|
|
|
*/
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
if (V_ipport_lastauto > V_ipport_firstauto)
|
|
|
|
halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
|
2006-04-03 14:07:50 +00:00
|
|
|
else
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
|
2007-05-11 21:17:53 +00:00
|
|
|
/* Protect against goofy port ranges smaller than 32. */
|
|
|
|
return (imin(imax(halfrange, 32), maxsockets / 5));
|
2001-02-18 09:34:55 +00:00
|
|
|
}
|
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
static int
|
|
|
|
sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
|
1995-09-18 15:51:40 +00:00
|
|
|
{
|
2007-05-11 21:17:53 +00:00
|
|
|
int error, new;
|
2006-04-03 14:07:50 +00:00
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
if (maxtcptw == 0)
|
|
|
|
new = tcptw_auto_size();
|
2006-04-03 14:07:50 +00:00
|
|
|
else
|
2007-05-11 21:17:53 +00:00
|
|
|
new = maxtcptw;
|
2007-06-04 18:25:08 +00:00
|
|
|
error = sysctl_handle_int(oidp, &new, 0, req);
|
2007-05-11 21:17:53 +00:00
|
|
|
if (error == 0 && req->newptr)
|
|
|
|
if (new >= 32) {
|
|
|
|
maxtcptw = new;
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
uma_zone_set_max(V_tcptw_zone, maxtcptw);
|
2007-05-11 21:17:53 +00:00
|
|
|
}
|
|
|
|
return (error);
|
1995-09-18 15:51:40 +00:00
|
|
|
}
|
2007-05-13 22:16:13 +00:00
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
|
|
|
|
&maxtcptw, 0, sysctl_maxtcptw, "IU",
|
|
|
|
"Maximum number of compressed TCP TIME_WAIT entries");
|
|
|
|
|
2018-07-24 16:35:52 +00:00
|
|
|
VNET_DEFINE_STATIC(int, nolocaltimewait) = 0;
|
2010-04-29 11:52:42 +00:00
|
|
|
#define V_nolocaltimewait VNET(nolocaltimewait)
|
2014-11-07 09:39:05 +00:00
|
|
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
|
Build on Jeff Roberson's linker-set based dynamic per-CPU allocator
(DPCPU), as suggested by Peter Wemm, and implement a new per-virtual
network stack memory allocator. Modify vnet to use the allocator
instead of monolithic global container structures (vinet, ...). This
change solves many binary compatibility problems associated with
VIMAGE, and restores ELF symbols for virtualized global variables.
Each virtualized global variable exists as a "reference copy", and also
once per virtual network stack. Virtualized global variables are
tagged at compile-time, placing the in a special linker set, which is
loaded into a contiguous region of kernel memory. Virtualized global
variables in the base kernel are linked as normal, but those in modules
are copied and relocated to a reserved portion of the kernel's vnet
region with the help of a the kernel linker.
Virtualized global variables exist in per-vnet memory set up when the
network stack instance is created, and are initialized statically from
the reference copy. Run-time access occurs via an accessor macro, which
converts from the current vnet and requested symbol to a per-vnet
address. When "options VIMAGE" is not compiled into the kernel, normal
global ELF symbols will be used instead and indirection is avoided.
This change restores static initialization for network stack global
variables, restores support for non-global symbols and types, eliminates
the need for many subsystem constructors, eliminates large per-subsystem
structures that caused many binary compatibility issues both for
monitoring applications (netstat) and kernel modules, removes the
per-function INIT_VNET_*() macros throughout the stack, eliminates the
need for vnet_symmap ksym(2) munging, and eliminates duplicate
definitions of virtualized globals under VIMAGE_GLOBALS.
Bump __FreeBSD_version and update UPDATING.
Portions submitted by: bz
Reviewed by: bz, zec
Discussed with: gnn, jamie, jeff, jhb, julian, sam
Suggested by: peter
Approved by: re (kensmith)
2009-07-14 22:48:30 +00:00
|
|
|
&VNET_NAME(nolocaltimewait), 0,
|
2007-05-11 21:17:53 +00:00
|
|
|
"Do not create compressed TCP TIME_WAIT entries for local connections");
|
1995-09-18 15:51:40 +00:00
|
|
|
|
2007-05-13 22:16:13 +00:00
|
|
|
void
|
|
|
|
tcp_tw_zone_change(void)
|
2000-01-09 19:17:30 +00:00
|
|
|
{
|
2003-11-20 20:07:39 +00:00
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
if (maxtcptw == 0)
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
|
2000-01-09 19:17:30 +00:00
|
|
|
}
|
|
|
|
|
2007-05-11 21:17:53 +00:00
|
|
|
void
|
2007-05-13 22:16:13 +00:00
|
|
|
tcp_tw_init(void)
|
2000-01-09 19:17:30 +00:00
|
|
|
{
|
|
|
|
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
|
2016-06-23 00:32:58 +00:00
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
|
2007-05-11 21:17:53 +00:00
|
|
|
TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
|
|
|
|
if (maxtcptw == 0)
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
|
2007-05-11 21:17:53 +00:00
|
|
|
else
|
First pass at separating per-vnet initializer functions
from existing functions for initializing global state.
At this stage, the new per-vnet initializer functions are
directly called from the existing global initialization code,
which should in most cases result in compiler inlining those
new functions, hence yielding a near-zero functional change.
Modify the existing initializer functions which are invoked via
protosw, like ip_init() et. al., to allow them to be invoked
multiple times, i.e. per each vnet. Global state, if any,
is initialized only if such functions are called within the
context of vnet0, which will be determined via the
IS_DEFAULT_VNET(curvnet) check (currently always true).
While here, V_irtualize a few remaining global UMA zones
used by net/netinet/netipsec networking code. While it is
not yet clear to me or anybody else whether this is the right
thing to do, at this stage this makes the code more readable,
and makes it easier to track uncollected UMA-zone-backed
objects on vnet removal. In the long run, it's quite possible
that some form of shared use of UMA zone pools among multiple
vnets should be considered.
Bump __FreeBSD_version due to changes in layout of structs
vnet_ipfw, vnet_inet and vnet_net.
Approved by: julian (mentor)
2009-04-06 22:29:41 +00:00
|
|
|
uma_zone_set_max(V_tcptw_zone, maxtcptw);
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
TAILQ_INIT(&V_twq_2msl);
|
2014-04-10 18:15:35 +00:00
|
|
|
TW_LOCK_INIT(V_tw_lock, "tcptw");
|
2000-01-09 19:17:30 +00:00
|
|
|
}
|
|
|
|
|
Introduce an infrastructure for dismantling vnet instances.
Vnet modules and protocol domains may now register destructor
functions to clean up and release per-module state. The destructor
mechanisms can be triggered by invoking "vimage -d", or a future
equivalent command which will be provided via the new jail framework.
While this patch introduces numerous placeholder destructor functions,
many of those are currently incomplete, thus leaking memory or (even
worse) failing to stop all running timers. Many of such issues are
already known and will be incrementaly fixed over the next weeks in
smaller incremental commits.
Apart from introducing new fields in structs ifnet, domain, protosw
and vnet_net, which requires the kernel and modules to be rebuilt, this
change should have no impact on nooptions VIMAGE builds, since vnet
destructors can only be called in VIMAGE kernels. Moreover,
destructor functions should be in general compiled in only in
options VIMAGE builds, except for kernel modules which can be safely
kldunloaded at run time.
Bump __FreeBSD_version to 800097.
Reviewed by: bz, julian
Approved by: rwatson, kib (re), julian (mentor)
2009-06-08 17:15:40 +00:00
|
|
|
#ifdef VIMAGE
/*
 * Tear down the timewait state (VIMAGE builds only): close every entry
 * still on the 2MSL queue, then destroy the timewait lock and the tcptw
 * zone.
 */
void
tcp_tw_destroy(void)
{
	struct epoch_tracker et;
	struct tcptw *tw;

	/* The drain runs between INP_INFO_RLOCK_ET and INP_INFO_RUNLOCK_ET. */
	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	for (;;) {
		tw = TAILQ_FIRST(&V_twq_2msl);
		if (tw == NULL)
			break;
		/*
		 * NOTE(review): termination relies on tcp_twclose() taking
		 * the entry off V_twq_2msl -- confirm against its definition.
		 */
		tcp_twclose(tw, 0);
	}
	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);

	TW_LOCK_DESTROY(V_tw_lock);
	uma_zdestroy(V_tcptw_zone);
}
#endif
|
|
|
|
|
2003-02-19 22:32:43 +00:00
|
|
|
/*
|
|
|
|
* Move a TCP connection into TIME_WAIT state.
|
2004-12-05 22:27:53 +00:00
|
|
|
* tcbinfo is locked.
|
2003-02-19 22:32:43 +00:00
|
|
|
* inp is locked, and is unlocked before returning.
|
|
|
|
*/
|
|
|
|
void
|
2006-04-03 12:59:27 +00:00
|
|
|
tcp_twstart(struct tcpcb *tp)
|
2003-02-19 22:32:43 +00:00
|
|
|
{
|
2018-03-21 20:59:30 +00:00
|
|
|
struct tcptw twlocal, *tw;
|
2006-09-08 13:09:15 +00:00
|
|
|
struct inpcb *inp = tp->t_inpcb;
|
2003-02-19 22:32:43 +00:00
|
|
|
struct socket *so;
|
2018-07-30 21:13:42 +00:00
|
|
|
uint32_t recwin;
|
2018-03-21 20:59:30 +00:00
|
|
|
bool acknow, local;
|
2011-04-30 11:21:29 +00:00
|
|
|
#ifdef INET6
|
2018-03-21 20:59:30 +00:00
|
|
|
bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2003-02-19 22:32:43 +00:00
|
|
|
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
2006-09-08 13:09:15 +00:00
|
|
|
|
2016-10-18 07:16:49 +00:00
|
|
|
/* A dropped inp should never transition to TIME_WAIT state. */
|
|
|
|
KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: "
|
|
|
|
"(inp->inp_flags & INP_DROPPED) != 0"));
|
|
|
|
|
2011-04-30 11:21:29 +00:00
|
|
|
if (V_nolocaltimewait) {
|
|
|
|
#ifdef INET6
|
|
|
|
if (isipv6)
|
2018-03-21 20:59:30 +00:00
|
|
|
local = in6_localaddr(&inp->in6p_faddr);
|
2011-04-30 11:21:29 +00:00
|
|
|
else
|
|
|
|
#endif
|
|
|
|
#ifdef INET
|
2018-03-21 20:59:30 +00:00
|
|
|
local = in_localip(inp->inp_faddr);
|
2018-03-22 05:07:57 +00:00
|
|
|
#else
|
|
|
|
local = false;
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2018-03-21 20:59:30 +00:00
|
|
|
} else
|
|
|
|
local = false;
|
2015-05-01 12:49:03 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For use only by DTrace. We do not reference the state
|
|
|
|
* after this point so modifying it in place is not a problem.
|
|
|
|
*/
|
|
|
|
tcp_state_change(tp, TCPS_TIME_WAIT);
|
|
|
|
|
2018-03-21 20:59:30 +00:00
|
|
|
if (local)
|
|
|
|
tw = &twlocal;
|
|
|
|
else
|
|
|
|
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
|
2003-03-08 22:06:20 +00:00
|
|
|
if (tw == NULL) {
|
2014-08-04 19:42:48 +00:00
|
|
|
/*
|
|
|
|
* Reached limit on total number of TIMEWAIT connections
|
|
|
|
* allowed. Remove a connection from TIMEWAIT queue in LRU
|
|
|
|
* fashion to make room for this connection.
|
2014-10-30 08:53:56 +00:00
|
|
|
*
|
2015-08-03 12:13:54 +00:00
|
|
|
* XXX: Check if it possible to always have enough room
|
|
|
|
* in advance based on guarantees provided by uma_zalloc().
|
2014-08-04 19:42:48 +00:00
|
|
|
*/
|
2014-10-30 08:53:56 +00:00
|
|
|
tw = tcp_tw_2msl_scan(1);
|
2003-03-08 22:06:20 +00:00
|
|
|
if (tw == NULL) {
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
tp = tcp_close(tp);
|
|
|
|
if (tp != NULL)
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2003-03-08 22:06:20 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
/*
|
2018-03-21 20:59:30 +00:00
|
|
|
* For !local case the tcptw will hold a reference on its inpcb
|
|
|
|
* until tcp_twclose is called.
|
2014-10-30 08:53:56 +00:00
|
|
|
*/
|
2003-02-19 22:32:43 +00:00
|
|
|
tw->tw_inpcb = inp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recover last window size sent.
|
|
|
|
*/
|
2018-07-30 21:13:42 +00:00
|
|
|
so = inp->inp_socket;
|
|
|
|
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
|
|
|
|
(long)TCP_MAXWIN << tp->rcv_scale);
|
|
|
|
if (recwin < (so->so_rcv.sb_hiwat / 4) &&
|
|
|
|
recwin < tp->t_maxseg)
|
|
|
|
recwin = 0;
|
|
|
|
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
|
|
|
|
recwin < (tp->rcv_adv - tp->rcv_nxt))
|
|
|
|
recwin = (tp->rcv_adv - tp->rcv_nxt);
|
|
|
|
tw->last_win = htons((u_short)(recwin >> tp->rcv_scale));
|
2003-02-19 22:32:43 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set t_recent if timestamps are used on the connection.
|
|
|
|
*/
|
2004-08-16 18:32:07 +00:00
|
|
|
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
|
2007-05-11 18:29:39 +00:00
|
|
|
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
|
2003-02-19 22:32:43 +00:00
|
|
|
tw->t_recent = tp->ts_recent;
|
2007-05-11 18:29:39 +00:00
|
|
|
tw->ts_offset = tp->ts_offset;
|
|
|
|
} else {
|
2003-02-19 22:32:43 +00:00
|
|
|
tw->t_recent = 0;
|
2007-05-11 18:29:39 +00:00
|
|
|
tw->ts_offset = 0;
|
|
|
|
}
|
2003-02-19 22:32:43 +00:00
|
|
|
|
|
|
|
tw->snd_nxt = tp->snd_nxt;
|
|
|
|
tw->rcv_nxt = tp->rcv_nxt;
|
2003-11-01 07:30:08 +00:00
|
|
|
tw->iss = tp->iss;
|
2003-11-02 07:47:03 +00:00
|
|
|
tw->irs = tp->irs;
|
2003-02-19 22:32:43 +00:00
|
|
|
tw->t_starttime = tp->t_starttime;
|
2003-03-08 22:06:20 +00:00
|
|
|
tw->tw_time = 0;
|
2003-02-19 22:32:43 +00:00
|
|
|
|
|
|
|
/* XXX
|
|
|
|
* If this code will
|
|
|
|
* be used for fin-wait-2 state also, then we may need
|
|
|
|
* a ts_recent from the last segment.
|
|
|
|
*/
|
2004-11-02 22:22:22 +00:00
|
|
|
acknow = tp->t_flags & TF_ACKNOW;
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* First, discard tcpcb state, which includes stopping its timers and
|
|
|
|
* freeing it. tcp_discardcb() used to also release the inpcb, but
|
|
|
|
* that work is now done in the caller.
|
2006-08-02 16:18:05 +00:00
|
|
|
*
|
|
|
|
* Note: soisdisconnected() call used to be made in tcp_discardcb(),
|
|
|
|
* and might not be needed here any longer.
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
*/
|
2003-02-19 22:32:43 +00:00
|
|
|
tcp_discardcb(tp);
|
2006-08-02 16:18:05 +00:00
|
|
|
soisdisconnected(so);
|
2003-02-19 22:32:43 +00:00
|
|
|
tw->tw_so_options = so->so_options;
|
2018-03-21 20:59:30 +00:00
|
|
|
inp->inp_flags |= INP_TIMEWAIT;
|
2004-06-12 20:47:32 +00:00
|
|
|
if (acknow)
|
|
|
|
tcp_twrespond(tw, TH_ACK);
|
2018-03-21 20:59:30 +00:00
|
|
|
if (local)
|
|
|
|
in_pcbdrop(inp);
|
|
|
|
else {
|
|
|
|
in_pcbref(inp); /* Reference from tw */
|
|
|
|
tw->tw_cred = crhold(so->so_cred);
|
|
|
|
inp->inp_ppcb = tw;
|
|
|
|
TCPSTATES_INC(TCPS_TIME_WAIT);
|
|
|
|
tcp_tw_2msl_reset(tw, 0);
|
|
|
|
}
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the inpcb owns the sole reference to the socket, then we can
|
|
|
|
* detach and free the socket as it is not needed in time wait.
|
|
|
|
*/
|
2009-03-15 09:58:31 +00:00
|
|
|
if (inp->inp_flags & INP_SOCKREF) {
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
KASSERT(so->so_state & SS_PROTOREF,
|
|
|
|
("tcp_twstart: !SS_PROTOREF"));
|
2009-03-15 09:58:31 +00:00
|
|
|
inp->inp_flags &= ~INP_SOCKREF;
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significnatly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires is to further address how to
handle the timer shutdown shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
so->so_state &= ~SS_PROTOREF;
|
|
|
|
sofree(so);
|
|
|
|
} else
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2003-02-19 22:32:43 +00:00
|
|
|
}
|
|
|
|
|
2007-05-16 17:14:25 +00:00
|
|
|
/*
|
|
|
|
* Returns 1 if the TIME_WAIT state was killed and we should start over,
|
|
|
|
* looking for a pcb in the listen state. Returns 0 otherwise.
|
|
|
|
*/
|
|
|
|
int
|
2014-05-30 22:34:06 +00:00
|
|
|
tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
|
2007-05-16 17:14:25 +00:00
|
|
|
struct mbuf *m, int tlen)
|
|
|
|
{
|
|
|
|
struct tcptw *tw;
|
|
|
|
int thflags;
|
|
|
|
tcp_seq seq;
|
|
|
|
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
2007-05-16 17:14:25 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXXRW: Time wait state for inpcb has been recycled, but inpcb is
|
|
|
|
* still present. This is undesirable, but temporarily necessary
|
|
|
|
* until we work out how to handle inpcb's who's timewait state has
|
|
|
|
* been removed.
|
|
|
|
*/
|
|
|
|
tw = intotw(inp);
|
|
|
|
if (tw == NULL)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
thflags = th->th_flags;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NOTE: for FIN_WAIT_2 (to be added later),
|
|
|
|
* must validate sequence number before accepting RST
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the segment contains RST:
|
|
|
|
* Drop the segment - see Stevens, vol. 2, p. 964 and
|
|
|
|
* RFC 1337.
|
|
|
|
*/
|
|
|
|
if (thflags & TH_RST)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
/* PAWS not needed at the moment */
|
|
|
|
/*
|
|
|
|
* RFC 1323 PAWS: If we have a timestamp reply on this segment
|
|
|
|
* and it's less than ts_recent, drop it.
|
|
|
|
*/
|
|
|
|
if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
|
|
|
|
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
|
|
|
|
if ((thflags & TH_ACK) == 0)
|
|
|
|
goto drop;
|
|
|
|
goto ack;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* ts_recent is never updated because we never accept new segments.
|
|
|
|
*/
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If a new connection request is received
|
|
|
|
* while in TIME_WAIT, drop the old connection
|
|
|
|
* and start over if the sequence numbers
|
|
|
|
* are above the previous ones.
|
|
|
|
*/
|
|
|
|
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
|
|
|
|
tcp_twclose(tw, 0);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-02-21 09:01:34 +00:00
|
|
|
* Drop the segment if it does not contain an ACK.
|
2007-05-16 17:14:25 +00:00
|
|
|
*/
|
|
|
|
if ((thflags & TH_ACK) == 0)
|
|
|
|
goto drop;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reset the 2MSL timer if this is a duplicate FIN.
|
|
|
|
*/
|
|
|
|
if (thflags & TH_FIN) {
|
|
|
|
seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
|
|
|
|
if (seq + 1 == tw->rcv_nxt)
|
|
|
|
tcp_tw_2msl_reset(tw, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Acknowledge the segment if it has data or is not a duplicate ACK.
|
|
|
|
*/
|
|
|
|
if (thflags != TH_ACK || tlen != 0 ||
|
2018-07-30 20:13:38 +00:00
|
|
|
th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) {
|
|
|
|
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
|
2007-05-16 17:14:25 +00:00
|
|
|
tcp_twrespond(tw, TH_ACK);
|
2018-07-30 20:13:38 +00:00
|
|
|
goto dropnoprobe;
|
|
|
|
}
|
2007-05-16 17:14:25 +00:00
|
|
|
drop:
|
2018-07-30 20:13:38 +00:00
|
|
|
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
|
|
|
|
dropnoprobe:
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2007-05-16 17:14:25 +00:00
|
|
|
m_freem(m);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true if an inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it led to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
void
|
2003-03-08 22:06:20 +00:00
|
|
|
tcp_twclose(struct tcptw *tw, int reuse)
|
2003-02-19 22:32:43 +00:00
|
|
|
{
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true if an inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it led to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
struct socket *so;
|
2003-02-19 22:32:43 +00:00
|
|
|
struct inpcb *inp;
|
|
|
|
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
/*
|
2006-04-01 23:53:25 +00:00
|
|
|
* At this point, we are in one of two situations:
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
*
|
2006-07-21 17:11:15 +00:00
|
|
|
* (1) We have no socket, just an inpcb<->twtcp pair. We can free
|
|
|
|
* all state.
|
2006-04-01 23:53:25 +00:00
|
|
|
*
|
2006-07-21 17:11:15 +00:00
|
|
|
* (2) We have a socket -- if we own a reference, release it and
|
|
|
|
* notify the socket layer.
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
*/
|
2003-02-19 22:32:43 +00:00
|
|
|
inp = tw->tw_inpcb;
|
2009-03-15 09:58:31 +00:00
|
|
|
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
|
2006-04-03 13:33:55 +00:00
|
|
|
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
2004-11-23 16:23:13 +00:00
|
|
|
|
2014-04-10 18:15:35 +00:00
|
|
|
tcp_tw_2msl_stop(tw, reuse);
|
2003-02-19 22:32:43 +00:00
|
|
|
inp->inp_ppcb = NULL;
|
2006-04-25 11:17:35 +00:00
|
|
|
in_pcbdrop(inp);
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
|
|
|
|
so = inp->inp_socket;
|
2006-04-01 23:53:25 +00:00
|
|
|
if (so != NULL) {
|
2006-07-21 17:11:15 +00:00
|
|
|
/*
|
|
|
|
* If there's a socket, handle two cases: first, we own a
|
|
|
|
* strong reference, which we will now release, or we don't
|
|
|
|
* in which case another reference exists (XXXRW: think
|
|
|
|
* about this more), and we don't need to take action.
|
|
|
|
*/
|
2009-03-15 09:58:31 +00:00
|
|
|
if (inp->inp_flags & INP_SOCKREF) {
|
|
|
|
inp->inp_flags &= ~INP_SOCKREF;
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
2006-04-01 23:53:25 +00:00
|
|
|
SOCK_LOCK(so);
|
|
|
|
KASSERT(so->so_state & SS_PROTOREF,
|
|
|
|
("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
|
|
|
|
so->so_state &= ~SS_PROTOREF;
|
|
|
|
sofree(so);
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
} else {
|
2006-04-01 23:53:25 +00:00
|
|
|
/*
|
|
|
|
* If we don't own the only reference, the socket and
|
|
|
|
* inpcb need to be left around to be handled by
|
|
|
|
* tcp_usr_detach() later.
|
|
|
|
*/
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WUNLOCK(inp);
|
Update TCP for infrastructural changes to the socket/pcb refcount model,
pru_abort(), pru_detach(), and in_pcbdetach():
- Universally support and enforce the invariant that so_pcb is
never NULL, converting dozens of unnecessary NULL checks into
assertions, and eliminating dozens of unnecessary error handling
cases in protocol code.
- In some cases, eliminate unnecessary pcbinfo locking, as it is no
longer required to ensure so_pcb != NULL. For example, the receive
code no longer requires the pcbinfo lock, and the send code only
requires it if building a new connection on an otherwise unconnected
socket triggered via sendto() with an address. This should
significantly reduce tcbinfo lock contention in the receive and send
cases.
- In order to support the invariant that so_pcb != NULL, it is now
necessary for the TCP code to not discard the tcpcb any time a
connection is dropped, but instead leave the tcpcb until the socket
is shutdown. This case is handled by setting INP_DROPPED, to
substitute for using a NULL so_pcb to indicate that the connection
has been dropped. This requires the inpcb lock, but not the pcbinfo
lock.
- Unlike all other protocols in the tree, TCP may need to retain access
to the socket after the file descriptor has been closed. Set
SS_PROTOREF in tcp_detach() in order to prevent the socket from being
freed, and add a flag, INP_SOCKREF, so that the TCP code knows whether
or not it needs to free the socket when the connection finally does
close. The typical case where this occurs is if close() is called on
a TCP socket before all sent data in the send socket buffer has been
transmitted or acknowledged. If INP_SOCKREF is found when the
connection is dropped, we release the inpcb, tcpcb, and socket instead
of flagging INP_DROPPED.
- Abort and detach protocol switch methods no longer return failures,
nor attempt to free sockets, as the socket layer does this.
- Annotate the existence of a long-standing race in the TCP timer code,
in which timers are stopped but not drained when the socket is freed,
as waiting for drain may lead to deadlocks, or have to occur in a
context where waiting is not permitted. This race has been handled
by testing to see if the tcpcb pointer in the inpcb is NULL (and vice
versa), which is not normally permitted, but may be true of a inpcb
and tcpcb have been freed. Add a counter to test how often this race
has actually occurred, and a large comment for each instance where
we compare potentially freed memory with NULL. This will have to be
fixed in the near future, but requires us to further address how to
handle the timer shutdown issue.
- Several TCP calls no longer potentially free the passed inpcb/tcpcb,
so no longer need to return a pointer to indicate whether the argument
passed in is still valid.
- Un-macroize debugging and locking setup for various protocol switch
methods for TCP, as it lead to more obscurity, and as locking becomes
more customized to the methods, offers less benefit.
- Assert copyright on tcp_usrreq.c due to significant modifications that
have been made as part of this work.
These changes significantly modify the memory management and connection
logic of our TCP implementation, and are (as such) High Risk Changes,
and likely to contain serious bugs. Please report problems to the
current@ mailing list ASAP, ideally with simple test cases, and
optionally, packet traces.
MFC after: 3 months
2006-04-01 16:36:36 +00:00
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* The socket has been already cleaned-up for us, only free the
|
|
|
|
* inpcb.
|
|
|
|
*/
|
2008-11-27 12:04:35 +00:00
|
|
|
in_pcbfree(inp);
|
2014-10-30 08:53:56 +00:00
|
|
|
}
|
2009-04-11 22:07:19 +00:00
|
|
|
TCPSTAT_INC(tcps_closed);
|
2003-02-19 22:32:43 +00:00
|
|
|
}
|
|
|
|
|
2014-05-24 14:01:18 +00:00
|
|
|
static int
|
2004-02-28 15:12:20 +00:00
|
|
|
tcp_twrespond(struct tcptw *tw, int flags)
|
2003-02-19 22:32:43 +00:00
|
|
|
{
|
|
|
|
struct inpcb *inp = tw->tw_inpcb;
|
2011-04-30 11:21:29 +00:00
|
|
|
#if defined(INET6) || defined(INET)
|
|
|
|
struct tcphdr *th = NULL;
|
|
|
|
#endif
|
2003-02-19 22:32:43 +00:00
|
|
|
struct mbuf *m;
|
2011-04-30 11:21:29 +00:00
|
|
|
#ifdef INET
|
2003-02-19 22:32:43 +00:00
|
|
|
struct ip *ip = NULL;
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2003-02-19 22:32:43 +00:00
|
|
|
u_int hdrlen, optlen;
|
2011-04-30 11:21:29 +00:00
|
|
|
int error = 0; /* Keep compiler happy */
|
2007-04-18 18:14:39 +00:00
|
|
|
struct tcpopt to;
|
2003-02-19 22:32:43 +00:00
|
|
|
#ifdef INET6
|
|
|
|
struct ip6_hdr *ip6 = NULL;
|
2008-12-17 12:52:34 +00:00
|
|
|
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
|
2003-02-19 22:32:43 +00:00
|
|
|
#endif
|
2012-11-10 10:41:00 +00:00
|
|
|
hdrlen = 0; /* Keep compiler happy */
|
2003-02-19 22:32:43 +00:00
|
|
|
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK_ASSERT(inp);
|
2004-11-23 16:23:13 +00:00
|
|
|
|
2012-12-05 08:04:20 +00:00
|
|
|
m = m_gethdr(M_NOWAIT, MT_DATA);
|
2003-02-19 22:32:43 +00:00
|
|
|
if (m == NULL)
|
|
|
|
return (ENOBUFS);
|
|
|
|
m->m_data += max_linkhdr;
|
|
|
|
|
2003-05-07 05:26:27 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_inpcb_create_mbuf(inp, m);
|
2003-05-07 05:26:27 +00:00
|
|
|
#endif
|
|
|
|
|
2003-02-19 23:43:04 +00:00
|
|
|
#ifdef INET6
|
2003-02-19 22:32:43 +00:00
|
|
|
if (isipv6) {
|
|
|
|
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
|
|
|
|
ip6 = mtod(m, struct ip6_hdr *);
|
|
|
|
th = (struct tcphdr *)(ip6 + 1);
|
|
|
|
tcpip_fillheaders(inp, ip6, th);
|
2011-04-30 11:21:29 +00:00
|
|
|
}
|
2003-02-19 23:43:04 +00:00
|
|
|
#endif
|
2011-04-30 11:21:29 +00:00
|
|
|
#if defined(INET6) && defined(INET)
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
#ifdef INET
|
2003-02-19 23:43:04 +00:00
|
|
|
{
|
2003-02-19 22:32:43 +00:00
|
|
|
hdrlen = sizeof(struct tcpiphdr);
|
|
|
|
ip = mtod(m, struct ip *);
|
|
|
|
th = (struct tcphdr *)(ip + 1);
|
|
|
|
tcpip_fillheaders(inp, ip, th);
|
|
|
|
}
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2007-04-18 18:14:39 +00:00
|
|
|
to.to_flags = 0;
|
2004-08-16 18:32:07 +00:00
|
|
|
|
|
|
|
/*
|
2003-02-19 22:32:43 +00:00
|
|
|
* Send a timestamp and echo-reply if both our side and our peer
|
|
|
|
* have sent timestamps in our SYN's and this is not a RST.
|
2004-08-16 18:32:07 +00:00
|
|
|
*/
|
2003-02-19 22:32:43 +00:00
|
|
|
if (tw->t_recent && flags == TH_ACK) {
|
2007-04-18 18:14:39 +00:00
|
|
|
to.to_flags |= TOF_TS;
|
2012-02-15 16:09:56 +00:00
|
|
|
to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
|
2007-04-18 18:14:39 +00:00
|
|
|
to.to_tsecr = tw->t_recent;
|
2004-08-16 18:32:07 +00:00
|
|
|
}
|
2007-04-18 18:14:39 +00:00
|
|
|
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
|
2003-02-19 22:32:43 +00:00
|
|
|
|
|
|
|
m->m_len = hdrlen + optlen;
|
|
|
|
m->m_pkthdr.len = m->m_len;
|
|
|
|
|
|
|
|
KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
|
|
|
|
|
|
|
|
th->th_seq = htonl(tw->snd_nxt);
|
|
|
|
th->th_ack = htonl(tw->rcv_nxt);
|
|
|
|
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
|
|
|
|
th->th_flags = flags;
|
|
|
|
th->th_win = htons(tw->last_win);
|
|
|
|
|
2012-05-25 02:23:26 +00:00
|
|
|
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
|
2003-02-19 23:43:04 +00:00
|
|
|
#ifdef INET6
|
2003-02-19 22:32:43 +00:00
|
|
|
if (isipv6) {
|
It turns out that too many drivers are not only parsing the L2/3/4
headers for TSO but also for generic checksum offloading. Ideally we
would only have one common function shared amongst all drivers, and
perhaps when updating them for IPv6 we should introduce that.
Eventually we should provide the meta information along with mbufs to
avoid (re-)parsing entirely.
To not break IPv6 (checksums and offload) and to be able to MFC the
changes without risking to hurt 3rd party drivers, duplicate the v4
framework, as other OSes have done as well.
Introduce interface capability flags for TX/RX checksum offload with
IPv6, to allow independent toggling (where possible). Add CSUM_*_IPV6
flags for UDP/TCP over IPv6, and reserve further for SCTP, and IPv6
fragmentation. Define CSUM_DELAY_DATA_IPV6 as we do for legacy IP and
add an alias for CSUM_DATA_VALID_IPV6.
This pretty much brings IPv6 handling in line with IPv4.
TSO is still handled in a different way and not via if_hwassist.
Update ifconfig to allow (un)setting of the new capability flags.
Update loopback to announce the new capabilities and if_hwassist flags.
Individual driver updates will have to follow, as will SCTP.
Reported by: gallatin, dim, ..
Reviewed by: gallatin (glanced at?)
MFC after: 3 days
X-MFC with: r235961,235959,235958
2012-05-28 09:30:13 +00:00
|
|
|
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
|
2012-05-25 02:23:26 +00:00
|
|
|
th->th_sum = in6_cksum_pseudo(ip6,
|
|
|
|
sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0);
|
2003-11-20 20:07:39 +00:00
|
|
|
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
|
2018-07-30 20:13:38 +00:00
|
|
|
TCP_PROBE5(send, NULL, NULL, ip6, NULL, th);
|
2003-11-20 20:07:39 +00:00
|
|
|
error = ip6_output(m, inp->in6p_outputopts, NULL,
|
2003-02-19 22:32:43 +00:00
|
|
|
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
|
2011-04-30 11:21:29 +00:00
|
|
|
}
|
2003-02-19 23:43:04 +00:00
|
|
|
#endif
|
2011-04-30 11:21:29 +00:00
|
|
|
#if defined(INET6) && defined(INET)
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
#ifdef INET
|
2003-02-19 23:43:04 +00:00
|
|
|
{
|
It turns out that too many drivers are not only parsing the L2/3/4
headers for TSO but also for generic checksum offloading. Ideally we
would only have one common function shared amongst all drivers, and
perhaps when updating them for IPv6 we should introduce that.
Eventually we should provide the meta information along with mbufs to
avoid (re-)parsing entirely.
To not break IPv6 (checksums and offload) and to be able to MFC the
changes without risking to hurt 3rd party drivers, duplicate the v4
framework, as other OSes have done as well.
Introduce interface capability flags for TX/RX checksum offload with
IPv6, to allow independent toggling (where possible). Add CSUM_*_IPV6
flags for UDP/TCP over IPv6, and reserve further for SCTP, and IPv6
fragmentation. Define CSUM_DELAY_DATA_IPV6 as we do for legacy IP and
add an alias for CSUM_DATA_VALID_IPV6.
This pretty much brings IPv6 handling in line with IPv4.
TSO is still handled in a different way and not via if_hwassist.
Update ifconfig to allow (un)setting of the new capability flags.
Update loopback to announce the new capabilities and if_hwassist flags.
Individual driver updates will have to follow, as will SCTP.
Reported by: gallatin, dim, ..
Reviewed by: gallatin (glanced at?)
MFC after: 3 days
X-MFC with: r235961,235959,235958
2012-05-28 09:30:13 +00:00
|
|
|
m->m_pkthdr.csum_flags = CSUM_TCP;
|
2003-02-19 22:32:43 +00:00
|
|
|
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
|
2004-08-16 18:32:07 +00:00
|
|
|
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
|
2012-10-22 21:09:03 +00:00
|
|
|
ip->ip_len = htons(m->m_pkthdr.len);
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
if (V_path_mtu_discovery)
|
2012-10-22 21:09:03 +00:00
|
|
|
ip->ip_off |= htons(IP_DF);
|
2018-07-30 20:13:38 +00:00
|
|
|
TCP_PROBE5(send, NULL, NULL, ip, NULL, th);
|
2003-11-20 20:07:39 +00:00
|
|
|
error = ip_output(m, inp->inp_options, NULL,
|
2004-09-05 02:34:12 +00:00
|
|
|
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
|
|
|
|
NULL, inp);
|
2003-02-19 22:32:43 +00:00
|
|
|
}
|
2011-04-30 11:21:29 +00:00
|
|
|
#endif
|
2003-02-19 22:32:43 +00:00
|
|
|
if (flags & TH_ACK)
|
2009-04-11 22:07:19 +00:00
|
|
|
TCPSTAT_INC(tcps_sndacks);
|
2003-02-19 22:32:43 +00:00
|
|
|
else
|
2009-04-11 22:07:19 +00:00
|
|
|
TCPSTAT_INC(tcps_sndctrl);
|
|
|
|
TCPSTAT_INC(tcps_sndtotal);
|
2003-02-19 22:32:43 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2007-05-16 17:14:25 +00:00
|
|
|
|
|
|
|
static void
|
|
|
|
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
|
|
|
|
{
|
|
|
|
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
2008-04-17 21:38:18 +00:00
|
|
|
INP_WLOCK_ASSERT(tw->tw_inpcb);
|
2014-04-10 18:15:35 +00:00
|
|
|
|
|
|
|
TW_WLOCK(V_tw_lock);
|
2007-05-16 17:14:25 +00:00
|
|
|
if (rearm)
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
|
2007-05-16 17:14:25 +00:00
|
|
|
tw->tw_time = ticks + 2 * tcp_msl;
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
|
2014-04-10 18:15:35 +00:00
|
|
|
TW_WUNLOCK(V_tw_lock);
|
2007-05-16 17:14:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2014-04-10 18:15:35 +00:00
|
|
|
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
|
2007-05-16 17:14:25 +00:00
|
|
|
{
|
2014-10-30 08:53:56 +00:00
|
|
|
struct ucred *cred;
|
|
|
|
struct inpcb *inp;
|
2018-05-19 05:56:21 +00:00
|
|
|
int released __unused;
|
2007-05-16 17:14:25 +00:00
|
|
|
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
2014-04-10 18:15:35 +00:00
|
|
|
|
|
|
|
TW_WLOCK(V_tw_lock);
|
2014-10-30 08:53:56 +00:00
|
|
|
inp = tw->tw_inpcb;
|
|
|
|
tw->tw_inpcb = NULL;
|
|
|
|
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
|
2014-10-30 08:53:56 +00:00
|
|
|
cred = tw->tw_cred;
|
2014-04-10 18:15:35 +00:00
|
|
|
tw->tw_cred = NULL;
|
2014-04-11 19:17:45 +00:00
|
|
|
TW_WUNLOCK(V_tw_lock);
|
2014-04-10 18:15:35 +00:00
|
|
|
|
2014-10-30 08:53:56 +00:00
|
|
|
if (cred != NULL)
|
|
|
|
crfree(cred);
|
|
|
|
|
|
|
|
released = in_pcbrele_wlocked(inp);
|
|
|
|
KASSERT(!released, ("%s: inp should not be released here", __func__));
|
|
|
|
|
2014-04-11 19:17:45 +00:00
|
|
|
if (!reuse)
|
2014-10-30 08:53:56 +00:00
|
|
|
uma_zfree(V_tcptw_zone, tw);
|
2016-03-15 00:15:10 +00:00
|
|
|
TCPSTATES_DEC(TCPS_TIME_WAIT);
|
2007-05-16 17:14:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
struct tcptw *
|
2014-10-30 08:53:56 +00:00
|
|
|
tcp_tw_2msl_scan(int reuse)
|
2007-05-16 17:14:25 +00:00
|
|
|
{
|
2014-04-11 21:00:59 +00:00
|
|
|
struct tcptw *tw;
|
2014-10-30 08:53:56 +00:00
|
|
|
struct inpcb *inp;
|
2018-07-04 02:47:16 +00:00
|
|
|
struct epoch_tracker et;
|
2007-05-16 17:14:25 +00:00
|
|
|
|
2014-10-30 08:53:56 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
if (reuse) {
|
|
|
|
/*
|
2015-08-03 12:13:54 +00:00
|
|
|
* Exclusive pcbinfo lock is not required in reuse case even if
|
|
|
|
* two inpcb locks can be acquired simultaneously:
|
2014-10-30 08:53:56 +00:00
|
|
|
* - the inpcb transitioning to TIME_WAIT state in
|
|
|
|
* tcp_tw_start(),
|
|
|
|
* - the inpcb closed by tcp_twclose().
|
2015-08-03 12:13:54 +00:00
|
|
|
*
|
|
|
|
* It is because only inpcbs in FIN_WAIT2 or CLOSING states can
|
|
|
|
* transition in TIME_WAIT state. Then a pcbcb cannot be in
|
|
|
|
* TIME_WAIT list and transitioning to TIME_WAIT state at same
|
|
|
|
* time.
|
2014-10-30 08:53:56 +00:00
|
|
|
*/
|
2015-08-03 12:13:54 +00:00
|
|
|
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
|
2014-04-10 18:15:35 +00:00
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
#endif
|
2014-04-11 21:00:59 +00:00
|
|
|
|
2007-05-16 17:14:25 +00:00
|
|
|
for (;;) {
|
2014-04-10 18:15:35 +00:00
|
|
|
TW_RLOCK(V_tw_lock);
|
Commit step 1 of the vimage project, (network stack)
virtualization work done by Marko Zec (zec@).
This is the first in a series of commits over the course
of the next few weeks.
Mark all uses of global variables to be virtualized
with a V_ prefix.
Use macros to map them back to their global names for
now, so this is a NOP change only.
We hope to have caught at least 85-90% of what is needed
so we do not invalidate a lot of outstanding patches again.
Obtained from: //depot/projects/vimage-commit2/...
Reviewed by: brooks, des, ed, mav, julian,
jamie, kris, rwatson, zec, ...
(various people I forgot, different versions)
md5 (with a bit of help)
Sponsored by: NLnet Foundation, The FreeBSD Foundation
X-MFC after: never
V_Commit_Message_Reviewed_By: more people than the patch
2008-08-17 23:27:27 +00:00
|
|
|
tw = TAILQ_FIRST(&V_twq_2msl);
|
2014-10-30 08:53:56 +00:00
|
|
|
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
|
2014-04-10 18:15:35 +00:00
|
|
|
TW_RUNLOCK(V_tw_lock);
|
2007-05-16 17:14:25 +00:00
|
|
|
break;
|
2014-04-10 18:15:35 +00:00
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
|
|
|
|
__func__));
|
|
|
|
|
|
|
|
inp = tw->tw_inpcb;
|
|
|
|
in_pcbref(inp);
|
2014-04-10 18:15:35 +00:00
|
|
|
TW_RUNLOCK(V_tw_lock);
|
|
|
|
|
2018-07-04 02:47:16 +00:00
|
|
|
INP_INFO_RLOCK_ET(&V_tcbinfo, et);
|
2018-06-19 01:54:00 +00:00
|
|
|
INP_WLOCK(inp);
|
|
|
|
tw = intotw(inp);
|
|
|
|
if (in_pcbrele_wlocked(inp)) {
|
|
|
|
if (__predict_true(tw == NULL)) {
|
2018-07-04 02:47:16 +00:00
|
|
|
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
2018-06-19 01:54:00 +00:00
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
/* This should not happen as in TIMEWAIT
|
|
|
|
* state the inp should not be destroyed
|
|
|
|
* before its tcptw. If INVARIANTS is
|
|
|
|
* defined panic.
|
|
|
|
*/
|
2017-10-01 21:20:28 +00:00
|
|
|
#ifdef INVARIANTS
|
2018-06-19 01:54:00 +00:00
|
|
|
panic("%s: Panic before an infinite "
|
|
|
|
"loop: INP_TIMEWAIT && (INP_FREED "
|
|
|
|
"|| inp last reference) && tw != "
|
|
|
|
"NULL", __func__);
|
2017-10-01 21:20:28 +00:00
|
|
|
#else
|
2018-06-19 01:54:00 +00:00
|
|
|
log(LOG_ERR, "%s: Avoid an infinite "
|
|
|
|
"loop: INP_TIMEWAIT && (INP_FREED "
|
|
|
|
"|| inp last reference) && tw != "
|
|
|
|
"NULL", __func__);
|
2017-10-01 21:20:28 +00:00
|
|
|
#endif
|
2018-07-04 02:47:16 +00:00
|
|
|
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
2018-06-19 01:54:00 +00:00
|
|
|
break;
|
2014-10-30 08:53:56 +00:00
|
|
|
}
|
2018-06-19 01:54:00 +00:00
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
|
2018-06-19 01:54:00 +00:00
|
|
|
if (tw == NULL) {
|
|
|
|
/* tcp_twclose() has already been called */
|
|
|
|
INP_WUNLOCK(inp);
|
2018-07-04 02:47:16 +00:00
|
|
|
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
2018-06-19 01:54:00 +00:00
|
|
|
continue;
|
2014-04-10 18:15:35 +00:00
|
|
|
}
|
2018-06-19 01:54:00 +00:00
|
|
|
|
|
|
|
tcp_twclose(tw, reuse);
|
2018-07-04 02:47:16 +00:00
|
|
|
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
|
2018-06-19 01:54:00 +00:00
|
|
|
if (reuse)
|
|
|
|
return tw;
|
2007-05-16 17:14:25 +00:00
|
|
|
}
|
2014-10-30 08:53:56 +00:00
|
|
|
|
|
|
|
return NULL;
|
2007-05-16 17:14:25 +00:00
|
|
|
}
|