tcp: remove tcptw, the compressed timewait state structure

The memory savings that tcptw brought back in 2003 (see 340c35de6a) no
longer justify the complexity required to maintain it.  For a longer
explanation please see the email [1].

Surprisingly, throughout almost 20 years the TCP stack's handling of
the TIME_WAIT state with a normal tcpcb did not bitrot.  The existing
tcp_input() properly handles a tcpcb in the TCPS_TIME_WAIT state, which
is confirmed by the packetdrill tcp-testsuite [2].

This change removes only tcptw and leaves INP_TIMEWAIT in place.  The
flag will be removed in a separate commit, which keeps the changes
easier to review and, if necessary, debug.

[1] https://lists.freebsd.org/archives/freebsd-net/2022-January/001206.html
[2] https://github.com/freebsd-net/tcp-testsuite

Differential Revision:	https://reviews.freebsd.org/D36398
Gleb Smirnoff 2022-10-06 19:22:23 -07:00
parent f48114b653
commit 0d7445193a
11 changed files with 124 additions and 807 deletions
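
To make the shape of the change concrete, below is a minimal standalone C
model of the approach the diff adopts: a connection entering TIME_WAIT keeps
its full tcpcb, is expired by the ordinary 2MSL timer, and is then simply
closed, with no separate compressed tcptw object allocated.  This is not
kernel code; the *_model names, the tick granularity, and the MSL value are
invented purely for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Trimmed stand-in for the real tcpcb: TIME_WAIT is just another state. */
enum tcp_state { CLOSED, ESTABLISHED, FIN_WAIT_2, TIME_WAIT };

struct tcpcb_model {
	enum tcp_state	t_state;
	int		t_2msl_ticks;	/* remaining ticks on the 2MSL timer */
};

#define	TCP_MSL_TICKS	30		/* arbitrary MSL for this model */

/* Enter TIME_WAIT: keep the full control block, just arm the 2MSL timer. */
static void
tcp_twstart_model(struct tcpcb_model *tp)
{
	tp->t_state = TIME_WAIT;
	tp->t_2msl_ticks = 2 * TCP_MSL_TICKS;
}

/*
 * One 2MSL timer tick.  Once the timer expires, the connection is closed
 * outright (in the diff below, tcp_timer_2msl() calls tcp_timer_close()
 * when it finds a tcpcb in TCPS_TIME_WAIT).
 */
static bool
tcp_timer_2msl_model(struct tcpcb_model *tp)
{
	if (tp->t_state != TIME_WAIT)
		return (false);
	if (--tp->t_2msl_ticks > 0)
		return (false);
	tp->t_state = CLOSED;
	return (true);
}

int
main(void)
{
	struct tcpcb_model tp = { .t_state = FIN_WAIT_2, .t_2msl_ticks = 0 };

	tcp_twstart_model(&tp);
	while (!tcp_timer_2msl_model(&tp))
		;
	printf("state %d (CLOSED) reached after 2 * MSL ticks\n", tp.t_state);
	return (0);
}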


@@ -1031,7 +1031,6 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
laddr = sin->sin_addr;
if (lport) {
struct inpcb *t;
struct tcptw *tw;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
@@ -1070,24 +1069,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, lookupflags, cred);
if (t && (t->inp_flags & INP_TIMEWAIT)) {
/*
* XXXRW: If an incpb has had its timewait
* state recycled, we treat the address as
* being in use (for now). This is better
* than a panic, but not desirable.
*/
tw = intotw(t);
if (tw == NULL ||
((reuseport & tw->tw_so_options) == 0 &&
(reuseport_lb &
tw->tw_so_options) == 0)) {
return (EADDRINUSE);
}
} else if (t &&
((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
(reuseport & inp_so_options(t)) == 0 &&
(reuseport_lb & inp_so_options(t)) == 0) {
if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
(reuseport & inp_so_options(t)) == 0 &&
(reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||


@@ -999,29 +999,29 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
goto dropunlock;
}
/*
* A previous connection in TIMEWAIT state is supposed to catch stray
* or duplicate segments arriving late. If this segment was a
* legitimate new connection attempt, the old INPCB gets removed and
* we can try again to find a listening socket.
*/
if (inp->inp_flags & INP_TIMEWAIT) {
tp = intotcpcb(inp);
switch (tp->t_state) {
case TCPS_TIME_WAIT:
/*
* A previous connection in TIMEWAIT state is supposed to catch
* stray or duplicate segments arriving late. If this segment
* was a legitimate new connection attempt, the old INPCB gets
* removed and we can try again to find a listening socket.
*/
tcp_dooptions(&to, optp, optlen,
(thflags & TH_SYN) ? TO_SYN : 0);
/*
* NB: tcp_twcheck unlocks the INP and frees the mbuf.
* tcp_twcheck unlocks the inp always, and frees the m if fails.
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
return (IPPROTO_DONE);
}
/*
* The TCPCB may no longer exist if the connection is winding
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
tp = intotcpcb(inp);
if (tp == NULL || tp->t_state == TCPS_CLOSED) {
case TCPS_CLOSED:
/*
* The TCPCB may no longer exist if the connection is winding
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
@@ -3030,10 +3030,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Starting the timer is contrary to the
* specification, but if we don't get a FIN
* we'll hang forever.
*
* XXXjl:
* we should release the tp also, and use a
* compressed state.
*/
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
soisdisconnected(so);


@@ -11359,8 +11359,6 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
tp->t_rcvtime = ticks;
/*


@@ -14154,8 +14154,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
(tp->t_flags & TF_GPUTINPROG)) {
/*


@@ -1485,7 +1485,6 @@ tcp_vnet_init(void *arg __unused)
uma_zone_set_max(V_tcpcb_zone, maxsockets);
uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
tcp_tw_init();
syncache_init();
tcp_hc_init();
@@ -1647,7 +1646,6 @@ tcp_destroy(void *unused __unused)
}
tcp_hc_destroy();
syncache_destroy();
tcp_tw_destroy();
in_pcbinfo_destroy(&V_tcbinfo);
/* tcp_discardcb() clears the sack_holes up. */
uma_zdestroy(V_sack_hole_zone);
@@ -2678,33 +2676,17 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
return (error);
while ((inp = inp_next(&inpi)) != NULL) {
if (inp->inp_gencnt <= xig.xig_gen) {
int crerr;
if (inp->inp_gencnt <= xig.xig_gen &&
cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
struct xtcpcb xt;
/*
* XXX: This use of cr_cansee(), introduced with
* TCP state changes, is not quite right, but for
* now, better than nothing.
*/
if (inp->inp_flags & INP_TIMEWAIT) {
if (intotw(inp) != NULL)
crerr = cr_cansee(req->td->td_ucred,
intotw(inp)->tw_cred);
else
crerr = EINVAL; /* Skip this inp. */
tcp_inptoxtp(inp, &xt);
error = SYSCTL_OUT(req, &xt, sizeof xt);
if (error) {
INP_RUNLOCK(inp);
break;
} else
crerr = cr_canseeinpcb(req->td->td_ucred, inp);
if (crerr == 0) {
struct xtcpcb xt;
tcp_inptoxtp(inp, &xt);
error = SYSCTL_OUT(req, &xt, sizeof xt);
if (error) {
INP_RUNLOCK(inp);
break;
} else
continue;
}
continue;
}
}
@@ -3639,7 +3621,6 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
struct sockaddr_storage addrs[2];
struct inpcb *inp;
struct tcpcb *tp;
struct tcptw *tw;
#ifdef INET
struct sockaddr_in *fin = NULL, *lin = NULL;
#endif
@@ -3721,19 +3702,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
#endif
}
if (inp != NULL) {
if (inp->inp_flags & INP_TIMEWAIT) {
/*
* XXXRW: There currently exists a state where an
* inpcb is present, but its timewait state has been
* discarded. For now, don't allow dropping of this
* type of inpcb.
*/
tw = intotw(inp);
if (tw != NULL)
tcp_twclose(tw, 0);
else
INP_WUNLOCK(inp);
} else if ((inp->inp_flags & INP_DROPPED) == 0 &&
if ((inp->inp_flags & INP_DROPPED) == 0 &&
!SOLISTENING(inp->inp_socket)) {
tp = intotcpcb(inp);
tp = tcp_drop(tp, ECONNABORTED);
@@ -4027,56 +3996,49 @@ void
tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
{
struct tcpcb *tp = intotcpcb(inp);
struct tcptw *tw = intotw(inp);
sbintime_t now;
bzero(xt, sizeof(*xt));
if (inp->inp_flags & INP_TIMEWAIT) {
xt->t_state = TCPS_TIME_WAIT;
xt->xt_encaps_port = tw->t_port;
} else {
xt->t_state = tp->t_state;
xt->t_logstate = tp->t_logstate;
xt->t_flags = tp->t_flags;
xt->t_sndzerowin = tp->t_sndzerowin;
xt->t_sndrexmitpack = tp->t_sndrexmitpack;
xt->t_rcvoopack = tp->t_rcvoopack;
xt->t_rcv_wnd = tp->rcv_wnd;
xt->t_snd_wnd = tp->snd_wnd;
xt->t_snd_cwnd = tp->snd_cwnd;
xt->t_snd_ssthresh = tp->snd_ssthresh;
xt->t_dsack_bytes = tp->t_dsack_bytes;
xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
xt->t_dsack_pack = tp->t_dsack_pack;
xt->t_maxseg = tp->t_maxseg;
xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
(tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
xt->t_state = tp->t_state;
xt->t_logstate = tp->t_logstate;
xt->t_flags = tp->t_flags;
xt->t_sndzerowin = tp->t_sndzerowin;
xt->t_sndrexmitpack = tp->t_sndrexmitpack;
xt->t_rcvoopack = tp->t_rcvoopack;
xt->t_rcv_wnd = tp->rcv_wnd;
xt->t_snd_wnd = tp->snd_wnd;
xt->t_snd_cwnd = tp->snd_cwnd;
xt->t_snd_ssthresh = tp->snd_ssthresh;
xt->t_dsack_bytes = tp->t_dsack_bytes;
xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
xt->t_dsack_pack = tp->t_dsack_pack;
xt->t_maxseg = tp->t_maxseg;
xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
(tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
now = getsbinuptime();
#define COPYTIMER(ttt) do { \
if (callout_active(&tp->t_timers->ttt)) \
xt->ttt = (tp->t_timers->ttt.c_time - now) / \
SBT_1MS; \
else \
xt->ttt = 0; \
now = getsbinuptime();
#define COPYTIMER(ttt) do { \
if (callout_active(&tp->t_timers->ttt)) \
xt->ttt = (tp->t_timers->ttt.c_time - now) / \
SBT_1MS; \
else \
xt->ttt = 0; \
} while (0)
COPYTIMER(tt_delack);
COPYTIMER(tt_rexmt);
COPYTIMER(tt_persist);
COPYTIMER(tt_keep);
COPYTIMER(tt_2msl);
COPYTIMER(tt_delack);
COPYTIMER(tt_rexmt);
COPYTIMER(tt_persist);
COPYTIMER(tt_keep);
COPYTIMER(tt_2msl);
#undef COPYTIMER
xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
xt->xt_encaps_port = tp->t_port;
bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
TCP_FUNCTION_NAME_LEN_MAX);
bcopy(CC_ALGO(tp)->name, xt->xt_cc,
TCP_CA_NAME_MAX);
xt->xt_encaps_port = tp->t_port;
bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
TCP_FUNCTION_NAME_LEN_MAX);
bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX);
#ifdef TCP_BLACKBOX
(void)tcp_log_get_id(tp, xt->xt_logid);
(void)tcp_log_get_id(tp, xt->xt_logid);
#endif
}
xt->xt_len = sizeof(struct xtcpcb);
in_pcbtoxinpcb(inp, &xt->xt_inp);


@@ -236,41 +236,6 @@ inp_to_cpuid(struct inpcb *inp)
}
}
/*
* Legacy TCP global callout routine called every 500 ms.
* Used to cleanup timewait states, which lack their own callouts.
*/
static struct callout tcpslow_callout;
static void
tcp_slowtimo(void *arg __unused)
{
struct epoch_tracker et;
VNET_ITERATOR_DECL(vnet_iter);
NET_EPOCH_ENTER(et);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
(void) tcp_tw_2msl_scan(0);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
NET_EPOCH_EXIT(et);
callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
tcp_slowtimo, NULL, 0);
}
static void
tcp_slowtimo_init(void *arg __unused)
{
callout_init(&tcpslow_callout, 1);
callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
tcp_slowtimo, NULL, 0);
}
SYSINIT(tcp_timer, SI_SUB_VNET_DONE, SI_ORDER_ANY, tcp_slowtimo_init, NULL);
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
@@ -387,8 +352,12 @@ tcp_timer_2msl(void *xtp)
* there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
* Ignore fact that there were recent incoming segments.
*/
if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
tp->t_inpcb && tp->t_inpcb->inp_socket &&
if (tp->t_state == TCPS_TIME_WAIT) {
tcp_timer_close(tp);
CURVNET_RESTORE();
return;
} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
tcp_fast_finwait2_recycle && tp->t_inpcb->inp_socket &&
(tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
TCPSTAT_INC(tcps_finwait2_drops);
tcp_timer_close(tp);


@@ -229,8 +229,6 @@ VNET_DECLARE(int, tcp_msl);
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);


@@ -96,142 +96,26 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
VNET_DEFINE_STATIC(uma_zone_t, tcptw_zone);
#define V_tcptw_zone VNET(tcptw_zone)
static int maxtcptw;
/*
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* timewait lock, which must be held over queue iteration and modification.
*
* Rules on tcptw usage:
* - a inpcb is always freed _after_ its tcptw
* - a tcptw relies on its inpcb reference counting for memory stability
* - a tcptw is dereferenceable only while its inpcb is locked
*/
VNET_DEFINE_STATIC(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
/* Global timewait lock */
VNET_DEFINE_STATIC(struct rwlock, tw_lock);
#define V_tw_lock VNET(tw_lock)
#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
#define TW_RLOCK(tw) rw_rlock(&(tw))
#define TW_WLOCK(tw) rw_wlock(&(tw))
#define TW_RUNLOCK(tw) rw_runlock(&(tw))
#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
static int
tcptw_auto_size(void)
{
int halfrange;
/*
* Max out at half the ephemeral port range so that TIME_WAIT
* sockets don't tie up too many ephemeral ports.
*/
if (V_ipport_lastauto > V_ipport_firstauto)
halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
else
halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
/* Protect against goofy port ranges smaller than 32. */
return (imin(imax(halfrange, 32), maxsockets / 5));
}
static int
sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
{
int error, new;
if (maxtcptw == 0)
new = tcptw_auto_size();
else
new = maxtcptw;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr)
if (new >= 32) {
maxtcptw = new;
uma_zone_set_max(V_tcptw_zone, maxtcptw);
}
return (error);
}
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&maxtcptw, 0, sysctl_maxtcptw, "IU",
"Maximum number of compressed TCP TIME_WAIT entries");
VNET_DEFINE_STATIC(bool, nolocaltimewait) = true;
#define V_nolocaltimewait VNET(nolocaltimewait)
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(nolocaltimewait), true,
"Do not create compressed TCP TIME_WAIT entries for local connections");
void
tcp_tw_zone_change(void)
{
if (maxtcptw == 0)
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
}
void
tcp_tw_init(void)
{
V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
if (maxtcptw == 0)
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
void
tcp_tw_destroy(void)
{
struct tcptw *tw;
struct epoch_tracker et;
NET_EPOCH_ENTER(et);
while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
NET_EPOCH_EXIT(et);
TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait,
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true,
"Do not create TCP TIME_WAIT state for local connections");
/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is locked.
* inp is locked, and is unlocked before returning.
*
* This function used to free tcpcb and allocate a compressed TCP time-wait
* structure tcptw. This served well for 20 years but is no longer relevant
* on modern machines in the modern internet. However, the function remains
* so that TCP stacks require less modification and we don't burn the bridge
* to go back to using compressed time-wait.
*/
void
tcp_twstart(struct tcpcb *tp)
{
struct tcptw twlocal, *tw;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
uint32_t recwin;
bool acknow, local;
#ifdef INET6
bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
@@ -243,144 +127,44 @@ tcp_twstart(struct tcpcb *tp)
KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: "
"(inp->inp_flags & INP_DROPPED) != 0"));
if (V_nolocaltimewait) {
tcp_state_change(tp, TCPS_TIME_WAIT);
soisdisconnected(inp->inp_socket);
if (tp->t_flags & TF_ACKNOW)
tcp_output(tp);
if (V_nolocaltimewait && (
#ifdef INET6
if (isipv6)
local = in6_localaddr(&inp->in6p_faddr);
else
isipv6 ? in6_localaddr(&inp->in6p_faddr) :
#endif
#ifdef INET
local = in_localip(inp->inp_faddr);
in_localip(inp->inp_faddr)
#else
local = false;
false
#endif
} else
local = false;
/*
* For use only by DTrace. We do not reference the state
* after this point so modifying it in place is not a problem.
*/
tcp_state_change(tp, TCPS_TIME_WAIT);
if (local)
tw = &twlocal;
else
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
/*
* Reached limit on total number of TIMEWAIT connections
* allowed. Remove a connection from TIMEWAIT queue in LRU
* fashion to make room for this connection.
* If that fails, use on stack tw at least to be able to
* run through tcp_twrespond() and standard tcpcb discard
* routine.
*
* XXX: Check if it possible to always have enough room
* in advance based on guarantees provided by uma_zalloc().
*/
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tw = &twlocal;
local = true;
}
}
/*
* For !local case the tcptw will hold a reference on its inpcb
* until tcp_twclose is called.
*/
tw->tw_inpcb = inp;
/*
* Recover last window size sent.
*/
so = inp->inp_socket;
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
(long)TCP_MAXWIN << tp->rcv_scale);
if (recwin < (so->so_rcv.sb_hiwat / 4) &&
recwin < tp->t_maxseg)
recwin = 0;
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
recwin < (tp->rcv_adv - tp->rcv_nxt))
recwin = (tp->rcv_adv - tp->rcv_nxt);
tw->last_win = (u_short)(recwin >> tp->rcv_scale);
/*
* Set t_recent if timestamps are used on the connection.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
tw->t_recent = tp->ts_recent;
tw->ts_offset = tp->ts_offset;
} else {
tw->t_recent = 0;
tw->ts_offset = 0;
)) {
if ((tp = tcp_close(tp)) != NULL)
INP_WUNLOCK(inp);
return;
}
tw->snd_nxt = tp->snd_nxt;
tw->t_port = tp->t_port;
tw->rcv_nxt = tp->rcv_nxt;
tw->tw_time = 0;
tw->tw_flags = tp->t_flags;
/* XXX
* If this code will
* be used for fin-wait-2 state also, then we may need
* a ts_recent from the last segment.
*/
acknow = tp->t_flags & TF_ACKNOW;
/*
* First, discard tcpcb state, which includes stopping its timers and
* freeing it. tcp_discardcb() used to also release the inpcb, but
* that work is now done in the caller.
*
* Note: soisdisconnected() call used to be made in tcp_discardcb(),
* and might not be needed here any longer.
*/
#ifdef TCPHPTS
tcp_hpts_remove(inp);
#endif
tcp_discardcb(tp);
soisdisconnected(so);
tw->tw_so_options = so->so_options;
inp->inp_flags |= INP_TIMEWAIT;
if (acknow)
tcp_twrespond(tw, TH_ACK);
if (local)
in_pcbdrop(inp);
else {
in_pcbref(inp); /* Reference from tw */
tw->tw_cred = crhold(so->so_cred);
inp->inp_ppcb = tw;
TCPSTATES_INC(TCPS_TIME_WAIT);
tcp_tw_2msl_reset(tw, 0);
}
/*
* If the inpcb owns the sole reference to the socket, then we can
* detach and free the socket as it is not needed in time wait.
*/
if (inp->inp_flags & INP_SOCKREF) {
inp->inp_flags &= ~INP_SOCKREF;
INP_WUNLOCK(inp);
sorele(so);
} else
INP_WUNLOCK(inp);
tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
INP_WUNLOCK(inp);
}
/*
* Returns 1 if the TIME_WAIT state was killed and we should start over,
* looking for a pcb in the listen state. Returns 0 otherwise.
* Returns true if the TIME_WAIT state was killed and we should start over,
* looking for a pcb in the listen state. Otherwise returns false and frees
* the mbuf.
*
* For pure SYN-segments the PCB shall be read-locked and the tcpopt pointer
* may be NULL. For the rest write-lock and valid tcpopt.
*/
int
bool
tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
struct mbuf *m, int tlen)
{
struct tcptw *tw;
struct tcpcb *tp = intotcpcb(inp);
char *s;
int thflags;
tcp_seq seq;
@@ -388,16 +172,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
NET_EPOCH_ASSERT();
INP_LOCK_ASSERT(inp);
/*
* XXXRW: Time wait state for inpcb has been recycled, but inpcb is
* still present. This is undesirable, but temporarily necessary
* until we work out how to handle inpcb's who's timewait state has
* been removed.
*/
tw = intotw(inp);
if (tw == NULL)
goto drop;
thflags = tcp_get_flags(th);
#ifdef INVARIANTS
if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN)
@@ -459,36 +233,37 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
* Allow UDP port number changes in this case.
*/
if (((thflags & (TH_SYN | TH_ACK)) == TH_SYN) &&
SEQ_GT(th->th_seq, tw->rcv_nxt)) {
SEQ_GT(th->th_seq, tp->rcv_nxt)) {
/*
* In case we can't upgrade our lock just pretend we have
* lost this packet.
*/
if (INP_TRY_UPGRADE(inp) == 0)
goto drop;
tcp_twclose(tw, 0);
if ((tp = tcp_close(tp)) != NULL)
INP_WUNLOCK(inp);
TCPSTAT_INC(tcps_tw_recycles);
return (1);
return (true);
}
/*
* Send RST if UDP port numbers don't match
*/
if (tw->t_port != m->m_pkthdr.tcp_tun_port) {
if (tp->t_port != m->m_pkthdr.tcp_tun_port) {
if (tcp_get_flags(th) & TH_ACK) {
tcp_respond(NULL, mtod(m, void *), th, m,
tcp_respond(tp, mtod(m, void *), th, m,
(tcp_seq)0, th->th_ack, TH_RST);
} else {
if (tcp_get_flags(th) & TH_SYN)
tlen++;
if (tcp_get_flags(th) & TH_FIN)
tlen++;
tcp_respond(NULL, mtod(m, void *), th, m,
tcp_respond(tp, mtod(m, void *), th, m,
th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK);
}
INP_UNLOCK(inp);
TCPSTAT_INC(tcps_tw_resets);
return (0);
return (false);
}
/*
@@ -505,7 +280,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
* the segment, unless the missing timestamps are tolerated.
* See section 3.2 of RFC 7323.
*/
if (((to->to_flags & TOF_TS) == 0) && (tw->t_recent != 0) &&
if (((to->to_flags & TOF_TS) == 0) && (tp->ts_recent != 0) &&
(V_tcp_tolerate_missing_ts == 0)) {
goto drop;
}
@@ -515,344 +290,25 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
*/
if (thflags & TH_FIN) {
seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
if (seq + 1 == tw->rcv_nxt)
tcp_tw_2msl_reset(tw, 1);
if (seq + 1 == tp->rcv_nxt)
tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
}
/*
* Acknowledge the segment if it has data or is not a duplicate ACK.
*/
if (thflags != TH_ACK || tlen != 0 ||
th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) {
th->th_seq != tp->rcv_nxt || th->th_ack != tp->snd_nxt) {
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
tcp_twrespond(tw, TH_ACK);
tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
tp->snd_nxt, TH_ACK);
INP_UNLOCK(inp);
TCPSTAT_INC(tcps_tw_responds);
goto dropnoprobe;
return (false);
}
drop:
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
dropnoprobe:
INP_UNLOCK(inp);
m_freem(m);
return (0);
}
void
tcp_twclose(struct tcptw *tw, int reuse)
{
struct socket *so;
struct inpcb *inp;
/*
* At this point, we are in one of two situations:
*
* (1) We have no socket, just an inpcb<->twtcp pair. We can free
* all state.
*
* (2) We have a socket -- if we own a reference, release it and
* notify the socket layer.
*/
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
NET_EPOCH_ASSERT();
INP_WLOCK_ASSERT(inp);
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
so = inp->inp_socket;
if (so != NULL) {
/*
* If there's a socket, handle two cases: first, we own a
* strong reference, which we will now release, or we don't
* in which case another reference exists (XXXRW: think
* about this more), and we don't need to take action.
*/
if (inp->inp_flags & INP_SOCKREF) {
inp->inp_flags &= ~INP_SOCKREF;
INP_WUNLOCK(inp);
sorele(so);
} else {
/*
* If we don't own the only reference, the socket and
* inpcb need to be left around to be handled by
* tcp_usr_detach() later.
*/
INP_WUNLOCK(inp);
}
} else {
/*
* The socket has been already cleaned-up for us, only free the
* inpcb.
*/
in_pcbfree(inp);
}
TCPSTAT_INC(tcps_closed);
}
static int
tcp_twrespond(struct tcptw *tw, int flags)
{
struct inpcb *inp = tw->tw_inpcb;
#if defined(INET6) || defined(INET)
struct tcphdr *th = NULL;
#endif
struct mbuf *m;
#ifdef INET
struct ip *ip = NULL;
#endif
u_int hdrlen, optlen, ulen;
int error = 0; /* Keep compiler happy */
struct tcpopt to;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
struct udphdr *udp = NULL;
hdrlen = 0; /* Keep compiler happy */
INP_WLOCK_ASSERT(inp);
m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
m->m_data += max_linkhdr;
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
#ifdef INET6
if (isipv6) {
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
ip6 = mtod(m, struct ip6_hdr *);
if (tw->t_port) {
udp = (struct udphdr *)(ip6 + 1);
hdrlen += sizeof(struct udphdr);
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
udp->uh_dport = tw->t_port;
ulen = (hdrlen - sizeof(struct ip6_hdr));
th = (struct tcphdr *)(udp + 1);
} else
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(inp, tw->t_port, ip6, th);
}
#endif
#if defined(INET6) && defined(INET)
else
#endif
#ifdef INET
{
hdrlen = sizeof(struct tcpiphdr);
ip = mtod(m, struct ip *);
if (tw->t_port) {
udp = (struct udphdr *)(ip + 1);
hdrlen += sizeof(struct udphdr);
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
udp->uh_dport = tw->t_port;
ulen = (hdrlen - sizeof(struct ip));
th = (struct tcphdr *)(udp + 1);
} else
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(inp, tw->t_port, ip, th);
}
#endif
to.to_flags = 0;
/*
* Send a timestamp and echo-reply if both our side and our peer
* have sent timestamps in our SYN's and this is not a RST.
*/
if (tw->t_recent && flags == TH_ACK) {
to.to_flags |= TOF_TS;
to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
to.to_tsecr = tw->t_recent;
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tw->tw_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
if (udp) {
ulen += optlen;
udp->uh_ulen = htons(ulen);
}
m->m_len = hdrlen + optlen;
m->m_pkthdr.len = m->m_len;
KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
th->th_seq = htonl(tw->snd_nxt);
th->th_ack = htonl(tw->rcv_nxt);
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
tcp_set_flags(th, flags);
th->th_win = htons(tw->last_win);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (tw->tw_flags & TF_SIGNATURE) {
if (!TCPMD5_ENABLED() ||
TCPMD5_OUTPUT(m, th, to.to_signature) != 0)
return (-1);
}
#endif
#ifdef INET6
if (isipv6) {
if (tw->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
th->th_sum = htons(0);
} else {
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in6_cksum_pseudo(ip6,
sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0);
}
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
TCP_PROBE5(send, NULL, NULL, ip6, NULL, th);
error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
}
#endif
#if defined(INET6) && defined(INET)
else
#endif
#ifdef INET
{
if (tw->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
th->th_sum = htons(0);
} else {
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
}
ip->ip_len = htons(m->m_pkthdr.len);
if (V_path_mtu_discovery)
ip->ip_off |= htons(IP_DF);
TCP_PROBE5(send, NULL, NULL, ip, NULL, th);
error = ip_output(m, inp->inp_options, NULL,
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, inp);
}
#endif
if (flags & TH_ACK)
TCPSTAT_INC(tcps_sndacks);
else
TCPSTAT_INC(tcps_sndctrl);
TCPSTAT_INC(tcps_sndtotal);
return (error);
}
static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
NET_EPOCH_ASSERT();
INP_WLOCK_ASSERT(tw->tw_inpcb);
TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * V_tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
TW_WUNLOCK(V_tw_lock);
}
static void
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
struct ucred *cred;
struct inpcb *inp;
int released __unused;
NET_EPOCH_ASSERT();
TW_WLOCK(V_tw_lock);
inp = tw->tw_inpcb;
tw->tw_inpcb = NULL;
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
cred = tw->tw_cred;
tw->tw_cred = NULL;
TW_WUNLOCK(V_tw_lock);
if (cred != NULL)
crfree(cred);
released = in_pcbrele_wlocked(inp);
KASSERT(!released, ("%s: inp should not be released here", __func__));
if (!reuse)
uma_zfree(V_tcptw_zone, tw);
TCPSTATES_DEC(TCPS_TIME_WAIT);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
struct inpcb *inp;
NET_EPOCH_ASSERT();
for (;;) {
TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
TW_RUNLOCK(V_tw_lock);
break;
}
KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
__func__));
inp = tw->tw_inpcb;
in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
INP_WLOCK(inp);
tw = intotw(inp);
if (in_pcbrele_wlocked(inp)) {
if (__predict_true(tw == NULL)) {
continue;
} else {
/* This should not happen as in TIMEWAIT
* state the inp should not be destroyed
* before its tcptw. If INVARIANTS is
* defined panic.
*/
#ifdef INVARIANTS
panic("%s: Panic before an infinite "
"loop: INP_TIMEWAIT && (INP_FREED "
"|| inp last reference) && tw != "
"NULL", __func__);
#else
log(LOG_ERR, "%s: Avoid an infinite "
"loop: INP_TIMEWAIT && (INP_FREED "
"|| inp last reference) && tw != "
"NULL", __func__);
#endif
break;
}
}
if (tw == NULL) {
/* tcp_twclose() has already been called */
INP_WUNLOCK(inp);
continue;
}
tcp_twclose(tw, reuse);
if (reuse)
return tw;
}
return NULL;
return (false);
}


@@ -631,24 +631,7 @@ struct tcp_ifcap {
struct in_conninfo;
#endif /* _NETINET_IN_PCB_H_ */
struct tcptw {
struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */
uint32_t t_port:16, /* UDP port number if TCPoUDP */
t_unused:16;
tcp_seq snd_nxt;
tcp_seq rcv_nxt;
u_short last_win; /* cached window value */
short tw_so_options; /* copy of so_options */
struct ucred *tw_cred; /* user credentials */
u_int32_t t_recent;
u_int32_t ts_offset; /* our timestamp offset */
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
u_int tw_flags; /* tcpcb t_flags */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb)
#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
/*
@@ -1083,7 +1066,6 @@ struct tcpcb *
void tcp_discardcb(struct tcpcb *);
bool tcp_freecb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
void tcp_twclose(struct tcptw *, int);
int tcp_ctloutput(struct socket *, struct sockopt *);
void tcp_fini(void *);
char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *,
@@ -1176,12 +1158,7 @@ int tcp_default_output(struct tcpcb *);
void tcp_state_change(struct tcpcb *, int);
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
void tcp_tw_init(void);
#ifdef VIMAGE
void tcp_tw_destroy(void);
#endif
void tcp_tw_zone_change(void);
int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
struct mbuf *, int);
void tcp_setpersist(struct tcpcb *);
void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp);


@@ -386,6 +386,7 @@ int
toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
{
struct inpcb *inp;
struct tcpcb *tp;
if (inc->inc_flags & INC_ISIPV6) {
inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr,
@@ -398,7 +399,8 @@ toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
if (inp != NULL) {
INP_RLOCK_ASSERT(inp);
if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) {
tp = intotcpcb(inp);
if (tp->t_state == TCPS_TIME_WAIT && th != NULL) {
if (!tcp_twcheck(inp, NULL, th, NULL, 0))
return (EADDRINUSE);
} else {


@@ -242,7 +242,6 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
}
if (lport) {
struct inpcb *t;
struct tcptw *tw;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
@@ -303,20 +302,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
}
t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr,
lport, lookupflags, cred);
if (t && (t->inp_flags & INP_TIMEWAIT)) {
/*
* XXXRW: If an incpb has had its timewait
* state recycled, we treat the address as
* being in use (for now). This is better
* than a panic, but not desirable.
*/
tw = intotw(t);
if (tw == NULL ||
((reuseport & tw->tw_so_options) == 0 &&
(reuseport_lb & tw->tw_so_options) == 0))
return (EADDRINUSE);
} else if (t && (reuseport & inp_so_options(t)) == 0 &&
(reuseport_lb & inp_so_options(t)) == 0) {
if (t && (reuseport & inp_so_options(t)) == 0 &&
(reuseport_lb & inp_so_options(t)) == 0) {
return (EADDRINUSE);
}
#ifdef INET
@@ -327,18 +314,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(pcbinfo, sin.sin_addr,
lport, lookupflags, cred);
if (t && t->inp_flags & INP_TIMEWAIT) {
tw = intotw(t);
if (tw == NULL)
return (EADDRINUSE);
if ((reuseport & tw->tw_so_options) == 0
&& (reuseport_lb & tw->tw_so_options) == 0
&& (ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY || ((inp->inp_vflag &
INP_IPV6PROTO) ==
(t->inp_vflag & INP_IPV6PROTO))))
return (EADDRINUSE);
} else if (t &&
if (t &&
(reuseport & inp_so_options(t)) == 0 &&
(reuseport_lb & inp_so_options(t)) == 0 &&
(ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||