Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and

tcp_tw_2msl_scan().  This race condition drives unplanned timewait
timeout cancellation.  Also simplify implementation by holding inpcb
reference and removing tcptw reference counting.

Differential Revision:	https://reviews.freebsd.org/D826
Submitted by:		Marc De la Gueronniere <mdelagueronniere@verisign.com>
Submitted by:		jch
Reviewed By:		jhb (mentor), adrian, rwatson
Sponsored by:		Verisign, Inc.
MFC after:		2 weeks
X-MFC-With:		r264321
This commit is contained in:
Julien Charbon 2014-10-30 08:53:56 +00:00
parent 5a06ac3540
commit cea40c4888
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=273850
5 changed files with 93 additions and 68 deletions

View File

@ -243,7 +243,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
tcp_tw_2msl_scan();
(void) tcp_tw_2msl_scan(0);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();

View File

@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
tcp_tw_2msl_reuse(void); /* XXX temporary? */
void tcp_tw_2msl_scan(void);
tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);

View File

@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <vm/uma.h>
@ -101,6 +100,11 @@ static int maxtcptw;
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* timewait lock, which must be held over queue iteration and modification.
*
* Rules on tcptw usage:
* - a inpcb is always freed _after_ its tcptw
* - a tcptw relies on its inpcb reference counting for memory stability
* - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
/*
* tw_pcbref() bumps the reference count on an tw in order to maintain
* stability of an tw pointer despite the tw lock being released.
*/
static void
tw_pcbref(struct tcptw *tw)
{
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
refcount_acquire(&tw->tw_refcount);
}
/*
* Drop a refcount on an tw elevated using tw_pcbref().
*/
static int
tw_pcbrele(struct tcptw *tw)
{
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
if (!refcount_release(&tw->tw_refcount))
return (0);
uma_zfree(V_tcptw_zone, tw);
return (1);
}
static int
tcptw_auto_size(void)
{
@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
* Reached limit on total number of TIMEWAIT connections
* allowed. Remove a connection from TIMEWAIT queue in LRU
* fashion to make room for this connection.
*
* pcbinfo lock is needed here to prevent deadlock as
* two inpcb locks can be acquired simultaneously.
*/
tw = tcp_tw_2msl_reuse();
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
/*
* The tcptw will hold a reference on its inpcb until tcp_twclose
* is called
*/
tw->tw_inpcb = inp;
refcount_init(&tw->tw_refcount, 1);
in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
} else
} else {
/*
* The socket has been already cleaned-up for us, only free the
* inpcb.
*/
in_pcbfree(inp);
}
TCPSTAT_INC(tcps_closed);
}
@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
static void
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
struct ucred *cred;
struct inpcb *inp;
int released;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
inp = tw->tw_inpcb;
tw->tw_inpcb = NULL;
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
crfree(tw->tw_cred);
cred = tw->tw_cred;
tw->tw_cred = NULL;
TW_WUNLOCK(V_tw_lock);
if (cred != NULL)
crfree(cred);
released = in_pcbrele_wlocked(inp);
KASSERT(!released, ("%s: inp should not be released here", __func__));
if (!reuse)
tw_pcbrele(tw);
uma_zfree(V_tcptw_zone, tw);
}
struct tcptw *
tcp_tw_2msl_reuse(void)
tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
struct inpcb *inp;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL) {
TW_WUNLOCK(V_tw_lock);
return NULL;
#ifdef INVARIANTS
if (reuse) {
/*
* pcbinfo lock is needed in reuse case to prevent deadlock
* as two inpcb locks can be acquired simultaneously:
* - the inpcb transitioning to TIME_WAIT state in
* tcp_tw_start(),
* - the inpcb closed by tcp_twclose().
*/
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
}
TW_WUNLOCK(V_tw_lock);
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 1);
return (tw);
}
void
tcp_tw_2msl_scan(void)
{
struct tcptw *tw;
#endif
for (;;) {
TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || tw->tw_time - ticks > 0) {
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
TW_RUNLOCK(V_tw_lock);
break;
}
tw_pcbref(tw);
KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
__func__));
inp = tw->tw_inpcb;
in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
/* Close timewait state */
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
if (tw_pcbrele(tw)) {
INP_WLOCK(inp);
tw = intotw(inp);
if (in_pcbrele_wlocked(inp)) {
KASSERT(tw == NULL, ("%s: held last inp "
"reference but tw not NULL", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
continue;
}
KASSERT(tw->tw_inpcb != NULL,
("%s: tw->tw_inpcb == NULL", __func__));
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 0);
if (tw == NULL) {
/* tcp_twclose() has already been called */
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
continue;
}
tcp_twclose(tw, reuse);
INP_INFO_WUNLOCK(&V_tcbinfo);
if (reuse)
return tw;
} else {
/* INP_INFO lock is busy; continue later. */
tw_pcbrele(tw);
/* INP_INFO lock is busy, continue later. */
INP_WLOCK(inp);
if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
break;
}
}
return NULL;
}

View File

@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
*
* Astute question indeed, from twtcp perspective there are
* three cases to consider:
*
* #1 tcp_detach is called at tcptw creation time by
* tcp_twstart, then do not discard the newly created tcptw
* and leave inpcb present until timewait ends
* #2 tcp_detach is called at timewait end (or reuse) by
* tcp_twclose, then the tcptw has already been discarded
* and inpcb is freed here
* #3 tcp_detach is called() after timewait ends (or reuse)
* (e.g. by soclose), then tcptw has already been discarded
* and inpcb is freed here
*
* In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "

View File

@ -358,7 +358,6 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
@ -651,7 +650,7 @@ struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
void tcp_twclose(struct tcptw *_tw, int _reuse);
void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *