Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and
tcp_tw_2msl_scan(). This race condition drives unplanned timewait timeout cancellation. Also simplify implementation by holding inpcb reference and removing tcptw reference counting. Differential Revision: https://reviews.freebsd.org/D826 Submitted by: Marc De la Gueronniere <mdelagueronniere@verisign.com> Submitted by: jch Reviewed By: jhb (mentor), adrian, rwatson Sponsored by: Verisign, Inc. MFC after: 2 weeks X-MFC-With: r264321
This commit is contained in:
parent
5a06ac3540
commit
cea40c4888
@ -243,7 +243,7 @@ tcp_slowtimo(void)
|
||||
VNET_LIST_RLOCK_NOSLEEP();
|
||||
VNET_FOREACH(vnet_iter) {
|
||||
CURVNET_SET(vnet_iter);
|
||||
tcp_tw_2msl_scan();
|
||||
(void) tcp_tw_2msl_scan(0);
|
||||
CURVNET_RESTORE();
|
||||
}
|
||||
VNET_LIST_RUNLOCK_NOSLEEP();
|
||||
|
@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
|
||||
void tcp_timer_init(void);
|
||||
void tcp_timer_2msl(void *xtp);
|
||||
struct tcptw *
|
||||
tcp_tw_2msl_reuse(void); /* XXX temporary? */
|
||||
void tcp_tw_2msl_scan(void);
|
||||
tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
|
||||
void tcp_timer_keep(void *xtp);
|
||||
void tcp_timer_persist(void *xtp);
|
||||
void tcp_timer_rexmt(void *xtp);
|
||||
|
@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/socketvar.h>
|
||||
#include <sys/protosw.h>
|
||||
#include <sys/random.h>
|
||||
#include <sys/refcount.h>
|
||||
|
||||
#include <vm/uma.h>
|
||||
|
||||
@ -101,6 +100,11 @@ static int maxtcptw;
|
||||
* currently in the TIME_WAIT state. The queue pointers, including the
|
||||
* queue pointers in each tcptw structure, are protected using the global
|
||||
* timewait lock, which must be held over queue iteration and modification.
|
||||
*
|
||||
* Rules on tcptw usage:
|
||||
* - a inpcb is always freed _after_ its tcptw
|
||||
* - a tcptw relies on its inpcb reference counting for memory stability
|
||||
* - a tcptw is dereferenceable only while its inpcb is locked
|
||||
*/
|
||||
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
|
||||
#define V_twq_2msl VNET(twq_2msl)
|
||||
@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcptw *, int);
|
||||
static void tcp_tw_2msl_stop(struct tcptw *, int);
|
||||
static int tcp_twrespond(struct tcptw *, int);
|
||||
|
||||
/*
|
||||
* tw_pcbref() bumps the reference count on an tw in order to maintain
|
||||
* stability of an tw pointer despite the tw lock being released.
|
||||
*/
|
||||
static void
|
||||
tw_pcbref(struct tcptw *tw)
|
||||
{
|
||||
|
||||
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
|
||||
refcount_acquire(&tw->tw_refcount);
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop a refcount on an tw elevated using tw_pcbref().
|
||||
*/
|
||||
static int
|
||||
tw_pcbrele(struct tcptw *tw)
|
||||
{
|
||||
|
||||
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
|
||||
if (!refcount_release(&tw->tw_refcount))
|
||||
return (0);
|
||||
uma_zfree(V_tcptw_zone, tw);
|
||||
return (1);
|
||||
}
|
||||
|
||||
static int
|
||||
tcptw_auto_size(void)
|
||||
{
|
||||
@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
|
||||
* Reached limit on total number of TIMEWAIT connections
|
||||
* allowed. Remove a connection from TIMEWAIT queue in LRU
|
||||
* fashion to make room for this connection.
|
||||
*
|
||||
* pcbinfo lock is needed here to prevent deadlock as
|
||||
* two inpcb locks can be acquired simultaneously.
|
||||
*/
|
||||
tw = tcp_tw_2msl_reuse();
|
||||
tw = tcp_tw_2msl_scan(1);
|
||||
if (tw == NULL) {
|
||||
tp = tcp_close(tp);
|
||||
if (tp != NULL)
|
||||
@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
|
||||
return;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* The tcptw will hold a reference on its inpcb until tcp_twclose
|
||||
* is called
|
||||
*/
|
||||
tw->tw_inpcb = inp;
|
||||
refcount_init(&tw->tw_refcount, 1);
|
||||
in_pcbref(inp); /* Reference from tw */
|
||||
|
||||
/*
|
||||
* Recover last window size sent.
|
||||
@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
|
||||
tw->tw_inpcb = NULL;
|
||||
tcp_tw_2msl_stop(tw, reuse);
|
||||
inp->inp_ppcb = NULL;
|
||||
in_pcbdrop(inp);
|
||||
@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
|
||||
*/
|
||||
INP_WUNLOCK(inp);
|
||||
}
|
||||
} else
|
||||
} else {
|
||||
/*
|
||||
* The socket has been already cleaned-up for us, only free the
|
||||
* inpcb.
|
||||
*/
|
||||
in_pcbfree(inp);
|
||||
}
|
||||
TCPSTAT_INC(tcps_closed);
|
||||
}
|
||||
|
||||
@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
|
||||
static void
|
||||
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
|
||||
{
|
||||
struct ucred *cred;
|
||||
struct inpcb *inp;
|
||||
int released;
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
|
||||
TW_WLOCK(V_tw_lock);
|
||||
inp = tw->tw_inpcb;
|
||||
tw->tw_inpcb = NULL;
|
||||
|
||||
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
|
||||
crfree(tw->tw_cred);
|
||||
cred = tw->tw_cred;
|
||||
tw->tw_cred = NULL;
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
|
||||
if (cred != NULL)
|
||||
crfree(cred);
|
||||
|
||||
released = in_pcbrele_wlocked(inp);
|
||||
KASSERT(!released, ("%s: inp should not be released here", __func__));
|
||||
|
||||
if (!reuse)
|
||||
tw_pcbrele(tw);
|
||||
uma_zfree(V_tcptw_zone, tw);
|
||||
}
|
||||
|
||||
struct tcptw *
|
||||
tcp_tw_2msl_reuse(void)
|
||||
tcp_tw_2msl_scan(int reuse)
|
||||
{
|
||||
struct tcptw *tw;
|
||||
struct inpcb *inp;
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
|
||||
TW_WLOCK(V_tw_lock);
|
||||
tw = TAILQ_FIRST(&V_twq_2msl);
|
||||
if (tw == NULL) {
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
return NULL;
|
||||
#ifdef INVARIANTS
|
||||
if (reuse) {
|
||||
/*
|
||||
* pcbinfo lock is needed in reuse case to prevent deadlock
|
||||
* as two inpcb locks can be acquired simultaneously:
|
||||
* - the inpcb transitioning to TIME_WAIT state in
|
||||
* tcp_tw_start(),
|
||||
* - the inpcb closed by tcp_twclose().
|
||||
*/
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
}
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
|
||||
INP_WLOCK(tw->tw_inpcb);
|
||||
tcp_twclose(tw, 1);
|
||||
|
||||
return (tw);
|
||||
}
|
||||
|
||||
void
|
||||
tcp_tw_2msl_scan(void)
|
||||
{
|
||||
struct tcptw *tw;
|
||||
#endif
|
||||
|
||||
for (;;) {
|
||||
TW_RLOCK(V_tw_lock);
|
||||
tw = TAILQ_FIRST(&V_twq_2msl);
|
||||
if (tw == NULL || tw->tw_time - ticks > 0) {
|
||||
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
|
||||
TW_RUNLOCK(V_tw_lock);
|
||||
break;
|
||||
}
|
||||
tw_pcbref(tw);
|
||||
KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
|
||||
__func__));
|
||||
|
||||
inp = tw->tw_inpcb;
|
||||
in_pcbref(inp);
|
||||
TW_RUNLOCK(V_tw_lock);
|
||||
|
||||
/* Close timewait state */
|
||||
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
|
||||
if (tw_pcbrele(tw)) {
|
||||
|
||||
INP_WLOCK(inp);
|
||||
tw = intotw(inp);
|
||||
if (in_pcbrele_wlocked(inp)) {
|
||||
KASSERT(tw == NULL, ("%s: held last inp "
|
||||
"reference but tw not NULL", __func__));
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
continue;
|
||||
}
|
||||
|
||||
KASSERT(tw->tw_inpcb != NULL,
|
||||
("%s: tw->tw_inpcb == NULL", __func__));
|
||||
INP_WLOCK(tw->tw_inpcb);
|
||||
tcp_twclose(tw, 0);
|
||||
if (tw == NULL) {
|
||||
/* tcp_twclose() has already been called */
|
||||
INP_WUNLOCK(inp);
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
continue;
|
||||
}
|
||||
|
||||
tcp_twclose(tw, reuse);
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
if (reuse)
|
||||
return tw;
|
||||
} else {
|
||||
/* INP_INFO lock is busy; continue later. */
|
||||
tw_pcbrele(tw);
|
||||
/* INP_INFO lock is busy, continue later. */
|
||||
INP_WLOCK(inp);
|
||||
if (!in_pcbrele_wlocked(inp))
|
||||
INP_WUNLOCK(inp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
|
||||
* present until timewait ends.
|
||||
*
|
||||
* XXXRW: Would it be cleaner to free the tcptw here?
|
||||
*
|
||||
* Astute question indeed, from twtcp perspective there are
|
||||
* three cases to consider:
|
||||
*
|
||||
* #1 tcp_detach is called at tcptw creation time by
|
||||
* tcp_twstart, then do not discard the newly created tcptw
|
||||
* and leave inpcb present until timewait ends
|
||||
* #2 tcp_detach is called at timewait end (or reuse) by
|
||||
* tcp_twclose, then the tcptw has already been discarded
|
||||
* and inpcb is freed here
|
||||
* #3 tcp_detach is called() after timewait ends (or reuse)
|
||||
* (e.g. by soclose), then tcptw has already been discarded
|
||||
* and inpcb is freed here
|
||||
*
|
||||
* In all three cases the tcptw should not be freed here.
|
||||
*/
|
||||
if (inp->inp_flags & INP_DROPPED) {
|
||||
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
|
||||
|
@ -358,7 +358,6 @@ struct tcptw {
|
||||
u_int t_starttime;
|
||||
int tw_time;
|
||||
TAILQ_ENTRY(tcptw) tw_2msl;
|
||||
u_int tw_refcount; /* refcount */
|
||||
};
|
||||
|
||||
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
|
||||
@ -651,7 +650,7 @@ struct tcpcb *
|
||||
tcp_close(struct tcpcb *);
|
||||
void tcp_discardcb(struct tcpcb *);
|
||||
void tcp_twstart(struct tcpcb *);
|
||||
void tcp_twclose(struct tcptw *_tw, int _reuse);
|
||||
void tcp_twclose(struct tcptw *, int);
|
||||
void tcp_ctlinput(int, struct sockaddr *, void *);
|
||||
int tcp_ctloutput(struct socket *, struct sockopt *);
|
||||
struct tcpcb *
|
||||
|
Loading…
x
Reference in New Issue
Block a user