From cea40c4888847e5ba802d292e42c53acc104b7ea Mon Sep 17 00:00:00 2001 From: Julien Charbon Date: Thu, 30 Oct 2014 08:53:56 +0000 Subject: [PATCH] Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and tcp_tw_2msl_scan(). This race condition drives unplanned timewait timeout cancellation. Also simplify implementation by holding inpcb reference and removing tcptw reference counting. Differential Revision: https://reviews.freebsd.org/D826 Submitted by: Marc De la Gueronniere Submitted by: jch Reviewed By: jhb (mentor), adrian, rwatson Sponsored by: Verisign, Inc. MFC after: 2 weeks X-MFC-With: r264321 --- sys/netinet/tcp_timer.c | 2 +- sys/netinet/tcp_timer.h | 3 +- sys/netinet/tcp_timewait.c | 138 ++++++++++++++++++++----------------- sys/netinet/tcp_usrreq.c | 15 ++++ sys/netinet/tcp_var.h | 3 +- 5 files changed, 93 insertions(+), 68 deletions(-) diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 634c67602b0b..5d379ea38dfd 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -243,7 +243,7 @@ tcp_slowtimo(void) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - tcp_tw_2msl_scan(); + (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h index c04723a5c97b..f80e7441ee1e 100644 --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle; void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); struct tcptw * - tcp_tw_2msl_reuse(void); /* XXX temporary? */ -void tcp_tw_2msl_scan(void); + tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 33555d9fe980..2bdc107db2fd 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include @@ -101,6 +100,11 @@ static int maxtcptw; * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * timewait lock, which must be held over queue iteration and modification. + * + * Rules on tcptw usage: + * - a inpcb is always freed _after_ its tcptw + * - a tcptw relies on its inpcb reference counting for memory stability + * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) @@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *, int); static int tcp_twrespond(struct tcptw *, int); -/* - * tw_pcbref() bumps the reference count on an tw in order to maintain - * stability of an tw pointer despite the tw lock being released. - */ -static void -tw_pcbref(struct tcptw *tw) -{ - - KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__)); - refcount_acquire(&tw->tw_refcount); -} - -/* - * Drop a refcount on an tw elevated using tw_pcbref(). - */ -static int -tw_pcbrele(struct tcptw *tw) -{ - - KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__)); - if (!refcount_release(&tw->tw_refcount)) - return (0); - uma_zfree(V_tcptw_zone, tw); - return (1); -} - static int tcptw_auto_size(void) { @@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp) * Reached limit on total number of TIMEWAIT connections * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. + * + * pcbinfo lock is needed here to prevent deadlock as + * two inpcb locks can be acquired simultaneously. */ - tw = tcp_tw_2msl_reuse(); + tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) @@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp) return; } } + /* + * The tcptw will hold a reference on its inpcb until tcp_twclose + * is called + */ tw->tw_inpcb = inp; - refcount_init(&tw->tw_refcount, 1); + in_pcbref(inp); /* Reference from tw */ /* * Recover last window size sent. @@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); - tw->tw_inpcb = NULL; tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); @@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse) */ INP_WUNLOCK(inp); } - } else + } else { + /* + * The socket has been already cleaned-up for us, only free the + * inpcb. + */ in_pcbfree(inp); + } TCPSTAT_INC(tcps_closed); } @@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm) static void tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { + struct ucred *cred; + struct inpcb *inp; + int released; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TW_WLOCK(V_tw_lock); + inp = tw->tw_inpcb; + tw->tw_inpcb = NULL; + TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); - crfree(tw->tw_cred); + cred = tw->tw_cred; tw->tw_cred = NULL; TW_WUNLOCK(V_tw_lock); + if (cred != NULL) + crfree(cred); + + released = in_pcbrele_wlocked(inp); + KASSERT(!released, ("%s: inp should not be released here", __func__)); + if (!reuse) - tw_pcbrele(tw); + uma_zfree(V_tcptw_zone, tw); } struct tcptw * -tcp_tw_2msl_reuse(void) +tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; + struct inpcb *inp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - TW_WLOCK(V_tw_lock); - tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL) { - TW_WUNLOCK(V_tw_lock); - return NULL; +#ifdef INVARIANTS + if (reuse) { + /* + * pcbinfo lock is needed in reuse case to prevent deadlock + * as two inpcb locks can be acquired simultaneously: + * - the inpcb transitioning to TIME_WAIT state in + * tcp_tw_start(), + * - the inpcb closed by tcp_twclose(). + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } - TW_WUNLOCK(V_tw_lock); - - INP_WLOCK(tw->tw_inpcb); - tcp_twclose(tw, 1); - - return (tw); -} - -void -tcp_tw_2msl_scan(void) -{ - struct tcptw *tw; +#endif for (;;) { TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || tw->tw_time - ticks > 0) { + if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { TW_RUNLOCK(V_tw_lock); break; } - tw_pcbref(tw); + KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", + __func__)); + + inp = tw->tw_inpcb; + in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - /* Close timewait state */ if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) { - if (tw_pcbrele(tw)) { + + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + KASSERT(tw == NULL, ("%s: held last inp " + "reference but tw not NULL", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); continue; } - KASSERT(tw->tw_inpcb != NULL, - ("%s: tw->tw_inpcb == NULL", __func__)); - INP_WLOCK(tw->tw_inpcb); - tcp_twclose(tw, 0); + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + continue; + } + + tcp_twclose(tw, reuse); INP_INFO_WUNLOCK(&V_tcbinfo); + if (reuse) + return tw; } else { - /* INP_INFO lock is busy; continue later. */ - tw_pcbrele(tw); + /* INP_INFO lock is busy, continue later. */ + INP_WLOCK(inp); + if (!in_pcbrele_wlocked(inp)) + INP_WUNLOCK(inp); break; } } + + return NULL; } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 17efea440834..2820ffba70ed 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inpcb *inp) * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? + * + * Astute question indeed, from twtcp perspective there are + * three cases to consider: + * + * #1 tcp_detach is called at tcptw creation time by + * tcp_twstart, then do not discard the newly created tcptw + * and leave inpcb present until timewait ends + * #2 tcp_detach is called at timewait end (or reuse) by + * tcp_twclose, then the tcptw has already been discarded + * and inpcb is freed here + * #3 tcp_detach is called() after timewait ends (or reuse) + * (e.g. by soclose), then tcptw has already been discarded + * and inpcb is freed here + * + * In all three cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 9fe6447762ec..84ac8f08ac13 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -358,7 +358,6 @@ struct tcptw { u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; - u_int tw_refcount; /* refcount */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) @@ -651,7 +650,7 @@ struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); -void tcp_twclose(struct tcptw *_tw, int _reuse); +void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb *