Currently, the TCP slow timer can starve TCP input processing while it

walks the list of connections in TIME_WAIT closing expired connections
due to contention on the global TCP pcbinfo lock.

To remediate, introduce a new global lock to protect the list of
connections in TIME_WAIT.  Only acquire the TCP pcbinfo lock when
closing an expired connection.  This limits the window of time when
TCP input processing is stopped to the amount of time needed to close
a single connection.

Submitted by:	Julien Charbon <jcharbon@verisign.com>
Reviewed by:	rwatson, rrs, adrian
MFC after:	2 months
This commit is contained in:
John Baldwin 2014-04-10 18:15:35 +00:00
parent 1e4b22b44b
commit 66eefb1eae
4 changed files with 130 additions and 32 deletions

View File

@ -144,9 +144,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
INP_INFO_WUNLOCK(&V_tcbinfo);
tcp_tw_2msl_scan();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();

View File

@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
tcp_tw_2msl_reuse(void); /* XXX temporary? */
void tcp_tw_2msl_scan(void);
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);

View File

@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
#include <sys/refcount.h>
#include <vm/uma.h>
@ -99,13 +100,61 @@ static int maxtcptw;
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* tcbinfo lock, which must be held over queue iteration and modification.
* timewait lock, which must be held over queue iteration and modification.
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *);
/* Global timewait lock */
static VNET_DEFINE(struct rwlock, tw_lock);
#define V_tw_lock VNET(tw_lock)
#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
#define TW_RLOCK(tw) rw_rlock(&(tw))
#define TW_WLOCK(tw) rw_wlock(&(tw))
#define TW_RUNLOCK(tw) rw_runlock(&(tw))
#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
/*
* tw_pcbref() bumps the reference count on an tw in order to maintain
* stability of an tw pointer despite the tw lock being released.
*/
static void
tw_pcbref(struct tcptw *tw)
{
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
refcount_acquire(&tw->tw_refcount);
}
/*
* Drop a refcount on an tw elevated using tw_pcbref(). Return
* the tw lock released.
*/
static int
tw_pcbrele(struct tcptw *tw)
{
TW_WLOCK_ASSERT(V_tw_lock);
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
if (!refcount_release(&tw->tw_refcount)) {
TW_WUNLOCK(V_tw_lock);
return (0);
}
uma_zfree(V_tcptw_zone, tw);
TW_WUNLOCK(V_tw_lock);
return (1);
}
static void tcp_tw_2msl_reset(struct tcptw *, int ream);
static void tcp_tw_2msl_stop(struct tcptw *, int reuse);
static int
tcptw_auto_size(void)
@ -172,6 +221,7 @@ tcp_tw_init(void)
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
@ -181,10 +231,11 @@ tcp_tw_destroy(void)
struct tcptw *tw;
INP_INFO_WLOCK(&V_tcbinfo);
while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
INP_INFO_WUNLOCK(&V_tcbinfo);
TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
@ -205,7 +256,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@ -230,7 +281,7 @@ tcp_twstart(struct tcpcb *tp)
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
tw = tcp_tw_2msl_scan(1);
tw = tcp_tw_2msl_reuse();
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
@ -239,6 +290,7 @@ tcp_twstart(struct tcpcb *tp)
}
}
tw->tw_inpcb = inp;
refcount_init(&tw->tw_refcount, 1);
/*
* Recover last window size sent.
@ -357,7 +409,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
int thflags;
tcp_seq seq;
/* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
@ -459,11 +510,11 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw);
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@ -495,11 +546,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
} else
in_pcbfree(inp);
TCPSTAT_INC(tcps_closed);
crfree(tw->tw_cred);
tw->tw_cred = NULL;
if (reuse)
return;
uma_zfree(V_tcptw_zone, tw);
}
int
@ -617,34 +663,88 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
TW_WUNLOCK(V_tw_lock);
}
static void
tcp_tw_2msl_stop(struct tcptw *tw)
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
crfree(tw->tw_cred);
tw->tw_cred = NULL;
if (!reuse) {
tw_pcbrele(tw);
return;
}
TW_WUNLOCK(V_tw_lock);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
tcp_tw_2msl_reuse(void)
{
struct tcptw *tw;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
for (;;) {
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
break;
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, reuse);
if (reuse)
return (tw);
struct tcptw *tw;
TW_WLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL) {
TW_WUNLOCK(V_tw_lock);
return NULL;
}
TW_WUNLOCK(V_tw_lock);
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 1);
return (tw);
}
void
tcp_tw_2msl_scan(void)
{
struct tcptw *tw;
for (;;) {
TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || ((tw->tw_time - ticks) > 0)) {
TW_RUNLOCK(V_tw_lock);
break;
}
tw_pcbref(tw);
TW_RUNLOCK(V_tw_lock);
/* Close timewait state */
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
TW_WLOCK(V_tw_lock);
if (tw_pcbrele(tw))
continue;
KASSERT(tw->tw_inpcb != NULL,
("%s: tw->tw_inpcb == NULL", __func__));
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 0);
INP_INFO_WUNLOCK(&V_tcbinfo);
} else {
/* INP_INFO lock is busy, continue later */
TW_WLOCK(V_tw_lock);
tw_pcbrele(tw);
break;
}
}
return (NULL);
}

View File

@ -353,8 +353,7 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
void *tw_pspare; /* TCP_SIGNATURE */
u_int *tw_spare; /* TCP_SIGNATURE */
u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)