Currently, the TCP slow timer can starve TCP input processing: while it
walks the list of connections in TIME_WAIT and closes the expired ones, it holds the global TCP pcbinfo lock, and input processing contends for that same lock.

To remediate, introduce a new global lock to protect the list of connections in TIME_WAIT. Only acquire the TCP pcbinfo lock when closing an expired connection. This limits the window of time when TCP input processing is stopped to the amount of time needed to close a single connection.

Submitted by: Julien Charbon <jcharbon@verisign.com>
Reviewed by: rwatson, rrs, adrian
MFC after: 2 months
This commit is contained in:
parent
1e4b22b44b
commit
66eefb1eae
@ -144,9 +144,7 @@ tcp_slowtimo(void)
|
||||
VNET_LIST_RLOCK_NOSLEEP();
|
||||
VNET_FOREACH(vnet_iter) {
|
||||
CURVNET_SET(vnet_iter);
|
||||
INP_INFO_WLOCK(&V_tcbinfo);
|
||||
(void) tcp_tw_2msl_scan(0);
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
tcp_tw_2msl_scan();
|
||||
CURVNET_RESTORE();
|
||||
}
|
||||
VNET_LIST_RUNLOCK_NOSLEEP();
|
||||
|
@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle;
|
||||
void tcp_timer_init(void);
|
||||
void tcp_timer_2msl(void *xtp);
|
||||
struct tcptw *
|
||||
tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
|
||||
tcp_tw_2msl_reuse(void); /* XXX temporary? */
|
||||
void tcp_tw_2msl_scan(void);
|
||||
void tcp_timer_keep(void *xtp);
|
||||
void tcp_timer_persist(void *xtp);
|
||||
void tcp_timer_rexmt(void *xtp);
|
||||
|
@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/socketvar.h>
|
||||
#include <sys/protosw.h>
|
||||
#include <sys/random.h>
|
||||
#include <sys/refcount.h>
|
||||
|
||||
#include <vm/uma.h>
|
||||
|
||||
@ -99,13 +100,61 @@ static int maxtcptw;
|
||||
* The timed wait queue contains references to each of the TCP sessions
|
||||
* currently in the TIME_WAIT state. The queue pointers, including the
|
||||
* queue pointers in each tcptw structure, are protected using the global
|
||||
* tcbinfo lock, which must be held over queue iteration and modification.
|
||||
* timewait lock, which must be held over queue iteration and modification.
|
||||
*/
|
||||
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
|
||||
#define V_twq_2msl VNET(twq_2msl)
|
||||
|
||||
static void tcp_tw_2msl_reset(struct tcptw *, int);
|
||||
static void tcp_tw_2msl_stop(struct tcptw *);
|
||||
/* Global timewait lock */
|
||||
static VNET_DEFINE(struct rwlock, tw_lock);
|
||||
#define V_tw_lock VNET(tw_lock)
|
||||
|
||||
#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
|
||||
#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
|
||||
#define TW_RLOCK(tw) rw_rlock(&(tw))
|
||||
#define TW_WLOCK(tw) rw_wlock(&(tw))
|
||||
#define TW_RUNLOCK(tw) rw_runlock(&(tw))
|
||||
#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
|
||||
#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
|
||||
#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
|
||||
#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
|
||||
#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
|
||||
|
||||
/*
|
||||
* tw_pcbref() bumps the reference count on an tw in order to maintain
|
||||
* stability of an tw pointer despite the tw lock being released.
|
||||
*/
|
||||
static void
|
||||
tw_pcbref(struct tcptw *tw)
|
||||
{
|
||||
|
||||
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
|
||||
refcount_acquire(&tw->tw_refcount);
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop a refcount on an tw elevated using tw_pcbref(). Return
|
||||
* the tw lock released.
|
||||
*/
|
||||
static int
|
||||
tw_pcbrele(struct tcptw *tw)
|
||||
{
|
||||
|
||||
TW_WLOCK_ASSERT(V_tw_lock);
|
||||
KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
|
||||
|
||||
if (!refcount_release(&tw->tw_refcount)) {
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
return (0);
|
||||
}
|
||||
|
||||
uma_zfree(V_tcptw_zone, tw);
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
return (1);
|
||||
}
|
||||
|
||||
static void tcp_tw_2msl_reset(struct tcptw *, int ream);
|
||||
static void tcp_tw_2msl_stop(struct tcptw *, int reuse);
|
||||
|
||||
static int
|
||||
tcptw_auto_size(void)
|
||||
@ -172,6 +221,7 @@ tcp_tw_init(void)
|
||||
else
|
||||
uma_zone_set_max(V_tcptw_zone, maxtcptw);
|
||||
TAILQ_INIT(&V_twq_2msl);
|
||||
TW_LOCK_INIT(V_tw_lock, "tcptw");
|
||||
}
|
||||
|
||||
#ifdef VIMAGE
|
||||
@ -181,10 +231,11 @@ tcp_tw_destroy(void)
|
||||
struct tcptw *tw;
|
||||
|
||||
INP_INFO_WLOCK(&V_tcbinfo);
|
||||
while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
|
||||
while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
|
||||
tcp_twclose(tw, 0);
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
|
||||
TW_LOCK_DESTROY(V_tw_lock);
|
||||
uma_zdestroy(V_tcptw_zone);
|
||||
}
|
||||
#endif
|
||||
@ -205,7 +256,7 @@ tcp_twstart(struct tcpcb *tp)
|
||||
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
|
||||
#endif
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
|
||||
if (V_nolocaltimewait) {
|
||||
@ -230,7 +281,7 @@ tcp_twstart(struct tcpcb *tp)
|
||||
|
||||
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
|
||||
if (tw == NULL) {
|
||||
tw = tcp_tw_2msl_scan(1);
|
||||
tw = tcp_tw_2msl_reuse();
|
||||
if (tw == NULL) {
|
||||
tp = tcp_close(tp);
|
||||
if (tp != NULL)
|
||||
@ -239,6 +290,7 @@ tcp_twstart(struct tcpcb *tp)
|
||||
}
|
||||
}
|
||||
tw->tw_inpcb = inp;
|
||||
refcount_init(&tw->tw_refcount, 1);
|
||||
|
||||
/*
|
||||
* Recover last window size sent.
|
||||
@ -357,7 +409,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
||||
int thflags;
|
||||
tcp_seq seq;
|
||||
|
||||
/* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
|
||||
@ -459,11 +510,11 @@ tcp_twclose(struct tcptw *tw, int reuse)
|
||||
inp = tw->tw_inpcb;
|
||||
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
|
||||
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
|
||||
tw->tw_inpcb = NULL;
|
||||
tcp_tw_2msl_stop(tw);
|
||||
tcp_tw_2msl_stop(tw, reuse);
|
||||
inp->inp_ppcb = NULL;
|
||||
in_pcbdrop(inp);
|
||||
|
||||
@ -495,11 +546,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
|
||||
} else
|
||||
in_pcbfree(inp);
|
||||
TCPSTAT_INC(tcps_closed);
|
||||
crfree(tw->tw_cred);
|
||||
tw->tw_cred = NULL;
|
||||
if (reuse)
|
||||
return;
|
||||
uma_zfree(V_tcptw_zone, tw);
|
||||
}
|
||||
|
||||
int
|
||||
@ -617,34 +663,88 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
INP_WLOCK_ASSERT(tw->tw_inpcb);
|
||||
|
||||
TW_WLOCK(V_tw_lock);
|
||||
if (rearm)
|
||||
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
|
||||
tw->tw_time = ticks + 2 * tcp_msl;
|
||||
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
tcp_tw_2msl_stop(struct tcptw *tw)
|
||||
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
|
||||
{
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
|
||||
TW_WLOCK(V_tw_lock);
|
||||
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
|
||||
crfree(tw->tw_cred);
|
||||
tw->tw_cred = NULL;
|
||||
|
||||
if (!reuse) {
|
||||
tw_pcbrele(tw);
|
||||
return;
|
||||
}
|
||||
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
}
|
||||
|
||||
struct tcptw *
|
||||
tcp_tw_2msl_scan(int reuse)
|
||||
tcp_tw_2msl_reuse(void)
|
||||
{
|
||||
struct tcptw *tw;
|
||||
|
||||
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
|
||||
for (;;) {
|
||||
tw = TAILQ_FIRST(&V_twq_2msl);
|
||||
if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
|
||||
break;
|
||||
INP_WLOCK(tw->tw_inpcb);
|
||||
tcp_twclose(tw, reuse);
|
||||
if (reuse)
|
||||
return (tw);
|
||||
|
||||
struct tcptw *tw;
|
||||
|
||||
TW_WLOCK(V_tw_lock);
|
||||
tw = TAILQ_FIRST(&V_twq_2msl);
|
||||
if (tw == NULL) {
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
return NULL;
|
||||
}
|
||||
TW_WUNLOCK(V_tw_lock);
|
||||
|
||||
INP_WLOCK(tw->tw_inpcb);
|
||||
tcp_twclose(tw, 1);
|
||||
|
||||
return (tw);
|
||||
}
|
||||
|
||||
void
|
||||
tcp_tw_2msl_scan(void)
|
||||
{
|
||||
|
||||
struct tcptw *tw;
|
||||
for (;;) {
|
||||
TW_RLOCK(V_tw_lock);
|
||||
tw = TAILQ_FIRST(&V_twq_2msl);
|
||||
if (tw == NULL || ((tw->tw_time - ticks) > 0)) {
|
||||
TW_RUNLOCK(V_tw_lock);
|
||||
break;
|
||||
}
|
||||
tw_pcbref(tw);
|
||||
TW_RUNLOCK(V_tw_lock);
|
||||
|
||||
/* Close timewait state */
|
||||
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
|
||||
TW_WLOCK(V_tw_lock);
|
||||
if (tw_pcbrele(tw))
|
||||
continue;
|
||||
|
||||
KASSERT(tw->tw_inpcb != NULL,
|
||||
("%s: tw->tw_inpcb == NULL", __func__));
|
||||
|
||||
INP_WLOCK(tw->tw_inpcb);
|
||||
tcp_twclose(tw, 0);
|
||||
INP_INFO_WUNLOCK(&V_tcbinfo);
|
||||
} else {
|
||||
/* INP_INFO lock is busy, continue later */
|
||||
TW_WLOCK(V_tw_lock);
|
||||
tw_pcbrele(tw);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return (NULL);
|
||||
}
|
||||
|
@ -353,8 +353,7 @@ struct tcptw {
|
||||
u_int t_starttime;
|
||||
int tw_time;
|
||||
TAILQ_ENTRY(tcptw) tw_2msl;
|
||||
void *tw_pspare; /* TCP_SIGNATURE */
|
||||
u_int *tw_spare; /* TCP_SIGNATURE */
|
||||
u_int tw_refcount; /* refcount */
|
||||
};
|
||||
|
||||
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
|
||||
|
Loading…
x
Reference in New Issue
Block a user