Add new socket options: TCP_KEEPINIT, TCP_KEEPIDLE, TCP_KEEPINTVL and
TCP_KEEPCNT, which allow controlling the initial timeout, idle time, idle
re-send interval and idle send count on a per-socket basis.

Reviewed by:	andre, bz, lstewart
This commit is contained in:
glebius 2012-02-05 16:53:02 +00:00
parent 96baefc0cb
commit 4326beb059
9 changed files with 171 additions and 29 deletions

View File

@ -38,7 +38,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
.Dd November 14, 2011
.Dd February 5, 2012
.Dt TCP 4
.Os
.Sh NAME
@ -146,6 +146,65 @@ connection.
See
.Xr mod_cc 4
for details.
.It Dv TCP_KEEPINIT
This write-only
.Xr setsockopt 2
option accepts a per-socket timeout argument of
.Vt "u_int"
in seconds, for new, non-established
.Tn TCP
connections.
For the global default in milliseconds see
.Va keepinit
in the
.Sx MIB Variables
section further down.
.It Dv TCP_KEEPIDLE
This write-only
.Xr setsockopt 2
option accepts an argument of
.Vt "u_int"
for the amount of time, in seconds, that the connection must be idle
before keepalive probes (if enabled) are sent for the connection of this
socket.
If set on a listening socket, the value is inherited by the newly created
socket upon
.Xr accept 2 .
For the global default in milliseconds see
.Va keepidle
in the
.Sx MIB Variables
section further down.
.It Dv TCP_KEEPINTVL
This write-only
.Xr setsockopt 2
option accepts an argument of
.Vt "u_int"
to set the per-socket interval, in seconds, between keepalive probes sent
to a peer.
If set on a listening socket, the value is inherited by the newly created
socket upon
.Xr accept 2 .
For the global default in milliseconds see
.Va keepintvl
in the
.Sx MIB Variables
section further down.
.It Dv TCP_KEEPCNT
This write-only
.Xr setsockopt 2
option accepts an argument of
.Vt "u_int"
and allows per-socket tuning of the number of unanswered probes sent
before the connection is dropped.
If set on a listening socket, the value is inherited by the newly created
socket upon
.Xr accept 2 .
For the global default see
.Va keepcnt
in the
.Sx MIB Variables
section further down.
.It Dv TCP_NODELAY
Under most circumstances,
.Tn TCP
@ -296,17 +355,21 @@ The Maximum Segment Lifetime, in milliseconds, for a packet.
Timeout, in milliseconds, for new, non-established
.Tn TCP
connections.
The default is 75000 msec.
.It Va keepidle
Amount of time, in milliseconds, that the connection must be idle
before keepalive probes (if enabled) are sent.
The default is 7200000 msec (2 hours).
.It Va keepintvl
The interval, in milliseconds, between keepalive probes sent to remote
machines, when no response is received on a
.Va keepidle
probe.
After
.Dv TCPTV_KEEPCNT
(default 8) probes are sent, with no response, the connection is dropped.
The default is 75000 msec.
.It Va keepcnt
Number of probes sent, with no response, before a connection
is dropped.
The default is 8 packets.
.It Va always_keepalive
Assume that
.Dv SO_KEEPALIVE

View File

@ -159,6 +159,10 @@ struct tcphdr {
#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */
#define TCP_INFO 0x20 /* retrieve tcp_info structure */
#define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */
/* Per-socket keepalive tuning options, set with setsockopt(2). */
#define TCP_KEEPINIT 0x80 /* N, time to establish connection */
#define TCP_KEEPIDLE 0x100 /* L,N,X start keepalives after this period */
#define TCP_KEEPINTVL 0x200 /* L,N interval between keepalives */
#define TCP_KEEPCNT 0x400 /* L,N number of keepalives before close */
#define TCP_CA_NAME_MAX 16 /* max congestion control name length */

View File

@ -1446,7 +1446,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_rcvtime = ticks;
if (TCPS_HAVEESTABLISHED(tp->t_state))
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
/*
* Unscale the window into a 32-bit value.
@ -1889,7 +1889,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
tp->t_state = TCPS_ESTABLISHED;
cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp));
}
} else {
/*
@ -2293,7 +2294,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
tp->t_state = TCPS_ESTABLISHED;
cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
}
/*
* If segment contains data or ACK, will call tcp_reass()
@ -2630,12 +2631,11 @@ process_ACK:
* compressed state.
*/
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
int timeout;
soisdisconnected(so);
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : tcp_maxidle;
tcp_timer_activate(tp, TT_2MSL, timeout);
tcp_timer_activate(tp, TT_2MSL,
(tcp_fast_finwait2_recycle ?
tcp_finwait2_timeout :
TP_MAXIDLE(tp)));
}
tp->t_state = TCPS_FIN_WAIT_2;
}

View File

@ -845,7 +845,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
*/
if (sc->sc_rxmits > 1)
tp->snd_cwnd = tp->t_maxseg;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
/*
* Copy and activate timers.
*/
tp->t_keepinit = sototcpcb(lso)->t_keepinit;
tp->t_keepidle = sototcpcb(lso)->t_keepidle;
tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
INP_WUNLOCK(inp);

View File

@ -111,12 +111,12 @@ int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
&tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
int tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
"Number of keepalive probes to send");
static int tcp_keepcnt = TCPTV_KEEPCNT;
/* max idle probes */
int tcp_maxpersistidle;
/* max idle time in persist */
int tcp_maxidle;
static int per_cpu_timers = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
@ -138,7 +138,6 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
INP_INFO_WUNLOCK(&V_tcbinfo);
@ -255,9 +254,9 @@ tcp_timer_2msl(void *xtp)
tp = tcp_close(tp);
} else {
if (tp->t_state != TCPS_TIME_WAIT &&
ticks - tp->t_rcvtime <= tcp_maxidle)
callout_reset_on(&tp->t_timers->tt_2msl, tcp_keepintvl,
tcp_timer_2msl, tp, INP_CPU(inp));
ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
callout_reset_on(&tp->t_timers->tt_2msl,
TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp));
else
tp = tcp_close(tp);
}
@ -318,7 +317,7 @@ tcp_timer_keep(void *xtp)
goto dropit;
if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
if (ticks - tp->t_rcvtime >= tcp_keepidle + tcp_maxidle)
if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
goto dropit;
/*
* Send a packet designed to force a response
@ -340,9 +339,11 @@ tcp_timer_keep(void *xtp)
tp->rcv_nxt, tp->snd_una - 1, 0);
free(t_template, M_TEMP);
}
callout_reset_on(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp, INP_CPU(inp));
callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
tcp_timer_keep, tp, INP_CPU(inp));
} else
callout_reset_on(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp, INP_CPU(inp));
callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
tcp_timer_keep, tp, INP_CPU(inp));
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)

View File

@ -153,10 +153,16 @@ struct tcp_timer {
#define TT_KEEP 0x08
#define TT_2MSL 0x10
/*
 * Effective keepalive parameters for a connection.
 *
 * Each accessor yields the per-connection field of the tcpcb when it is
 * non-zero and otherwise falls back to the corresponding global variable,
 * so a zero field means "not set on this socket".
 *
 * TP_MAXIDLE is the product of the effective probe count and the effective
 * probe interval.  Note that the macros expand (tp) more than once, so the
 * argument must be free of side effects.
 */
#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
#define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
#define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
#define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))
extern int tcp_keepinit; /* time to establish connection */
extern int tcp_keepidle; /* time before keepalive probes begin */
extern int tcp_keepintvl; /* time between keepalive probes */
extern int tcp_maxidle; /* time to drop after starting probes */
extern int tcp_keepcnt; /* number of keepalives */
extern int tcp_delacktime; /* time before sending a delayed ACK */
extern int tcp_maxpersistidle;
extern int tcp_rexmit_min;

View File

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
@ -1118,7 +1119,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@ -1191,7 +1192,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@ -1272,6 +1273,7 @@ int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
int error, opt, optval;
u_int ui;
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
@ -1439,6 +1441,59 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
break;
case TCP_KEEPIDLE:
case TCP_KEEPINTVL:
case TCP_KEEPCNT:
case TCP_KEEPINIT:
	/*
	 * Per-socket keepalive tuning.  The inpcb lock is released
	 * before the user value is copied in and re-taken (with a
	 * recheck of the pcb) before the tcpcb is modified.
	 */
	INP_WUNLOCK(inp);
	error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
	if (error)
		return (error);

	/*
	 * The time-valued options are given in seconds and stored in
	 * ticks, so they are range-checked and scaled by hz.
	 * TCP_KEEPCNT is a plain number of probes: it must be stored
	 * unscaled, otherwise TP_MAXIDLE() ends up hz times too large.
	 */
	if (sopt->sopt_name != TCP_KEEPCNT) {
		if (ui > (UINT_MAX / hz)) {
			error = EINVAL;
			break;
		}
		ui *= hz;
	}

	INP_WLOCK_RECHECK(inp);
	switch (sopt->sopt_name) {
	case TCP_KEEPIDLE:
		tp->t_keepidle = ui;
		/*
		 * XXX: better check current remaining
		 * timeout and "merge" it with new value.
		 */
		if ((tp->t_state > TCPS_LISTEN) &&
		    (tp->t_state <= TCPS_CLOSING))
			tcp_timer_activate(tp, TT_KEEP,
			    TP_KEEPIDLE(tp));
		break;
	case TCP_KEEPINTVL:
		tp->t_keepintvl = ui;
		/* Re-arm the FIN_WAIT_2 timer with the new max idle time. */
		if ((tp->t_state == TCPS_FIN_WAIT_2) &&
		    (TP_MAXIDLE(tp) > 0))
			tcp_timer_activate(tp, TT_2MSL,
			    TP_MAXIDLE(tp));
		break;
	case TCP_KEEPCNT:
		tp->t_keepcnt = ui;
		/* Re-arm the FIN_WAIT_2 timer with the new max idle time. */
		if ((tp->t_state == TCPS_FIN_WAIT_2) &&
		    (TP_MAXIDLE(tp) > 0))
			tcp_timer_activate(tp, TT_2MSL,
			    TP_MAXIDLE(tp));
		break;
	case TCP_KEEPINIT:
		tp->t_keepinit = ui;
		/* Only restart the timer while the handshake is pending. */
		if (tp->t_state == TCPS_SYN_RECEIVED ||
		    tp->t_state == TCPS_SYN_SENT)
			tcp_timer_activate(tp, TT_KEEP,
			    TP_KEEPINIT(tp));
		break;
	}
	INP_WUNLOCK(inp);
	break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@ -1636,7 +1691,7 @@ tcp_usrclosed(struct tcpcb *tp)
int timeout;
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : tcp_maxidle;
tcp_finwait2_timeout : TP_MAXIDLE(tp);
tcp_timer_activate(tp, TT_2MSL, timeout);
}
}

View File

@ -203,7 +203,12 @@ struct tcpcb {
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
uint32_t t_ispare[12]; /* 4 keep timers, 5 UTO, 3 TBD */
u_int t_keepinit; /* time to establish connection */
u_int t_keepidle; /* time before keepalive probes begin */
u_int t_keepintvl; /* interval between keepalives */
u_int t_keepcnt; /* number of keepalives before close */
uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */
void *t_pspare2[4]; /* 4 TBD */
uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) */
};

View File

@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
#define __FreeBSD_version 1000006 /* Master, propagated to newvers */
#define __FreeBSD_version 1000007 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,