Adapt OpenBSD pf's "sloopy" TCP state machine which is useful for Direct

Server Return mode, where not all packets would be visible to the load
balancer or gateway.

This commit should be reverted when we merge future pf versions.  The
benefit it would provide is that this version does not break any existing
public interface and thus won't be a problem if we want to MFC it to
earlier FreeBSD releases.

Discussed with:	mlaier
Obtained from:	OpenBSD
Sponsored by:	iXsystems, Inc.
MFC after:	1 month
This commit is contained in:
delphij 2009-12-24 00:43:44 +00:00
parent 1131ddf3b4
commit 40c18ac3ff
8 changed files with 510 additions and 348 deletions

View File

@ -28,7 +28,7 @@
.\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
.\" POSSIBILITY OF SUCH DAMAGE.
.\"
.Dd October 30, 2006
.Dd June 10, 2008
.Dt PF.CONF 5
.Os
.Sh NAME
@ -2059,6 +2059,13 @@ Changes the timeout values used for states created by this rule.
For a list of all valid timeout names, see
.Sx OPTIONS
above.
.It Ar sloppy
Uses a sloppy TCP connection tracker that does not check sequence
numbers at all, which makes insertion and ICMP teardown attacks way
easier.
This is intended to be used in situations where one does not see all
packets of a connection, i.e. in asymmetric routing situations.
Cannot be used with modulate or synproxy state.
.El
.Pp
Multiple options can be specified, separated by commas:
@ -2923,7 +2930,7 @@ tos = "tos" ( "lowdelay" | "throughput" | "reliability" |
[ "0x" ] number )
state-opts = state-opt [ [ "," ] state-opts ]
state-opt = ( "max" number | "no-sync" | timeout |
state-opt = ( "max" number | "no-sync" | timeout | sloppy |
"source-track" [ ( "rule" | "global" ) ] |
"max-src-nodes" number | "max-src-states" number |
"max-src-conn" number |

View File

@ -128,7 +128,7 @@ enum { PF_STATE_OPT_MAX, PF_STATE_OPT_NOSYNC, PF_STATE_OPT_SRCTRACK,
PF_STATE_OPT_MAX_SRC_STATES, PF_STATE_OPT_MAX_SRC_CONN,
PF_STATE_OPT_MAX_SRC_CONN_RATE, PF_STATE_OPT_MAX_SRC_NODES,
PF_STATE_OPT_OVERLOAD, PF_STATE_OPT_STATELOCK,
PF_STATE_OPT_TIMEOUT };
PF_STATE_OPT_TIMEOUT, PF_STATE_OPT_SLOPPY };
enum { PF_SRCTRACK_NONE, PF_SRCTRACK, PF_SRCTRACK_GLOBAL, PF_SRCTRACK_RULE };
@ -423,7 +423,7 @@ typedef struct {
%token QUEUE PRIORITY QLIMIT RTABLE
%token LOAD RULESET_OPTIMIZATION
%token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
%token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH
%token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY
%token TAGGED TAG IFBOUND FLOATING STATEPOLICY ROUTE
%token <v.string> STRING
%token <v.i> PORTBINARY
@ -1891,6 +1891,14 @@ pfrule : action dir logquick interface route af proto fromto
statelock = 1;
r.rule_flag |= o->data.statelock;
break;
case PF_STATE_OPT_SLOPPY:
if (r.rule_flag & PFRULE_STATESLOPPY) {
yyerror("state sloppy option: "
"multiple definitions");
YYERROR;
}
r.rule_flag |= PFRULE_STATESLOPPY;
break;
case PF_STATE_OPT_TIMEOUT:
if (o->data.timeout.number ==
PFTM_ADAPTIVE_START ||
@ -3216,6 +3224,14 @@ state_opt_item : MAXIMUM number {
$$->next = NULL;
$$->tail = $$;
}
| SLOPPY {
$$ = calloc(1, sizeof(struct node_state_opt));
if ($$ == NULL)
err(1, "state_opt_item: calloc");
$$->type = PF_STATE_OPT_SLOPPY;
$$->next = NULL;
$$->tail = $$;
}
| STRING number {
int i;
@ -4101,6 +4117,13 @@ filter_consistent(struct pf_rule *r, int anchor_call)
yyerror("keep state on block rules doesn't make sense");
problems++;
}
if (r->rule_flag & PFRULE_STATESLOPPY &&
(r->keep_state == PF_STATE_MODULATE ||
r->keep_state == PF_STATE_SYNPROXY)) {
yyerror("sloppy state matching cannot be used with "
"synproxy state or modulate state");
problems++;
}
return (-problems);
}
@ -4969,6 +4992,7 @@ lookup(char *s)
{ "scrub", SCRUB},
{ "set", SET},
{ "skip", SKIP},
{ "sloppy", SLOPPY},
{ "source-hash", SOURCEHASH},
{ "source-track", SOURCETRACK},
{ "state", STATE},

View File

@ -294,6 +294,8 @@ print_state(struct pf_state *s, int opts)
printf(", anchor %u", s->anchor.nr);
if (s->rule.nr != -1)
printf(", rule %u", s->rule.nr);
if (s->state_flags & PFSTATE_SLOPPY)
printf(", sloppy");
if (s->src_node != NULL)
printf(", source-track");
if (s->nat_src_node != NULL)

View File

@ -873,6 +873,8 @@ print_rule(struct pf_rule *r, const char *anchor_call, int verbose)
opts = 1;
if (r->rule_flag & PFRULE_IFBOUND)
opts = 1;
if (r->rule_flag & PFRULE_STATESLOPPY)
opts = 1;
for (i = 0; !opts && i < PFTM_MAX; ++i)
if (r->timeout[i])
opts = 1;
@ -939,6 +941,12 @@ print_rule(struct pf_rule *r, const char *anchor_call, int verbose)
printf("if-bound");
opts = 0;
}
if (r->rule_flag & PFRULE_STATESLOPPY) {
if (!opts)
printf(", ");
printf("sloppy");
opts = 0;
}
for (i = 0; i < PFTM_MAX; ++i)
if (r->timeout[i]) {
int j;

View File

@ -465,7 +465,7 @@ pfsync_insert_net_state(struct pfsync_state *sp, u_int8_t chksum_flag)
st->direction = sp->direction;
st->log = sp->log;
st->timeout = sp->timeout;
st->allow_opts = sp->allow_opts;
st->state_flags = sp->state_flags;
bcopy(sp->id, &st->id, sizeof(st->id));
st->creatorid = sp->creatorid;
@ -1578,7 +1578,7 @@ pfsync_pack_state(u_int8_t action, struct pf_state *st, int flags)
sp->proto = st->proto;
sp->direction = st->direction;
sp->log = st->log;
sp->allow_opts = st->allow_opts;
sp->state_flags = st->state_flags;
sp->timeout = st->timeout;
if (flags & PFSYNC_FLAG_STALE)

View File

@ -80,7 +80,7 @@ struct pfsync_state {
u_int8_t proto;
u_int8_t direction;
u_int8_t log;
u_int8_t allow_opts;
u_int8_t state_flags;
u_int8_t timeout;
u_int8_t sync_flags;
u_int8_t updates;

View File

@ -253,6 +253,13 @@ int pf_test_fragment(struct pf_rule **, int,
struct pfi_kif *, struct mbuf *, void *,
struct pf_pdesc *, struct pf_rule **,
struct pf_ruleset **);
int pf_tcp_track_full(struct pf_state_peer *,
struct pf_state_peer *, struct pf_state **,
struct pfi_kif *, struct mbuf *, int,
struct pf_pdesc *, u_short *, int *);
int pf_tcp_track_sloppy(struct pf_state_peer *,
struct pf_state_peer *, struct pf_state **,
struct pf_pdesc *, u_short *);
int pf_test_state_tcp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, u_short *);
@ -3528,7 +3535,10 @@ pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
if (r->allow_opts)
s->state_flags |= PFSTATE_ALLOWOPTS;
if (r->rule_flag & PFRULE_STATESLOPPY)
s->state_flags |= PFSTATE_SLOPPY;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
@ -3925,7 +3935,10 @@ pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
if (r->allow_opts)
s->state_flags |= PFSTATE_ALLOWOPTS;
if (r->rule_flag & PFRULE_STATESLOPPY)
s->state_flags |= PFSTATE_SLOPPY;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
@ -4238,7 +4251,10 @@ pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
if (r->allow_opts)
s->state_flags |= PFSTATE_ALLOWOPTS;
if (r->rule_flag & PFRULE_STATESLOPPY)
s->state_flags |= PFSTATE_SLOPPY;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
@ -4525,7 +4541,10 @@ pf_test_other(struct pf_rule **rm, struct pf_state **sm, int direction,
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
if (r->allow_opts)
s->state_flags |= PFSTATE_ALLOWOPTS;
if (r->rule_flag & PFRULE_STATESLOPPY)
s->state_flags |= PFSTATE_SLOPPY;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
@ -4665,6 +4684,430 @@ pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
return (PF_PASS);
}
int
pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
struct pf_pdesc *pd, u_short *reason, int *copyback)
{
struct tcphdr *th = pd->hdr.tcp;
u_int16_t win = ntohs(th->th_win);
u_int32_t ack, end, seq, orig_seq;
u_int8_t sws, dws;
int ackskew;
if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
sws = src->wscale & PF_WSCALE_MASK;
dws = dst->wscale & PF_WSCALE_MASK;
} else
sws = dws = 0;
/*
* Sequence tracking algorithm from Guido van Rooij's paper:
* http://www.madison-gurkha.com/publications/tcp_filtering/
* tcp_filtering.ps
*/
orig_seq = seq = ntohl(th->th_seq);
if (src->seqlo == 0) {
/* First packet from this end. Set its state */
if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
src->scrub == NULL) {
if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
REASON_SET(reason, PFRES_MEMORY);
return (PF_DROP);
}
}
/* Deferred generation of sequence number modulator */
if (dst->seqdiff && !src->seqdiff) {
#ifdef __FreeBSD__
while ((src->seqdiff = pf_new_isn(*state) - seq) == 0)
;
#else
while ((src->seqdiff = tcp_rndiss_next() - seq) == 0)
;
#endif
ack = ntohl(th->th_ack) - dst->seqdiff;
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
*copyback = 1;
} else {
ack = ntohl(th->th_ack);
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN) {
end++;
if (dst->wscale & PF_WSCALE_FLAG) {
src->wscale = pf_get_wscale(m, off, th->th_off,
pd->af);
if (src->wscale & PF_WSCALE_FLAG) {
/* Remove scale factor from initial
* window */
sws = src->wscale & PF_WSCALE_MASK;
win = ((u_int32_t)win + (1 << sws) - 1)
>> sws;
dws = dst->wscale & PF_WSCALE_MASK;
} else {
/* fixup other window */
dst->max_win <<= dst->wscale &
PF_WSCALE_MASK;
/* in case of a retrans SYN|ACK */
dst->wscale = 0;
}
}
}
if (th->th_flags & TH_FIN)
end++;
src->seqlo = seq;
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
/*
* May need to slide the window (seqhi may have been set by
* the crappy stack check or if we picked up the connection
* after establishment)
*/
if (src->seqhi == 1 ||
SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
src->seqhi = end + MAX(1, dst->max_win << dws);
if (win > src->max_win)
src->max_win = win;
} else {
ack = ntohl(th->th_ack) - dst->seqdiff;
if (src->seqdiff) {
/* Modulate sequence numbers */
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
*copyback = 1;
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN)
end++;
if (th->th_flags & TH_FIN)
end++;
}
if ((th->th_flags & TH_ACK) == 0) {
/* Let it pass through the ack skew check */
ack = dst->seqlo;
} else if ((ack == 0 &&
(th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
/* broken tcp stacks do not set ack */
(dst->state < TCPS_SYN_SENT)) {
/*
* Many stacks (ours included) will set the ACK number in an
* FIN|ACK if the SYN times out -- no sequence to ACK.
*/
ack = dst->seqlo;
}
if (seq == end) {
/* Ease sequencing restrictions on no data packets */
seq = src->seqlo;
end = seq;
}
ackskew = dst->seqlo - ack;
/*
* Need to demodulate the sequence numbers in any TCP SACK options
* (Selective ACK). We could optionally validate the SACK values
* against the current ACK window, either forwards or backwards, but
* I'm not confident that SACK has been implemented properly
* everywhere. It wouldn't surprise me if several stacks accidently
* SACK too far backwards of previously ACKed data. There really aren't
* any security implications of bad SACKing unless the target stack
* doesn't validate the option length correctly. Someone trying to
* spoof into a TCP connection won't bother blindly sending SACK
* options anyway.
*/
if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
if (pf_modulate_sack(m, off, pd, th, dst))
*copyback = 1;
}
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
if (SEQ_GEQ(src->seqhi, end) &&
/* Last octet inside other's window space */
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
/* Retrans: not more than one window back */
(ackskew >= -MAXACKWINDOW) &&
/* Acking not more than one reassembled fragment backwards */
(ackskew <= (MAXACKWINDOW << sws)) &&
/* Acking not more than one window forward */
((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
(orig_seq == src->seqlo + 1) || (pd->flags & PFDESC_IP_REAS) == 0)) {
/* Require an exact/+1 sequence match on resets when possible */
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/* update states */
if (th->th_flags & TH_SYN)
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_ACK) {
if (dst->state == TCPS_SYN_SENT) {
dst->state = TCPS_ESTABLISHED;
if (src->state == TCPS_ESTABLISHED &&
(*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
}
} else if (dst->state == TCPS_CLOSING)
dst->state = TCPS_FIN_WAIT_2;
}
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* update expire time */
(*state)->expire = time_second;
if (src->state >= TCPS_FIN_WAIT_2 &&
dst->state >= TCPS_FIN_WAIT_2)
(*state)->timeout = PFTM_TCP_CLOSED;
else if (src->state >= TCPS_CLOSING &&
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_FIN_WAIT;
else if (src->state < TCPS_ESTABLISHED ||
dst->state < TCPS_ESTABLISHED)
(*state)->timeout = PFTM_TCP_OPENING;
else if (src->state >= TCPS_CLOSING ||
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_CLOSING;
else
(*state)->timeout = PFTM_TCP_ESTABLISHED;
/* Fall through to PASS packet */
} else if ((dst->state < TCPS_SYN_SENT ||
dst->state >= TCPS_FIN_WAIT_2 ||
src->state >= TCPS_FIN_WAIT_2) &&
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
/* Within a window forward of the originating packet */
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
/* Within a window backward of the originating packet */
/*
* This currently handles three situations:
* 1) Stupid stacks will shotgun SYNs before their peer
* replies.
* 2) When PF catches an already established stream (the
* firewall rebooted, the state table was flushed, routes
* changed...)
* 3) Packets get funky immediately after the connection
* closes (this should catch Solaris spurious ACK|FINs
* that web servers like to spew after a close)
*
* This must be a little more careful than the above code
* since packet floods will also be caught here. We don't
* update the TTL here to mitigate the damage of a packet
* flood and so the same code can handle awkward establishment
* and a loosened connection close.
* In the establishment case, a correct peer response will
* validate the connection, go through the normal state code
* and keep updating the state TTL.
*/
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: loose state match: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
"pkts=%llu:%llu\n", seq, orig_seq, ack, pd->p_len,
#ifdef __FreeBSD__
ackskew, (unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1]);
#else
ackskew, (*state)->packets[0],
(*state)->packets[1]);
#endif
}
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/*
* Cannot set dst->seqhi here since this could be a shotgunned
* SYN and not an already established connection.
*/
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* Fall through to PASS packet */
} else {
if ((*state)->dst.state == TCPS_SYN_SENT &&
(*state)->src.state == TCPS_SYN_SENT) {
/* Send RST for state mismatches during handshake */
if (!(th->th_flags & TH_RST))
#ifdef __FreeBSD__
pf_send_tcp(m, (*state)->rule.ptr, pd->af,
#else
pf_send_tcp((*state)->rule.ptr, pd->af,
#endif
pd->dst, pd->src, th->th_dport,
th->th_sport, ntohl(th->th_ack), 0,
TH_RST, 0, 0,
(*state)->rule.ptr->return_ttl, 1, 0,
pd->eh, kif->pfik_ifp);
src->seqlo = 0;
src->seqhi = 1;
src->max_win = 1;
} else if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD state: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
#ifdef notyet
"pkts=%llu:%llu dir=%s,%s\n",
#else
"pkts=%llu:%llu%s\n",
#endif
seq, orig_seq, ack, pd->p_len, ackskew,
#ifdef __FreeBSD__
(unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1],
#else
(*state)->packets[0], (*state)->packets[1],
#endif
#ifdef notyet
direction == PF_IN ? "in" : "out",
direction == (*state)->direction ? "fwd" : "rev");
#else
"");
#endif
printf("pf: State failure on: %c %c %c %c | %c %c\n",
SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
' ': '2',
(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
}
REASON_SET(reason, PFRES_BADSTATE);
return (PF_DROP);
}
/* Any packets which have gotten here are to be passed */
return (PF_PASS);
}
int
pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
{
struct tcphdr *th = pd->hdr.tcp;
if (th->th_flags & TH_SYN)
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_ACK) {
if (dst->state == TCPS_SYN_SENT) {
dst->state = TCPS_ESTABLISHED;
if (src->state == TCPS_ESTABLISHED &&
(*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
}
} else if (dst->state == TCPS_CLOSING) {
dst->state = TCPS_FIN_WAIT_2;
} else if (src->state == TCPS_SYN_SENT &&
dst->state < TCPS_SYN_SENT) {
/*
* Handle a special sloppy case where we only see one
* half of the connection. If there is a ACK after
* the initial SYN without ever seeing a packet from
* the destination, set the connection to established.
*/
dst->state = src->state = TCPS_ESTABLISHED;
if ((*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
}
} else if (src->state == TCPS_CLOSING &&
dst->state == TCPS_ESTABLISHED &&
dst->seqlo == 0) {
/*
* Handle the closing of half connections where we
* don't see the full bidirectional FIN/ACK+ACK
* handshake.
*/
dst->state = TCPS_CLOSING;
}
}
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* update expire time */
(*state)->expire = time_second;
if (src->state >= TCPS_FIN_WAIT_2 &&
dst->state >= TCPS_FIN_WAIT_2)
(*state)->timeout = PFTM_TCP_CLOSED;
else if (src->state >= TCPS_CLOSING &&
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_FIN_WAIT;
else if (src->state < TCPS_ESTABLISHED ||
dst->state < TCPS_ESTABLISHED)
(*state)->timeout = PFTM_TCP_OPENING;
else if (src->state >= TCPS_CLOSING ||
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_CLOSING;
else
(*state)->timeout = PFTM_TCP_ESTABLISHED;
return (PF_PASS);
}
int
pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
@ -4672,10 +5115,6 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
{
struct pf_state_cmp key;
struct tcphdr *th = pd->hdr.tcp;
u_int16_t win = ntohs(th->th_win);
u_int32_t ack, end, seq, orig_seq;
u_int8_t sws, dws;
int ackskew;
int copyback = 0;
struct pf_state_peer *src, *dst;
@ -4826,337 +5265,15 @@ pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
return (PF_DROP);
}
if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
sws = src->wscale & PF_WSCALE_MASK;
dws = dst->wscale & PF_WSCALE_MASK;
} else
sws = dws = 0;
/*
* Sequence tracking algorithm from Guido van Rooij's paper:
* http://www.madison-gurkha.com/publications/tcp_filtering/
* tcp_filtering.ps
*/
orig_seq = seq = ntohl(th->th_seq);
if (src->seqlo == 0) {
/* First packet from this end. Set its state */
if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
src->scrub == NULL) {
if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
REASON_SET(reason, PFRES_MEMORY);
return (PF_DROP);
}
}
/* Deferred generation of sequence number modulator */
if (dst->seqdiff && !src->seqdiff) {
#ifdef __FreeBSD__
while ((src->seqdiff = pf_new_isn(*state) - seq) == 0)
;
#else
while ((src->seqdiff = tcp_rndiss_next() - seq) == 0)
;
#endif
ack = ntohl(th->th_ack) - dst->seqdiff;
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
copyback = 1;
} else {
ack = ntohl(th->th_ack);
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN) {
end++;
if (dst->wscale & PF_WSCALE_FLAG) {
src->wscale = pf_get_wscale(m, off, th->th_off,
pd->af);
if (src->wscale & PF_WSCALE_FLAG) {
/* Remove scale factor from initial
* window */
sws = src->wscale & PF_WSCALE_MASK;
win = ((u_int32_t)win + (1 << sws) - 1)
>> sws;
dws = dst->wscale & PF_WSCALE_MASK;
} else {
/* fixup other window */
dst->max_win <<= dst->wscale &
PF_WSCALE_MASK;
/* in case of a retrans SYN|ACK */
dst->wscale = 0;
}
}
}
if (th->th_flags & TH_FIN)
end++;
src->seqlo = seq;
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
/*
* May need to slide the window (seqhi may have been set by
* the crappy stack check or if we picked up the connection
* after establishment)
*/
if (src->seqhi == 1 ||
SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
src->seqhi = end + MAX(1, dst->max_win << dws);
if (win > src->max_win)
src->max_win = win;
if ((*state)->state_flags & PFSTATE_SLOPPY) {
if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
return (PF_DROP);
} else {
ack = ntohl(th->th_ack) - dst->seqdiff;
if (src->seqdiff) {
/* Modulate sequence numbers */
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
copyback = 1;
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN)
end++;
if (th->th_flags & TH_FIN)
end++;
if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
&copyback) == PF_DROP)
return (PF_DROP);
}
if ((th->th_flags & TH_ACK) == 0) {
/* Let it pass through the ack skew check */
ack = dst->seqlo;
} else if ((ack == 0 &&
(th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
/* broken tcp stacks do not set ack */
(dst->state < TCPS_SYN_SENT)) {
/*
* Many stacks (ours included) will set the ACK number in an
* FIN|ACK if the SYN times out -- no sequence to ACK.
*/
ack = dst->seqlo;
}
if (seq == end) {
/* Ease sequencing restrictions on no data packets */
seq = src->seqlo;
end = seq;
}
ackskew = dst->seqlo - ack;
/*
* Need to demodulate the sequence numbers in any TCP SACK options
* (Selective ACK). We could optionally validate the SACK values
* against the current ACK window, either forwards or backwards, but
* I'm not confident that SACK has been implemented properly
* everywhere. It wouldn't surprise me if several stacks accidently
* SACK too far backwards of previously ACKed data. There really aren't
* any security implications of bad SACKing unless the target stack
* doesn't validate the option length correctly. Someone trying to
* spoof into a TCP connection won't bother blindly sending SACK
* options anyway.
*/
if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
if (pf_modulate_sack(m, off, pd, th, dst))
copyback = 1;
}
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
if (SEQ_GEQ(src->seqhi, end) &&
/* Last octet inside other's window space */
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
/* Retrans: not more than one window back */
(ackskew >= -MAXACKWINDOW) &&
/* Acking not more than one reassembled fragment backwards */
(ackskew <= (MAXACKWINDOW << sws)) &&
/* Acking not more than one window forward */
((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
(orig_seq == src->seqlo + 1) || (pd->flags & PFDESC_IP_REAS) == 0)) {
/* Require an exact/+1 sequence match on resets when possible */
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, &copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/* update states */
if (th->th_flags & TH_SYN)
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_ACK) {
if (dst->state == TCPS_SYN_SENT) {
dst->state = TCPS_ESTABLISHED;
if (src->state == TCPS_ESTABLISHED &&
(*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
}
} else if (dst->state == TCPS_CLOSING)
dst->state = TCPS_FIN_WAIT_2;
}
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* update expire time */
(*state)->expire = time_second;
if (src->state >= TCPS_FIN_WAIT_2 &&
dst->state >= TCPS_FIN_WAIT_2)
(*state)->timeout = PFTM_TCP_CLOSED;
else if (src->state >= TCPS_CLOSING &&
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_FIN_WAIT;
else if (src->state < TCPS_ESTABLISHED ||
dst->state < TCPS_ESTABLISHED)
(*state)->timeout = PFTM_TCP_OPENING;
else if (src->state >= TCPS_CLOSING ||
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_CLOSING;
else
(*state)->timeout = PFTM_TCP_ESTABLISHED;
/* Fall through to PASS packet */
} else if ((dst->state < TCPS_SYN_SENT ||
dst->state >= TCPS_FIN_WAIT_2 ||
src->state >= TCPS_FIN_WAIT_2) &&
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
/* Within a window forward of the originating packet */
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
/* Within a window backward of the originating packet */
/*
* This currently handles three situations:
* 1) Stupid stacks will shotgun SYNs before their peer
* replies.
* 2) When PF catches an already established stream (the
* firewall rebooted, the state table was flushed, routes
* changed...)
* 3) Packets get funky immediately after the connection
* closes (this should catch Solaris spurious ACK|FINs
* that web servers like to spew after a close)
*
* This must be a little more careful than the above code
* since packet floods will also be caught here. We don't
* update the TTL here to mitigate the damage of a packet
* flood and so the same code can handle awkward establishment
* and a loosened connection close.
* In the establishment case, a correct peer response will
* validate the connection, go through the normal state code
* and keep updating the state TTL.
*/
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: loose state match: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
"pkts=%llu:%llu\n", seq, orig_seq, ack, pd->p_len,
#ifdef __FreeBSD__
ackskew, (unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1]);
#else
ackskew, (*state)->packets[0],
(*state)->packets[1]);
#endif
}
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, &copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/*
* Cannot set dst->seqhi here since this could be a shotgunned
* SYN and not an already established connection.
*/
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* Fall through to PASS packet */
} else {
if ((*state)->dst.state == TCPS_SYN_SENT &&
(*state)->src.state == TCPS_SYN_SENT) {
/* Send RST for state mismatches during handshake */
if (!(th->th_flags & TH_RST))
#ifdef __FreeBSD__
pf_send_tcp(m, (*state)->rule.ptr, pd->af,
#else
pf_send_tcp((*state)->rule.ptr, pd->af,
#endif
pd->dst, pd->src, th->th_dport,
th->th_sport, ntohl(th->th_ack), 0,
TH_RST, 0, 0,
(*state)->rule.ptr->return_ttl, 1, 0,
pd->eh, kif->pfik_ifp);
src->seqlo = 0;
src->seqhi = 1;
src->max_win = 1;
} else if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD state: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
"pkts=%llu:%llu dir=%s,%s\n",
seq, orig_seq, ack, pd->p_len, ackskew,
#ifdef __FreeBSD__
(unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1],
#else
(*state)->packets[0], (*state)->packets[1],
#endif
direction == PF_IN ? "in" : "out",
direction == (*state)->direction ? "fwd" : "rev");
printf("pf: State failure on: %c %c %c %c | %c %c\n",
SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
' ': '2',
(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
}
REASON_SET(reason, PFRES_BADSTATE);
return (PF_DROP);
}
/* Any packets which have gotten here are to be passed */
/* translate source/destination address, if necessary */
if (STATE_TRANSLATE(*state)) {
if (direction == PF_OUT)
@ -5533,8 +5650,9 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
copyback = 1;
}
if (!SEQ_GEQ(src->seqhi, seq) ||
!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) {
if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
(!SEQ_GEQ(src->seqhi, seq) ||
!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD ICMP %d:%d ",
icmptype, pd->hdr.icmp->icmp_code);
@ -7052,7 +7170,7 @@ pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
done:
if (action == PF_PASS && h->ip_hl > 5 &&
!((s && s->allow_opts) || r->allow_opts)) {
!((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;
@ -7513,7 +7631,7 @@ pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
done:
/* handle dangerous IPv6 extension headers. */
if (action == PF_PASS && rh_cnt &&
!((s && s->allow_opts) || r->allow_opts)) {
!((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;

View File

@ -700,6 +700,7 @@ struct pf_rule {
/* rule flags again */
#define PFRULE_IFBOUND 0x00010000 /* if-bound */
#define PFRULE_STATESLOPPY 0x00020000 /* sloppy state tracking */
#define PFSTATE_HIWAT 10000 /* default state table size */
#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */
@ -800,7 +801,9 @@ struct pf_state {
u_int8_t pad;
#endif
u_int8_t log;
u_int8_t allow_opts;
u_int8_t state_flags;
#define PFSTATE_ALLOWOPTS 0x01
#define PFSTATE_SLOPPY 0x02
u_int8_t timeout;
u_int8_t sync_flags;
#define PFSTATE_NOSYNC 0x01