tcp_input: always request read-locking of PCB for any pure SYN segment.
This is further rework of 08d9c92027
. Now we carry the knowledge of
lock type all the way through tcp_input() and also into tcp_twcheck().
Ideally the rlocking for pure SYNs should propagate all the way into
the alternative TCP stacks, but not yet today.
This should close a race when socket is bind(2)-ed but not yet
listen(2)-ed and a SYN-packet arrives racing with listen(2), discovered
recently by pho@.
This commit is contained in:
parent
7b5053ce22
commit
1db08fbe3f
@ -2524,12 +2524,6 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
|||||||
INP_WLOCK(inp);
|
INP_WLOCK(inp);
|
||||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
||||||
INP_RLOCK(inp);
|
INP_RLOCK(inp);
|
||||||
} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
|
|
||||||
if (inp->inp_socket != NULL &&
|
|
||||||
SOLISTENING(inp->inp_socket))
|
|
||||||
INP_RLOCK(inp);
|
|
||||||
else
|
|
||||||
INP_WLOCK(inp);
|
|
||||||
} else
|
} else
|
||||||
panic("%s: locking bug", __func__);
|
panic("%s: locking bug", __func__);
|
||||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||||
@ -2557,8 +2551,8 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
|
|||||||
|
|
||||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
("%s: LOCKPCB not set", __func__));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When not using RSS, use connection groups in preference to the
|
* When not using RSS, use connection groups in preference to the
|
||||||
@ -2593,8 +2587,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
|||||||
|
|
||||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
("%s: LOCKPCB not set", __func__));
|
||||||
|
|
||||||
#ifdef PCBGROUP
|
#ifdef PCBGROUP
|
||||||
/*
|
/*
|
||||||
|
@ -761,10 +761,9 @@ int inp_so_options(const struct inpcb *inp);
|
|||||||
#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
|
#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
|
||||||
#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
|
#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
|
||||||
#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
|
#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
|
||||||
#define INPLOOKUP_RLOCKLISTEN 0x00000008 /* Rlock if listening, W if !.*/
|
|
||||||
|
|
||||||
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
|
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
|
||||||
INPLOOKUP_WLOCKPCB | INPLOOKUP_RLOCKLISTEN)
|
INPLOOKUP_WLOCKPCB)
|
||||||
|
|
||||||
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
|
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
|
||||||
|
|
||||||
|
@ -834,11 +834,12 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
|
|||||||
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
|
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For initial SYN packets arriving on listening socket,
|
* For initial SYN packets we don't need write lock on matching
|
||||||
* we don't need write lock.
|
* PCB, be it a listening one or a synchronized one. The packet
|
||||||
|
* shall not modify its state.
|
||||||
*/
|
*/
|
||||||
lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
|
lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
|
||||||
INPLOOKUP_RLOCKLISTEN : INPLOOKUP_WLOCKPCB;
|
INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB;
|
||||||
findpcb:
|
findpcb:
|
||||||
#ifdef INET6
|
#ifdef INET6
|
||||||
if (isipv6 && fwd_tag != NULL) {
|
if (isipv6 && fwd_tag != NULL) {
|
||||||
@ -1380,7 +1381,16 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
|
|||||||
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
|
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
|
||||||
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
|
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
|
||||||
* the inpcb, and unlocks pcbinfo.
|
* the inpcb, and unlocks pcbinfo.
|
||||||
|
*
|
||||||
|
* XXXGL: in case of a pure SYN arriving on existing connection
|
||||||
|
* TCP stacks won't need to modify the PCB, they would either drop
|
||||||
|
* the segment silently, or send a challenge ACK. However, we try
|
||||||
|
* to upgrade the lock, because calling convention for stacks is
|
||||||
|
* write-lock on PCB. If upgrade fails, drop the SYN.
|
||||||
*/
|
*/
|
||||||
|
if (lookupflag == INPLOOKUP_RLOCKPCB && INP_TRY_UPGRADE(inp) == 0)
|
||||||
|
goto dropunlock;
|
||||||
|
|
||||||
tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
|
tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
|
||||||
return (IPPROTO_DONE);
|
return (IPPROTO_DONE);
|
||||||
|
|
||||||
|
@ -377,7 +377,9 @@ tcp_twstart(struct tcpcb *tp)
|
|||||||
/*
|
/*
|
||||||
* Returns 1 if the TIME_WAIT state was killed and we should start over,
|
* Returns 1 if the TIME_WAIT state was killed and we should start over,
|
||||||
* looking for a pcb in the listen state. Returns 0 otherwise.
|
* looking for a pcb in the listen state. Returns 0 otherwise.
|
||||||
* It be called with to == NULL only for pure SYN-segments.
|
*
|
||||||
|
* For pure SYN-segments the PCB shall be read-locked and the tcpopt pointer
|
||||||
|
* may be NULL. For the rest write-lock and valid tcpopt.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
||||||
@ -388,7 +390,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
|||||||
tcp_seq seq;
|
tcp_seq seq;
|
||||||
|
|
||||||
NET_EPOCH_ASSERT();
|
NET_EPOCH_ASSERT();
|
||||||
INP_WLOCK_ASSERT(inp);
|
INP_LOCK_ASSERT(inp);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXXRW: Time wait state for inpcb has been recycled, but inpcb is
|
* XXXRW: Time wait state for inpcb has been recycled, but inpcb is
|
||||||
@ -401,8 +403,16 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
|||||||
goto drop;
|
goto drop;
|
||||||
|
|
||||||
thflags = th->th_flags;
|
thflags = th->th_flags;
|
||||||
KASSERT(to != NULL || (thflags & (TH_SYN | TH_ACK)) == TH_SYN,
|
#ifdef INVARIANTS
|
||||||
("tcp_twcheck: called without options on a non-SYN segment"));
|
if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN)
|
||||||
|
INP_RLOCK_ASSERT(inp);
|
||||||
|
else {
|
||||||
|
INP_WLOCK_ASSERT(inp);
|
||||||
|
KASSERT(to != NULL,
|
||||||
|
("%s: called without options on a non-SYN segment",
|
||||||
|
__func__));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: for FIN_WAIT_2 (to be added later),
|
* NOTE: for FIN_WAIT_2 (to be added later),
|
||||||
@ -442,6 +452,13 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
|||||||
* Allow UDP port number changes in this case.
|
* Allow UDP port number changes in this case.
|
||||||
*/
|
*/
|
||||||
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
|
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
|
||||||
|
/*
|
||||||
|
* In case we can't upgrade our lock just pretend we have
|
||||||
|
* lost this packet.
|
||||||
|
*/
|
||||||
|
if (((thflags & (TH_SYN | TH_ACK)) == TH_SYN) &&
|
||||||
|
INP_TRY_UPGRADE(inp) == 0)
|
||||||
|
goto drop;
|
||||||
tcp_twclose(tw, 0);
|
tcp_twclose(tw, 0);
|
||||||
return (1);
|
return (1);
|
||||||
}
|
}
|
||||||
@ -471,6 +488,8 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
|||||||
if ((thflags & TH_ACK) == 0)
|
if ((thflags & TH_ACK) == 0)
|
||||||
goto drop;
|
goto drop;
|
||||||
|
|
||||||
|
INP_WLOCK_ASSERT(inp);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If timestamps were negotiated during SYN/ACK and a
|
* If timestamps were negotiated during SYN/ACK and a
|
||||||
* segment without a timestamp is received, silently drop
|
* segment without a timestamp is received, silently drop
|
||||||
@ -503,7 +522,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
|
|||||||
drop:
|
drop:
|
||||||
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
|
TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
|
||||||
dropnoprobe:
|
dropnoprobe:
|
||||||
INP_WUNLOCK(inp);
|
INP_UNLOCK(inp);
|
||||||
m_freem(m);
|
m_freem(m);
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -1299,12 +1299,6 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
|
|||||||
INP_WLOCK(inp);
|
INP_WLOCK(inp);
|
||||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
||||||
INP_RLOCK(inp);
|
INP_RLOCK(inp);
|
||||||
} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
|
|
||||||
if (inp->inp_socket != NULL &&
|
|
||||||
SOLISTENING(inp->inp_socket))
|
|
||||||
INP_RLOCK(inp);
|
|
||||||
else
|
|
||||||
INP_WLOCK(inp);
|
|
||||||
} else
|
} else
|
||||||
panic("%s: locking bug", __func__);
|
panic("%s: locking bug", __func__);
|
||||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||||
@ -1331,8 +1325,8 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
|
|||||||
|
|
||||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
("%s: LOCKPCB not set", __func__));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* When not using RSS, use connection groups in preference to the
|
* When not using RSS, use connection groups in preference to the
|
||||||
@ -1367,8 +1361,7 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
|
|||||||
|
|
||||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||||
INPLOOKUP_RLOCKLISTEN)) != 0,
|
|
||||||
("%s: LOCKPCB not set", __func__));
|
("%s: LOCKPCB not set", __func__));
|
||||||
|
|
||||||
#ifdef PCBGROUP
|
#ifdef PCBGROUP
|
||||||
|
Loading…
Reference in New Issue
Block a user