tcp_input/syncache: acquire only read lock on PCB for SYN,!ACK packets
When packet is a SYN packet, we don't need to modify any existing PCB. Normally SYN arrives on a listening socket, we either create a syncache entry or generate syncookie, but we don't modify anything with the listening socket or associated PCB. Thus create a new PCB lookup mode - rlock if listening. This removes the primary contention point under SYN flood - the listening socket PCB. Sidenote: when SYN arrives on a synchronized connection, we still don't need write access to PCB to send a challenge ACK or just to drop. There is only one exclusion - tcptw recycling. However, existing entanglement of tcp_input + stacks doesn't allow to make this change small. Consider this patch as first approach to the problem. Reviewed by: rrs Differential revision: https://reviews.freebsd.org/D29576
This commit is contained in:
parent
0c6282e842
commit
08d9c92027
@ -2518,31 +2518,24 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
||||
struct inpcb *inp;
|
||||
|
||||
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
|
||||
(lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
|
||||
numa_domain);
|
||||
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
|
||||
if (inp != NULL) {
|
||||
if (lookupflags & INPLOOKUP_WLOCKPCB) {
|
||||
INP_WLOCK(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_WUNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
||||
INP_RLOCK(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_RUNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
|
||||
if (inp->inp_socket != NULL &&
|
||||
SOLISTENING(inp->inp_socket))
|
||||
INP_RLOCK(inp);
|
||||
else
|
||||
INP_WLOCK(inp);
|
||||
} else
|
||||
panic("%s: locking bug", __func__);
|
||||
#ifdef INVARIANTS
|
||||
if (inp != NULL) {
|
||||
if (lookupflags & INPLOOKUP_WLOCKPCB)
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
else
|
||||
INP_RLOCK_ASSERT(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_UNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return (inp);
|
||||
@ -2564,8 +2557,8 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
|
||||
|
||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||
("%s: LOCKPCB not set", __func__));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
||||
|
||||
/*
|
||||
* When not using RSS, use connection groups in preference to the
|
||||
@ -2600,8 +2593,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
|
||||
|
||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||
("%s: LOCKPCB not set", __func__));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
||||
|
||||
#ifdef PCBGROUP
|
||||
/*
|
||||
|
@ -761,9 +761,10 @@ int inp_so_options(const struct inpcb *inp);
|
||||
#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
|
||||
#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
|
||||
#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
|
||||
#define INPLOOKUP_RLOCKLISTEN 0x00000008 /* Rlock if listening, W if !.*/
|
||||
|
||||
#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
|
||||
INPLOOKUP_WLOCKPCB)
|
||||
INPLOOKUP_WLOCKPCB | INPLOOKUP_RLOCKLISTEN)
|
||||
|
||||
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
|
||||
|
||||
|
@ -626,6 +626,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
int drop_hdrlen;
|
||||
int thflags;
|
||||
int rstreason = 0; /* For badport_bandlim accounting purposes */
|
||||
int lookupflag;
|
||||
uint8_t iptos;
|
||||
struct m_tag *fwd_tag = NULL;
|
||||
#ifdef INET6
|
||||
@ -825,6 +826,12 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
)
|
||||
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
|
||||
|
||||
/*
|
||||
* For initial SYN packets arriving on listening socket,
|
||||
* we don't need write lock.
|
||||
*/
|
||||
lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
|
||||
INPLOOKUP_RLOCKLISTEN : INPLOOKUP_WLOCKPCB;
|
||||
findpcb:
|
||||
#ifdef INET6
|
||||
if (isipv6 && fwd_tag != NULL) {
|
||||
@ -837,7 +844,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
*/
|
||||
inp = in6_pcblookup_mbuf(&V_tcbinfo,
|
||||
&ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
|
||||
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
|
||||
lookupflag, m->m_pkthdr.rcvif, m);
|
||||
if (!inp) {
|
||||
/*
|
||||
* It's new. Try to find the ambushing socket.
|
||||
@ -847,14 +854,13 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
|
||||
th->th_sport, &next_hop6->sin6_addr,
|
||||
next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
|
||||
th->th_dport, INPLOOKUP_WILDCARD |
|
||||
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
|
||||
th->th_dport, INPLOOKUP_WILDCARD | lookupflag,
|
||||
m->m_pkthdr.rcvif);
|
||||
}
|
||||
} else if (isipv6) {
|
||||
inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
|
||||
th->th_sport, &ip6->ip6_dst, th->th_dport,
|
||||
INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
|
||||
m->m_pkthdr.rcvif, m);
|
||||
INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m);
|
||||
}
|
||||
#endif /* INET6 */
|
||||
#if defined(INET6) && defined(INET)
|
||||
@ -870,8 +876,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
* already got one like this?
|
||||
*/
|
||||
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
|
||||
ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
|
||||
m->m_pkthdr.rcvif, m);
|
||||
ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m);
|
||||
if (!inp) {
|
||||
/*
|
||||
* It's new. Try to find the ambushing socket.
|
||||
@ -881,14 +886,13 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
|
||||
th->th_sport, next_hop->sin_addr,
|
||||
next_hop->sin_port ? ntohs(next_hop->sin_port) :
|
||||
th->th_dport, INPLOOKUP_WILDCARD |
|
||||
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
|
||||
th->th_dport, INPLOOKUP_WILDCARD | lookupflag,
|
||||
m->m_pkthdr.rcvif);
|
||||
}
|
||||
} else
|
||||
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
|
||||
th->th_sport, ip->ip_dst, th->th_dport,
|
||||
INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
|
||||
m->m_pkthdr.rcvif, m);
|
||||
INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m);
|
||||
#endif /* INET */
|
||||
|
||||
/*
|
||||
@ -918,14 +922,14 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
rstreason = BANDLIM_RST_CLOSEDPORT;
|
||||
goto dropwithreset;
|
||||
}
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
INP_LOCK_ASSERT(inp);
|
||||
/*
|
||||
* While waiting for inp lock during the lookup, another thread
|
||||
* can have dropped the inpcb, in which case we need to loop back
|
||||
* and try to find a new inpcb to deliver to.
|
||||
*/
|
||||
if (inp->inp_flags & INP_DROPPED) {
|
||||
INP_WUNLOCK(inp);
|
||||
INP_UNLOCK(inp);
|
||||
inp = NULL;
|
||||
goto findpcb;
|
||||
}
|
||||
@ -1014,7 +1018,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
#endif
|
||||
|
||||
#ifdef MAC
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
if (mac_inpcb_check_deliver(inp, m))
|
||||
goto dropunlock;
|
||||
#endif
|
||||
@ -1124,8 +1127,10 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
* Socket is created in state SYN_RECEIVED.
|
||||
* Unlock the listen socket, lock the newly
|
||||
* created socket and update the tp variable.
|
||||
* If we came here via jump to tfo_socket_result,
|
||||
* then listening socket is read-locked.
|
||||
*/
|
||||
INP_WUNLOCK(inp); /* listen socket */
|
||||
INP_UNLOCK(inp); /* listen socket */
|
||||
inp = sotoinpcb(so);
|
||||
/*
|
||||
* New connection inpcb is already locked by
|
||||
@ -1213,6 +1218,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
("%s: Listen socket: TH_RST or TH_ACK set", __func__));
|
||||
KASSERT(thflags & (TH_SYN),
|
||||
("%s: Listen socket: TH_SYN not set", __func__));
|
||||
INP_RLOCK_ASSERT(inp);
|
||||
#ifdef INET6
|
||||
/*
|
||||
* If deprecated address is forbidden,
|
||||
@ -1381,7 +1387,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
|
||||
if (inp != NULL) {
|
||||
tcp_dropwithreset(m, th, tp, tlen, rstreason);
|
||||
INP_WUNLOCK(inp);
|
||||
INP_UNLOCK(inp);
|
||||
} else
|
||||
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
|
||||
m = NULL; /* mbuf chain got consumed. */
|
||||
@ -1392,7 +1398,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
|
||||
TCP_PROBE5(receive, NULL, tp, m, tp, th);
|
||||
|
||||
if (inp != NULL)
|
||||
INP_WUNLOCK(inp);
|
||||
INP_UNLOCK(inp);
|
||||
|
||||
drop:
|
||||
INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
|
||||
@ -3360,7 +3366,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
|
||||
#endif
|
||||
|
||||
if (tp != NULL) {
|
||||
INP_WLOCK_ASSERT(tp->t_inpcb);
|
||||
INP_LOCK_ASSERT(tp->t_inpcb);
|
||||
}
|
||||
|
||||
/* Don't bother if destination was broadcast/multicast. */
|
||||
|
@ -1428,7 +1428,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
|
||||
if (tp != NULL) {
|
||||
inp = tp->t_inpcb;
|
||||
KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
INP_LOCK_ASSERT(inp);
|
||||
} else
|
||||
inp = NULL;
|
||||
|
||||
|
@ -1397,7 +1397,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
|
||||
int tfo_response_cookie_valid = 0;
|
||||
bool locked;
|
||||
|
||||
INP_WLOCK_ASSERT(inp); /* listen socket */
|
||||
INP_RLOCK_ASSERT(inp); /* listen socket */
|
||||
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
|
||||
("%s: unexpected tcp flags", __func__));
|
||||
|
||||
@ -1469,13 +1469,13 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
|
||||
|
||||
#ifdef MAC
|
||||
if (mac_syncache_init(&maclabel) != 0) {
|
||||
INP_WUNLOCK(inp);
|
||||
INP_RUNLOCK(inp);
|
||||
goto done;
|
||||
} else
|
||||
mac_syncache_create(maclabel, inp);
|
||||
#endif
|
||||
if (!tfo_cookie_valid)
|
||||
INP_WUNLOCK(inp);
|
||||
INP_RUNLOCK(inp);
|
||||
|
||||
/*
|
||||
* Remember the IP options, if any.
|
||||
@ -1528,7 +1528,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
|
||||
}
|
||||
if (sc != NULL) {
|
||||
if (tfo_cookie_valid)
|
||||
INP_WUNLOCK(inp);
|
||||
INP_RUNLOCK(inp);
|
||||
TCPSTAT_INC(tcps_sc_dupsyn);
|
||||
if (ipopts) {
|
||||
/*
|
||||
@ -1735,7 +1735,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
|
||||
|
||||
if (tfo_cookie_valid) {
|
||||
syncache_tfo_expand(sc, lsop, m, tfo_response_cookie);
|
||||
/* INP_WUNLOCK(inp) will be performed by the caller */
|
||||
/* INP_RUNLOCK(inp) will be performed by the caller */
|
||||
rv = 1;
|
||||
goto tfo_expanded;
|
||||
}
|
||||
|
@ -1293,31 +1293,24 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
|
||||
struct inpcb *inp;
|
||||
|
||||
inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
|
||||
(lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
|
||||
numa_domain);
|
||||
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
|
||||
if (inp != NULL) {
|
||||
if (lookupflags & INPLOOKUP_WLOCKPCB) {
|
||||
INP_WLOCK(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_WUNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
|
||||
INP_RLOCK(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_RUNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
} else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
|
||||
if (inp->inp_socket != NULL &&
|
||||
SOLISTENING(inp->inp_socket))
|
||||
INP_RLOCK(inp);
|
||||
else
|
||||
INP_WLOCK(inp);
|
||||
} else
|
||||
panic("%s: locking bug", __func__);
|
||||
#ifdef INVARIANTS
|
||||
if (inp != NULL) {
|
||||
if (lookupflags & INPLOOKUP_WLOCKPCB)
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
else
|
||||
INP_RLOCK_ASSERT(inp);
|
||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) {
|
||||
INP_UNLOCK(inp);
|
||||
inp = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return (inp);
|
||||
}
|
||||
@ -1338,8 +1331,8 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
|
||||
|
||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||
("%s: LOCKPCB not set", __func__));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
||||
INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
|
||||
|
||||
/*
|
||||
* When not using RSS, use connection groups in preference to the
|
||||
@ -1374,7 +1367,8 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
|
||||
|
||||
KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
|
||||
("%s: invalid lookup flags %d", __func__, lookupflags));
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
|
||||
KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
|
||||
INPLOOKUP_RLOCKLISTEN)) != 0,
|
||||
("%s: LOCKPCB not set", __func__));
|
||||
|
||||
#ifdef PCBGROUP
|
||||
|
@ -487,7 +487,7 @@ void
|
||||
mac_syncache_create(struct label *label, struct inpcb *inp)
|
||||
{
|
||||
|
||||
INP_WLOCK_ASSERT(inp);
|
||||
INP_LOCK_ASSERT(inp);
|
||||
|
||||
MAC_POLICY_PERFORM_NOSLEEP(syncache_create, label, inp);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user