From 08d9c9202755a30f97617758595214a530afcaea Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Thu, 18 Mar 2021 19:06:13 -0700 Subject: [PATCH] tcp_input/syncache: acquire only read lock on PCB for SYN,!ACK packets When packet is a SYN packet, we don't need to modify any existing PCB. Normally SYN arrives on a listening socket, we either create a syncache entry or generate syncookie, but we don't modify anything with the listening socket or associated PCB. Thus create a new PCB lookup mode - rlock if listening. This removes the primary contention point under SYN flood - the listening socket PCB. Sidenote: when SYN arrives on a synchronized connection, we still don't need write access to PCB to send a challenge ACK or just to drop. There is only one exclusion - tcptw recycling. However, existing entanglement of tcp_input + stacks doesn't allow to make this change small. Consider this patch as first approach to the problem. Reviewed by: rrs Differential revision: https://reviews.freebsd.org/D29576 --- sys/netinet/in_pcb.c | 35 +++++++++++++------------------ sys/netinet/in_pcb.h | 3 ++- sys/netinet/tcp_input.c | 42 +++++++++++++++++++++---------------- sys/netinet/tcp_subr.c | 2 +- sys/netinet/tcp_syncache.c | 10 ++++----- sys/netinet6/in6_pcb.c | 34 +++++++++++++----------------- sys/security/mac/mac_inet.c | 2 +- 7 files changed, 61 insertions(+), 67 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 40a0b4c0676e..43cd469e0fcf 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -2518,31 +2518,24 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct inpcb *inp; inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, - numa_domain); + lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_WUNLOCK(inp); - inp = NULL; - } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - inp = NULL; - } + } else if (lookupflags & INPLOOKUP_RLOCKLISTEN) { + if (inp->inp_socket != NULL && + SOLISTENING(inp->inp_socket)) + INP_RLOCK(inp); + else + INP_WLOCK(inp); } else panic("%s: locking bug", __func__); -#ifdef INVARIANTS - if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_UNLOCK(inp); + inp = NULL; } -#endif } return (inp); @@ -2564,8 +2557,8 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, - ("%s: LOCKPCB not set", __func__)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB | + INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the @@ -2600,8 +2593,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, - ("%s: LOCKPCB not set", __func__)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB | + INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 9604a837cfb4..4959fc238dfb 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -761,9 +761,10 @@ int inp_so_options(const struct inpcb *inp); #define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ #define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ #define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ +#define INPLOOKUP_RLOCKLISTEN 0x00000008 /* Rlock if listening, W if !.*/ #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ - INPLOOKUP_WLOCKPCB) + INPLOOKUP_WLOCKPCB | INPLOOKUP_RLOCKLISTEN) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index e53296670a0f..f69262230a05 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -626,6 +626,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ + int lookupflag; uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 @@ -825,6 +826,12 @@ tcp_input(struct mbuf **mp, int *offp, int proto) ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + /* + * For initial SYN packets arriving on listening socket, + * we don't need write lock. + */ + lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ? + INPLOOKUP_RLOCKLISTEN : INPLOOKUP_WLOCKPCB; findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { @@ -837,7 +844,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); + lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. @@ -847,14 +854,13 @@ tcp_input(struct mbuf **mp, int *offp, int proto) inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : - th->th_dport, INPLOOKUP_WILDCARD | - INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); + th->th_dport, INPLOOKUP_WILDCARD | lookupflag, + m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, - m->m_pkthdr.rcvif, m); + INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) @@ -870,8 +876,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, - m->m_pkthdr.rcvif, m); + ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. @@ -881,14 +886,13 @@ tcp_input(struct mbuf **mp, int *offp, int proto) inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : - th->th_dport, INPLOOKUP_WILDCARD | - INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); + th->th_dport, INPLOOKUP_WILDCARD | lookupflag, + m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, - m->m_pkthdr.rcvif, m); + INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); #endif /* INET */ /* @@ -918,14 +922,14 @@ tcp_input(struct mbuf **mp, int *offp, int proto) rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_WLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); /* * While waiting for inp lock during the lookup, another thread * can have dropped the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. */ if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); + INP_UNLOCK(inp); inp = NULL; goto findpcb; } @@ -1014,7 +1018,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto) #endif #ifdef MAC - INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif @@ -1124,8 +1127,10 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. + * If we came here via jump to tfo_socket_result, + * then listening socket is read-locked. */ - INP_WUNLOCK(inp); /* listen socket */ + INP_UNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by @@ -1213,6 +1218,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); + INP_RLOCK_ASSERT(inp); #ifdef INET6 /* * If deprecated address is forbidden, @@ -1381,7 +1387,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); - INP_WUNLOCK(inp); + INP_UNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ @@ -1392,7 +1398,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) - INP_WUNLOCK(inp); + INP_UNLOCK(inp); drop: INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); @@ -3360,7 +3366,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, #endif if (tp != NULL) { - INP_WLOCK_ASSERT(tp->t_inpcb); + INP_LOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 4ed7e16f3557..05af0076b23a 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1428,7 +1428,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); - INP_WLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); } else inp = NULL; diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 771ff44b8924..b1a0c1f7e229 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1397,7 +1397,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, int tfo_response_cookie_valid = 0; bool locked; - INP_WLOCK_ASSERT(inp); /* listen socket */ + INP_RLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); @@ -1469,13 +1469,13 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { - INP_WUNLOCK(inp); + INP_RUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif if (!tfo_cookie_valid) - INP_WUNLOCK(inp); + INP_RUNLOCK(inp); /* * Remember the IP options, if any. @@ -1528,7 +1528,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } if (sc != NULL) { if (tfo_cookie_valid) - INP_WUNLOCK(inp); + INP_RUNLOCK(inp); TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* @@ -1735,7 +1735,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if (tfo_cookie_valid) { syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); - /* INP_WUNLOCK(inp) will be performed by the caller */ + /* INP_RUNLOCK(inp) will be performed by the caller */ rv = 1; goto tfo_expanded; } diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 5fce9fcafa33..96be795d5757 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -1293,31 +1293,24 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, struct inpcb *inp; inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, - numa_domain); + lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_WUNLOCK(inp); - inp = NULL; - } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - inp = NULL; - } + } else if (lookupflags & INPLOOKUP_RLOCKLISTEN) { + if (inp->inp_socket != NULL && + SOLISTENING(inp->inp_socket)) + INP_RLOCK(inp); + else + INP_WLOCK(inp); } else panic("%s: locking bug", __func__); -#ifdef INVARIANTS - if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_UNLOCK(inp); + inp = NULL; } -#endif } return (inp); } @@ -1338,8 +1331,8 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, - ("%s: LOCKPCB not set", __func__)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB | + INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the @@ -1374,7 +1367,8 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB | + INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP diff --git a/sys/security/mac/mac_inet.c b/sys/security/mac/mac_inet.c index 97f256c118fc..2b6a70fdf1bf 100644 --- a/sys/security/mac/mac_inet.c +++ b/sys/security/mac/mac_inet.c @@ -487,7 +487,7 @@ void mac_syncache_create(struct label *label, struct inpcb *inp) { - INP_WLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); MAC_POLICY_PERFORM_NOSLEEP(syncache_create, label, inp); }