From de2d47842e880281da07f2589b9ec558b42c09c1 Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Thu, 2 Dec 2021 10:48:48 -0800 Subject: [PATCH] SMR protection for inpcbs With the introduction of epoch(9) synchronization to the network stack, the inpcb database became protected by the network epoch together with static network data (interfaces, addresses, etc). However, inpcbs aren't static in nature: they are created and destroyed all the time, which creates some traffic on the epoch(9) garbage collector. A fairly new feature of uma(9), Safe Memory Reclamation (SMR), allows memory to be freed safely in page-sized batches, with virtually zero overhead compared to uma_zfree(). However, unlike epoch(9), it puts a stricter requirement on access to the protected memory, requiring a critical(9) section to access it. Details: - The database is already built on CK lists, thanks to epoch(9). - For write access nothing is changed. - For a lookup in the database an SMR section is now required. Once the desired inpcb is found we need to transition from the SMR section to the r/w lock on the inpcb itself, with a check that the inpcb isn't yet freed. This requires some complexity, since the SMR section itself is a critical(9) section. The complexity is hidden from KPI users in inp_smr_lock(). - For an inpcb list traversal (a pcblist sysctl, or broadcast notification) a new KPI is also provided that hides the internals of the database - inp_next(struct inpcb_iterator *). 
Reviewed by: rrs Differential revision: https://reviews.freebsd.org/D33022 --- sys/kern/subr_witness.c | 4 +- sys/kern/uipc_ktls.c | 12 +- sys/netinet/in_pcb.c | 700 +++++++++++++++++++++---------------- sys/netinet/in_pcb.h | 225 +++++------- sys/netinet/in_pcb_var.h | 7 + sys/netinet/ip_divert.c | 75 ++-- sys/netinet/ip_gre.c | 18 +- sys/netinet/raw_ip.c | 284 +++++++-------- sys/netinet/tcp_hpts.c | 24 +- sys/netinet/tcp_input.c | 2 - sys/netinet/tcp_lro.c | 3 +- sys/netinet/tcp_subr.c | 74 ++-- sys/netinet/tcp_var.h | 2 - sys/netinet/udp_usrreq.c | 303 ++++++++-------- sys/netinet/udp_var.h | 10 - sys/netinet6/icmp6.c | 171 ++++----- sys/netinet6/in6_pcb.c | 112 +++--- sys/netinet6/ip6_gre.c | 19 +- sys/netinet6/raw_ip6.c | 169 ++++----- sys/netinet6/udp6_usrreq.c | 279 +++++++-------- 20 files changed, 1158 insertions(+), 1335 deletions(-) diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index e3fcbbb31311..db82450f0570 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -564,15 +564,15 @@ static struct witness_order_list_entry order_lists[] = { /* * UDP/IP */ - { "udp", &lock_class_mtx_sleep }, { "udpinp", &lock_class_rw }, + { "udp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ - { "tcp", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, + { "tcp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index 07e5a4c8399f..aee9c2374e0b 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -854,10 +854,6 @@ ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction) inp = so->so_pcb; INP_WLOCK(inp); - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); @@ -909,10 +905,6 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, int 
error; INP_RLOCK(inp); - if (inp->inp_flags2 & INP_FREED) { - INP_RUNLOCK(inp); - return (ECONNRESET); - } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); @@ -2716,8 +2708,7 @@ ktls_disable_ifnet_help(void *context, int pending __unused) INP_WLOCK(inp); so = inp->inp_socket; MPASS(so != NULL); - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { goto out; } @@ -2729,7 +2720,6 @@ ktls_disable_ifnet_help(void *context, int pending __unused) counter_u64_add(ktls_ifnet_disable_ok, 1); /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && - (inp->inp_flags2 & INP_FREED) == 0 && (tp = intotcpcb(inp)) != NULL && tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 0); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index ecca470805d2..ac8c0d3e368a 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -113,6 +113,7 @@ __FBSDID("$FreeBSD$"); #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 +#define INP_FREED 0x00000200 /* See in_pcb.h. */ static struct callout ipport_tick_callout; @@ -145,7 +146,6 @@ VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) -static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, @@ -514,38 +514,43 @@ inpcb_fini(void *mem, int size) INP_LOCK_DESTROY(inp); } +/* Make sure it is safe to use hashinit(9) on CK_LIST. */ +CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); + /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. 
*/ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, - struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, - char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) + u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, + uma_init inpcbzone_init) { - porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); - - INP_INFO_LOCK_INIT(pcbinfo, name); - INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ - INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); + mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); + mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif - pcbinfo->ipi_listhead = listhead; - CK_LIST_INIT(pcbinfo->ipi_listhead); + CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); + porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), - NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); + NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, + UMA_ZONE_SMR); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); + pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); + pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, + sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); } /* @@ -564,9 +569,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); uma_zdestroy(pcbinfo->ipi_zone); - INP_LIST_LOCK_DESTROY(pcbinfo); - 
INP_HASH_LOCK_DESTROY(pcbinfo); - INP_INFO_LOCK_DESTROY(pcbinfo); + mtx_destroy(&pcbinfo->ipi_hash_lock); + mtx_destroy(&pcbinfo->ipi_lock); } /* @@ -580,7 +584,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) int error; error = 0; - inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); + inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); @@ -612,33 +616,38 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } -#endif - INP_WLOCK(inp); - INP_LIST_WLOCK(pcbinfo); - CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); - pcbinfo->ipi_count++; - so->so_pcb = (caddr_t)inp; -#ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ - /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; - INP_LIST_WUNLOCK(pcbinfo); +#ifdef TCPHPTS + /* + * If using hpts lets drop a random number in so + * not all new connections fall on the same CPU. + */ + inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp); +#endif + refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ + INP_WLOCK(inp); + INP_INFO_WLOCK(pcbinfo); + pcbinfo->ipi_count++; + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); + INP_INFO_WUNLOCK(pcbinfo); + so->so_pcb = inp; + + return (0); + #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: - if (error != 0) { - crfree(inp->inp_cred); - uma_zfree(pcbinfo->ipi_zone, inp); - } -#endif + crfree(inp->inp_cred); + uma_zfree_smr(pcbinfo->ipi_zone, inp); return (error); +#endif } #ifdef INET @@ -1504,193 +1513,275 @@ in_pcbdetach(struct inpcb *inp) inp->inp_socket = NULL; } +/* + * inpcb hash lookups are protected by SMR section. + * + * Once desired pcb has been found, switching from SMR section to a pcb + * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK + * here because SMR is a critical section. + * In 99%+ cases inp_smr_lock() would obtain the lock immediately. + */ +static inline void +inp_lock(struct inpcb *inp, const inp_lookup_t lock) +{ + + lock == INPLOOKUP_RLOCKPCB ? + rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); +} + +static inline void +inp_unlock(struct inpcb *inp, const inp_lookup_t lock) +{ + + lock == INPLOOKUP_RLOCKPCB ? + rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); +} + +static inline int +inp_trylock(struct inpcb *inp, const inp_lookup_t lock) +{ + + return (lock == INPLOOKUP_RLOCKPCB ? + rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); +} + +static inline bool +in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) +{ + + return (lock == INPLOOKUP_RLOCKPCB ? 
+ in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); +} + +bool +inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) +{ + + MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); + SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); + + if (__predict_true(inp_trylock(inp, lock))) { + if (__predict_false(inp->inp_flags & INP_FREED)) { + smr_exit(inp->inp_pcbinfo->ipi_smr); + inp_unlock(inp, lock); + return (false); + } + smr_exit(inp->inp_pcbinfo->ipi_smr); + return (true); + } + + if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { + smr_exit(inp->inp_pcbinfo->ipi_smr); + inp_lock(inp, lock); + if (__predict_false(in_pcbrele(inp, lock))) + return (false); + /* + * inp acquired through refcount & lock for sure didn't went + * through uma_zfree(). However, it may have already went + * through in_pcbfree() and has another reference, that + * prevented its release by our in_pcbrele(). + */ + if (__predict_false(inp->inp_flags & INP_FREED)) { + inp_unlock(inp, lock); + return (false); + } + return (true); + } else { + smr_exit(inp->inp_pcbinfo->ipi_smr); + return (false); + } +} + +/* + * inp_next() - inpcb hash/list traversal iterator + * + * Requires initialized struct inpcb_iterator for context. + * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). + * + * - Iterator can have either write-lock or read-lock semantics, that can not + * be changed later. + * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through + * a single hash slot. Note: only rip_input() does the latter. + * - Iterator may have optional bool matching function. The matching function + * will be executed for each inpcb in the SMR context, so it can not acquire + * locks and can safely access only immutable fields of inpcb. + * + * A fresh initialized iterator has NULL inpcb in its context and that + * means that inp_next() call would return the very first inpcb on the list + * locked with desired semantic. 
In all following calls the context pointer + * shall hold the current inpcb pointer. The KPI user is not supposed to + * unlock the current inpcb! Upon end of traversal inp_next() will return NULL + * and write NULL to its context. After end of traversal an iterator can be + * reused. + * + * List traversals have the following features/constraints: + * - New entries won't be seen, as they are always added to the head of a list. + * - Removed entries won't stop traversal as long as they are not added to + * a different list. This is violated by in_pcbrehash(). + */ +#define II_LIST_FIRST(ipi, hash) \ + (((hash) == INP_ALL_LIST) ? \ + CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ + CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) +#define II_LIST_NEXT(inp, hash) \ + (((hash) == INP_ALL_LIST) ? \ + CK_LIST_NEXT((inp), inp_list) : \ + CK_LIST_NEXT((inp), inp_hash)) +#define II_LOCK_ASSERT(inp, lock) \ + rw_assert(&(inp)->inp_lock, \ + (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) +struct inpcb * +inp_next(struct inpcb_iterator *ii) +{ + const struct inpcbinfo *ipi = ii->ipi; + inp_match_t *match = ii->match; + void *ctx = ii->ctx; + inp_lookup_t lock = ii->lock; + int hash = ii->hash; + struct inpcb *inp; + + if (ii->inp == NULL) { /* First call. */ + smr_enter(ipi->ipi_smr); + /* This is unrolled CK_LIST_FOREACH(). */ + for (inp = II_LIST_FIRST(ipi, hash); + inp != NULL; + inp = II_LIST_NEXT(inp, hash)) { + if (match != NULL && (match)(inp, ctx) == false) + continue; + if (__predict_true(inp_smr_lock(inp, lock))) + break; + else { + smr_enter(ipi->ipi_smr); + MPASS(inp != II_LIST_FIRST(ipi, hash)); + inp = II_LIST_FIRST(ipi, hash); + } + } + + if (inp == NULL) + smr_exit(ipi->ipi_smr); + else + ii->inp = inp; + + return (inp); + } + + /* Not a first call. 
*/ + smr_enter(ipi->ipi_smr); +restart: + inp = ii->inp; + II_LOCK_ASSERT(inp, lock); +next: + inp = II_LIST_NEXT(inp, hash); + if (inp == NULL) { + smr_exit(ipi->ipi_smr); + goto found; + } + + if (match != NULL && (match)(inp, ctx) == false) + goto next; + + if (__predict_true(inp_trylock(inp, lock))) { + if (__predict_false(inp->inp_flags & INP_FREED)) { + /* + * Entries are never inserted in middle of a list, thus + * as long as we are in SMR, we can continue traversal. + * Jump to 'restart' should yield in the same result, + * but could produce unnecessary looping. Could this + * looping be unbound? + */ + inp_unlock(inp, lock); + goto next; + } else { + smr_exit(ipi->ipi_smr); + goto found; + } + } + + /* + * Can't obtain lock immediately, thus going hard. Once we exit the + * SMR section we can no longer jump to 'next', and our only stable + * anchoring point is ii->inp, which we keep locked for this case, so + * we jump to 'restart'. + */ + if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { + smr_exit(ipi->ipi_smr); + inp_lock(inp, lock); + if (__predict_false(in_pcbrele(inp, lock))) { + smr_enter(ipi->ipi_smr); + goto restart; + } + /* + * See comment in inp_smr_lock(). + */ + if (__predict_false(inp->inp_flags & INP_FREED)) { + inp_unlock(inp, lock); + smr_enter(ipi->ipi_smr); + goto restart; + } + } else + goto next; + +found: + inp_unlock(ii->inp, lock); + ii->inp = inp; + + return (ii->inp); +} + /* * in_pcbref() bumps the reference count on an inpcb in order to maintain - * stability of an inpcb pointer despite the inpcb lock being released. This - * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock may already held. + * stability of an inpcb pointer despite the inpcb lock being released or + * SMR section exited. 
* - * in_pcbref() should be used only to provide brief memory stability, and - * must always be followed by a call to INP_WLOCK() and in_pcbrele() to - * garbage collect the inpcb if it has been in_pcbfree()'d from another - * context. Until in_pcbrele() has returned that the inpcb is still valid, - * lock and rele are the *only* safe operations that may be performed on the - * inpcb. - * - * While the inpcb will not be freed, releasing the inpcb lock means that the - * connection's state may change, so the caller should be careful to - * revalidate any cached state on reacquiring the lock. Drop the reference - * using in_pcbrele(). + * To free a reference later in_pcbrele_(r|w)locked() must be performed. */ void in_pcbref(struct inpcb *inp) { + u_int old __diagused; - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - - refcount_acquire(&inp->inp_refcount); + old = refcount_acquire(&inp->inp_refcount); + KASSERT(old > 0, ("%s: refcount 0", __func__)); } /* - * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to - * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we - * return a flag indicating whether or not the inpcb remains valid. If it is - * valid, we return with the inpcb lock held. - * - * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a - * reference on an inpcb. Historically more work was done here (actually, in - * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the - * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely - * about memory stability (and continued use of the write lock). + * Drop a refcount on an inpcb elevated using in_pcbref(), potentially + * freeing the pcb, if the reference was very last. 
*/ -int +bool in_pcbrele_rlocked(struct inpcb *inp) { - struct inpcbinfo *pcbinfo; - - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); - if (refcount_release(&inp->inp_refcount) == 0) { - /* - * If the inpcb has been freed, let the caller know, even if - * this isn't the last reference. - */ - if (inp->inp_flags2 & INP_FREED) { - INP_RUNLOCK(inp); - return (1); - } - return (0); - } + if (refcount_release(&inp->inp_refcount) == 0) + return (false); - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); -#ifdef TCPHPTS - if (inp->inp_in_hpts || inp->inp_in_input) { - struct tcp_hpts_entry *hpts; - /* - * We should not be on the hpts at - * this point in any form. we must - * get the lock to be sure. - */ - hpts = tcp_hpts_lock(inp); - if (inp->inp_in_hpts) - panic("Hpts:%p inp:%p at free still on hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - hpts = tcp_input_lock(inp); - if (inp->inp_in_input) - panic("Hpts:%p inp:%p at free still on input hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - } -#endif + MPASS(inp->inp_flags & INP_FREED); + MPASS(inp->inp_socket == NULL); + MPASS(inp->inp_in_hpts == 0); + MPASS(inp->inp_in_input == 0); INP_RUNLOCK(inp); - pcbinfo = inp->inp_pcbinfo; - uma_zfree(pcbinfo->ipi_zone, inp); - return (1); + uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); + return (true); } -int +bool in_pcbrele_wlocked(struct inpcb *inp) { - struct inpcbinfo *pcbinfo; - - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); - if (refcount_release(&inp->inp_refcount) == 0) { - /* - * If the inpcb has been freed, let the caller know, even if - * this isn't the last reference. 
- */ - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return (1); - } - return (0); - } + if (refcount_release(&inp->inp_refcount) == 0) + return (false); - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); -#ifdef TCPHPTS - if (inp->inp_in_hpts || inp->inp_in_input) { - struct tcp_hpts_entry *hpts; - /* - * We should not be on the hpts at - * this point in any form. we must - * get the lock to be sure. - */ - hpts = tcp_hpts_lock(inp); - if (inp->inp_in_hpts) - panic("Hpts:%p inp:%p at free still on hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - hpts = tcp_input_lock(inp); - if (inp->inp_in_input) - panic("Hpts:%p inp:%p at free still on input hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - } -#endif + MPASS(inp->inp_flags & INP_FREED); + MPASS(inp->inp_socket == NULL); + MPASS(inp->inp_in_hpts == 0); + MPASS(inp->inp_in_input == 0); INP_WUNLOCK(inp); - pcbinfo = inp->inp_pcbinfo; - uma_zfree(pcbinfo->ipi_zone, inp); - return (1); -} - -static void -inpcbport_free(epoch_context_t ctx) -{ - struct inpcbport *phd; - - phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); - free(phd, M_PCB); -} - -static void -in_pcbfree_deferred(epoch_context_t ctx) -{ - struct inpcb *inp; - int released __unused; - - inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); - - INP_WLOCK(inp); - CURVNET_SET(inp->inp_vnet); -#ifdef INET - struct ip_moptions *imo = inp->inp_moptions; - inp->inp_moptions = NULL; -#endif - /* XXXRW: Do as much as possible here. 
*/ -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif -#ifdef INET6 - struct ip6_moptions *im6o = NULL; - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - im6o = inp->in6p_moptions; - inp->in6p_moptions = NULL; - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); - inp->inp_vflag = 0; - crfree(inp->inp_cred); -#ifdef MAC - mac_inpcb_destroy(inp); -#endif - released = in_pcbrele_wlocked(inp); - MPASS(released); -#ifdef INET6 - ip6_freemoptions(im6o); -#endif -#ifdef INET - inp_freemoptions(imo); -#endif - CURVNET_RESTORE(); + uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); + return (true); } /* @@ -1698,32 +1789,81 @@ in_pcbfree_deferred(epoch_context_t ctx) * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is - * released using in_pcbrele(), but the inpcb is still unlocked. Almost all - * work, including removal from global lists, is done in this context, where - * the pcbinfo lock is held. + * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. + * Almost all work, including removal from global lists, is done in this + * context, where the pcbinfo lock is held. 
*/ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT((inp->inp_flags2 & INP_FREED) == 0, - ("%s: called twice for pcb %p", __func__, inp)); - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return; - } +#ifdef INET + struct ip_moptions *imo; +#endif +#ifdef INET6 + struct ip6_moptions *im6o; +#endif INP_WLOCK_ASSERT(inp); - INP_LIST_WLOCK(pcbinfo); - in_pcbremlists(inp); - INP_LIST_WUNLOCK(pcbinfo); + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + KASSERT((inp->inp_flags & INP_FREED) == 0, + ("%s: called twice for pcb %p", __func__, inp)); + + inp->inp_flags |= INP_FREED; + INP_INFO_WLOCK(pcbinfo); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + pcbinfo->ipi_count--; + CK_LIST_REMOVE(inp, inp_list); + INP_INFO_WUNLOCK(pcbinfo); + + if (inp->inp_flags & INP_INHASHLIST) { + struct inpcbport *phd = inp->inp_phd; + + INP_HASH_WLOCK(pcbinfo); + /* XXX: Only do if SO_REUSEPORT_LB set? 
*/ + in_pcbremlbgrouphash(inp); + + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + uma_zfree_smr(pcbinfo->ipi_portzone, phd); + } + INP_HASH_WUNLOCK(pcbinfo); + inp->inp_flags &= ~INP_INHASHLIST; + } + + crfree(inp->inp_cred); RO_INVALIDATE_CACHE(&inp->inp_route); - /* mark as destruction in progress */ - inp->inp_flags2 |= INP_FREED; - INP_WUNLOCK(inp); - NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif +#ifdef INET + if (inp->inp_options) + (void)m_free(inp->inp_options); + imo = inp->inp_moptions; +#endif +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + im6o = inp->in6p_moptions; + } else + im6o = NULL; +#endif + + if (__predict_false(in_pcbrele_wlocked(inp) == false)) { + INP_WUNLOCK(inp); + } +#ifdef INET6 + ip6_freemoptions(im6o); +#endif +#ifdef INET + inp_freemoptions(imo); +#endif } /* @@ -1764,7 +1904,7 @@ in_pcbdrop(struct inpcb *inp) CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); - NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); + uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; @@ -1835,7 +1975,7 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { + CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { @@ -1854,49 +1994,57 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, INP_INFO_WUNLOCK(pcbinfo); } +static bool 
+inp_v4_multi_match(const struct inpcb *inp, void *v __unused) +{ + + if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) + return (true); + else + return (false); +} + void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, + inp_v4_multi_match, NULL); struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(inp); - imo = inp->inp_moptions; - if ((inp->inp_vflag & INP_IPV4) && - imo != NULL) { - /* - * Unselect the outgoing interface if it is being - * detached. - */ - if (imo->imo_multicast_ifp == ifp) - imo->imo_multicast_ifp = NULL; + IN_MULTI_LOCK_ASSERT(); - /* - * Drop multicast group membership if we joined - * through the interface being detached. - * - * XXX This can all be deferred to an epoch_call - */ + while ((inp = inp_next(&inpi)) != NULL) { + INP_WLOCK_ASSERT(inp); + + imo = inp->inp_moptions; + /* + * Unselect the outgoing interface if it is being + * detached. + */ + if (imo->imo_multicast_ifp == ifp) + imo->imo_multicast_ifp = NULL; + + /* + * Drop multicast group membership if we joined + * through the interface being detached. 
+ * + * XXX This can all be deferred to an epoch_call + */ restart: - IP_MFILTER_FOREACH(imf, &imo->imo_head) { - if ((inm = imf->imf_inm) == NULL) - continue; - if (inm->inm_ifp != ifp) - continue; - ip_mfilter_remove(&imo->imo_head, imf); - IN_MULTI_LOCK_ASSERT(); - in_leavegroup_locked(inm, NULL); - ip_mfilter_free(imf); - goto restart; - } + IP_MFILTER_FOREACH(imf, &imo->imo_head) { + if ((inm = imf->imf_inm) == NULL) + continue; + if (inm->inm_ifp != ifp) + continue; + ip_mfilter_remove(&imo->imo_head, imf); + in_leavegroup_locked(inm, NULL); + ip_mfilter_free(imf); + goto restart; } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -1918,7 +2066,6 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { @@ -2081,8 +2228,9 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes - * that the caller has locked the hash list, and will not perform any further - * locking or reference operations on either the hash list or the connection. + * that the caller has either locked the hash list, which usually happens + * for bind(2) operations, or is in SMR section, which happens when sorting + * out incoming packets. 
*/ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, @@ -2223,20 +2371,15 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, { struct inpcb *inp; + smr_enter(pcbinfo->ipi_smr); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - } else if (lookupflags & INPLOOKUP_RLOCKPCB) { - INP_RLOCK(inp); - } else - panic("%s: locking bug", __func__); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_UNLOCK(inp); + if (__predict_false(inp_smr_lock(inp, + (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; - } - } + } else + smr_exit(pcbinfo->ipi_smr); return (inp); } @@ -2331,11 +2474,10 @@ in_pcbinshash(struct inpcb *inp) * If none exists, malloc one and tack it on. */ if (phd == NULL) { - phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); + phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } - bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); @@ -2353,6 +2495,10 @@ in_pcbinshash(struct inpcb *inp) * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. + * + * XXXGL: a race between this function and SMR-protected hash iterator + * will lead to iterator traversing a possibly wrong hash list. However, + * this race should have been here since change from rwlock to epoch. */ void in_pcbrehash(struct inpcb *inp) @@ -2381,39 +2527,6 @@ in_pcbrehash(struct inpcb *inp) CK_LIST_INSERT_HEAD(head, inp, inp_hash); } -/* - * Remove PCB from various lists. 
- */ -static void -in_pcbremlists(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - - INP_WLOCK_ASSERT(inp); - INP_LIST_WLOCK_ASSERT(pcbinfo); - - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - if (inp->inp_flags & INP_INHASHLIST) { - struct inpcbport *phd = inp->inp_phd; - - INP_HASH_WLOCK(pcbinfo); - - /* XXX: Only do if SO_REUSEPORT_LB set? */ - in_pcbremlbgrouphash(inp); - - CK_LIST_REMOVE(inp, inp_hash); - CK_LIST_REMOVE(inp, inp_portlist); - if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { - CK_LIST_REMOVE(phd, phd_hash); - NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); - } - INP_HASH_WUNLOCK(pcbinfo); - inp->inp_flags &= ~INP_INHASHLIST; - } - CK_LIST_REMOVE(inp, inp_list); - pcbinfo->ipi_count--; -} - /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached @@ -2548,15 +2661,12 @@ inp_unlock_assert(struct inpcb *inp) void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); struct inpcb *inp; - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) func(inp, arg); - INP_WUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 47ecbd4f121b..305356914d14 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -49,7 +49,9 @@ #ifdef _KERNEL #include +#include #include +#include #include #include #endif @@ -133,32 +135,19 @@ struct in_conninfo { * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb - * are static after creation or protected by a per-inpcb rwlock, inp_lock. 
A - * few fields are protected by multiple locks as indicated in the locking notes - * below. For these fields, all of the listed locks must be write-locked for - * any modifications. However, these fields can be safely read while any one of - * the listed locks are read-locked. This model can permit greater concurrency - * for read operations. For example, connections can be looked up while only - * holding a read lock on the global pcblist lock. This is important for - * performance when attempting to find the connection for a packet given its IP - * and port tuple. + * are static after creation or protected by a per-inpcb rwlock, inp_lock. * - * One noteworthy exception is that the global pcbinfo lock follows a different - * set of rules in relation to the inp_list field. Rather than being - * write-locked for modifications and read-locked for list iterations, it must - * be read-locked during modifications and write-locked during list iterations. - * This ensures that the relatively rare global list iterations safely walk a - * stable snapshot of connections while allowing more common list modifications - * to safely grab the pcblist lock just while adding or removing a connection - * from the global list. + * An inpcb database is indexed by addresses/ports hash as well as list of + * all pcbs that belong to a certain proto. Database lookups or list traversals + * are performed inside SMR section. Once desired PCB is found its own + * lock is to be obtained and SMR section exited. * * Key: * (b) - Protected by the hpts lock. 
* (c) - Constant after initialization - * (e) - Protected by the net_epoch_prempt epoch + * (e) - Protected by the SMR section * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb - * (l) - Protected by the pcblist lock for the inpcb * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking @@ -219,17 +208,13 @@ struct in_conninfo { * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). - * - * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock - * read-lock usage during modification, this model can be applied to other - * protocols (especially SCTP). */ struct icmp6_filter; struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -311,8 +296,8 @@ struct inpcb { int in6p_cksum; short in6p_hops; }; - CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ - struct inpcbport *inp_phd; /* (i/h) head of this list */ + CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */ + struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ void *spare_ptr; /* Spare pointer. 
*/ rt_gen_t inp_rt_cookie; /* generation for route entry */ @@ -320,10 +305,7 @@ struct inpcb { struct route inp_route; struct route_in6 inp_route6; }; - CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ - /* (e[r]) for list iteration */ - /* (p[w]/l) for addition/removal */ - struct epoch_context inp_epoch_ctx; + CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */ }; #endif /* _KERNEL */ @@ -396,80 +378,58 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); #endif #endif /* _SYS_SOCKETVAR_H_ */ -struct inpcbport { - struct epoch_context phd_epoch_ctx; - CK_LIST_ENTRY(inpcbport) phd_hash; - struct inpcbhead phd_pcblist; - u_short phd_port; -}; - -/*- +#ifdef _KERNEL +/* * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and - * ipi_list_lock: - * - ipi_lock covering the global pcb list stability during loop iteration, - * - ipi_hash_lock covering the hashed lookup tables, - * - ipi_list_lock covering mutable global fields (such as the global - * pcb list) - * - * The lock order is: - * - * ipi_lock (before) - * inpcb locks (before) - * ipi_list locks (before) + * The pcbs are protected with SMR section and thus all lists in inpcbinfo + * are CK-lists. Locking is required to insert a pcb into database. Two + * locks are provided: one for the hash and one for the global list of pcbs, + * as well as overall count and generation count. 
* * Locking key: * * (c) Constant or nearly constant after initialisation - * (e) - Protected by the net_epoch_prempt epoch + * (e) Protected by SMR section * (g) Locked by ipi_lock - * (l) Locked by ipi_list_lock - * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock - * (x) Synchronisation properties poorly defined + * (h) Locked by ipi_hash_lock */ struct inpcbinfo { /* * Global lock protecting inpcb list modification */ struct mtx ipi_lock; - - /* - * Global list of inpcbs on the protocol. - */ - struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ - u_int ipi_count; /* (l) */ + struct inpcbhead ipi_listhead; /* (r:e/w:g) */ + u_int ipi_count; /* (g) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ - u_quad_t ipi_gencnt; /* (l) */ + u_quad_t ipi_gencnt; /* (g) */ /* * Fields associated with port lookup and allocation. */ - u_short ipi_lastport; /* (x) */ - u_short ipi_lastlow; /* (x) */ - u_short ipi_lasthi; /* (x) */ + u_short ipi_lastport; /* (h) */ + u_short ipi_lastlow; /* (h) */ + u_short ipi_lasthi; /* (h) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ - struct uma_zone *ipi_zone; /* (c) */ - - /* - * Global lock protecting modification hash lookup tables. - */ - struct mtx ipi_hash_lock; + uma_zone_t ipi_zone; /* (c) */ + uma_zone_t ipi_portzone; /* (c) */ + smr_t ipi_smr; /* (c) */ /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ - struct inpcbhead *ipi_hashbase; /* (h) */ - u_long ipi_hashmask; /* (h) */ + struct mtx ipi_hash_lock; + struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */ + u_long ipi_hashmask; /* (c) */ /* * Global hash of inpcbs, hashed by only local port number. @@ -481,26 +441,15 @@ struct inpcbinfo { * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. 
*/ - struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */ + struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */ u_long ipi_lbgrouphashmask; /* (h) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ - - /* - * general use 2 - */ - void *ipi_pspare[2]; - - /* - * Global lock protecting global inpcb list, inpcb count, etc. - */ - struct rwlock ipi_list_lock; }; -#ifdef _KERNEL /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most @@ -523,7 +472,7 @@ struct inpcblbgroup { }; #define INP_LOCK_INIT(inp, d, t) \ - rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) + rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) @@ -571,51 +520,21 @@ int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ -#define INP_INFO_LOCK_INIT(ipi, d) \ - mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) -#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ + mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) #define INP_INFO_WUNLOCK_ASSERT(ipi) \ - mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) + mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) -#define INP_LIST_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) -#define INP_LIST_LOCK_DESTROY(ipi) 
rw_destroy(&(ipi)->ipi_list_lock) -#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) -#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) -#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) -#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) -#define INP_LIST_LOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) -#define INP_LIST_RLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) -#define INP_LIST_WLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) -#define INP_LIST_UNLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) - -#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) -#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) -#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); - -#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ - MTX_DEF | MTX_DUPOK) -#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) - -#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) -#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) -#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ + mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \ + MA_OWNED) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) @@ -644,7 +563,7 @@ 
int inp_so_options(const struct inpcb *inp); #define INP_ANONPORT 0x00000040 /* port chosen for user */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ - /* 0x000200 unused: was INP_FAITH */ +/* INP_FREED 0x00000200 private to in_pcb.c */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ @@ -682,7 +601,7 @@ int inp_so_options(const struct inpcb *inp); #define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */ /* 0x00000004 */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ -#define INP_FREED 0x00000010 /* inp itself is not valid */ +/* 0x00000010 */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ @@ -702,15 +621,19 @@ int inp_so_options(const struct inpcb *inp); #define INP_2PCP_BASE INP_2PCP_BIT0 #define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2) #define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */ + /* - * Flags passed to in_pcblookup*() functions. + * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next(). */ -#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ -#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ -#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ +typedef enum { + INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */ + INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */ + INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. 
*/ +} inp_lookup_t; #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) +#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) @@ -718,13 +641,6 @@ int inp_so_options(const struct inpcb *inp); #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) -/* - * Constants for pcbinfo.ipi_hashfields. - */ -#define IPI_HASHFIELDS_NONE 0 -#define IPI_HASHFIELDS_2TUPLE 1 -#define IPI_HASHFIELDS_4TUPLE 2 - #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -755,8 +671,8 @@ VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_tcpallocs VNET(ipport_tcpallocs) void in_pcbinfo_destroy(struct inpcbinfo *); -void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, u_int); +void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *, + uma_init); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); @@ -788,8 +704,37 @@ void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); -int in_pcbrele_rlocked(struct inpcb *); -int in_pcbrele_wlocked(struct inpcb *); +bool in_pcbrele_rlocked(struct inpcb *); +bool in_pcbrele_wlocked(struct inpcb *); + +typedef bool inp_match_t(const struct inpcb *, void *); +struct inpcb_iterator { + const struct inpcbinfo *ipi; + struct inpcb *inp; + inp_match_t *match; + void *ctx; + int hash; +#define INP_ALL_LIST -1 + const inp_lookup_t lock; +}; + +/* Note: sparse initializers guarantee .inp = NULL. 
*/ +#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \ + { \ + .ipi = (_ipi), \ + .lock = (_lock), \ + .hash = INP_ALL_LIST, \ + .match = (_match), \ + .ctx = (_ctx), \ + } +#define INP_ALL_ITERATOR(_ipi, _lock) \ + { \ + .ipi = (_ipi), \ + .lock = (_lock), \ + .hash = INP_ALL_LIST, \ + } + +struct inpcb *inp_next(struct inpcb_iterator *); void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h index 5038ab404871..4db20418708d 100644 --- a/sys/netinet/in_pcb_var.h +++ b/sys/netinet/in_pcb_var.h @@ -44,6 +44,7 @@ * Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c */ +bool inp_smr_lock(struct inpcb *, const inp_lookup_t); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, @@ -52,4 +53,10 @@ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); +struct inpcbport { + struct inpcbhead phd_pcblist; + CK_LIST_ENTRY(inpcbport) phd_hash; + u_short phd_port; +}; + #endif /* !_NETINET_IN_PCB_VAR_H_ */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 265fc1918d82..cd0034008dc2 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -111,10 +111,7 @@ __FBSDID("$FreeBSD$"); */ /* Internal variables. */ -VNET_DEFINE_STATIC(struct inpcbhead, divcb); VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); - -#define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ @@ -154,8 +151,7 @@ div_init(void) * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. 
*/ - in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, IPI_HASHFIELDS_NONE); + in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init); } static void @@ -181,6 +177,14 @@ div_input(struct mbuf **mp, int *offp, int proto) return (IPPROTO_DONE); } +static bool +div_port_match(const struct inpcb *inp, void *v) +{ + uint16_t nport = *(uint16_t *)v; + + return (inp->inp_lport == nport); +} + /* * Divert a packet by passing it up to the divert socket at port 'port'. * @@ -195,6 +199,8 @@ divert_packet(struct mbuf *m, bool incoming) struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; + struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo, + INPLOOKUP_RLOCKPCB, div_port_match, &nport); struct m_tag *mtag; NET_EPOCH_ASSERT(); @@ -288,27 +294,20 @@ divert_packet(struct mbuf *m, bool incoming) /* Put packet on socket queue, if any */ sa = NULL; + /* nport is inp_next's context. */ nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); - CK_LIST_FOREACH(inp, &V_divcb, inp_list) { + while ((inp = inp_next(&inpi)) != NULL) { + sa = inp->inp_socket; + SOCKBUF_LOCK(&sa->so_rcv); + if (sbappendaddr_locked(&sa->so_rcv, + (struct sockaddr *)&divsrc, m, NULL) == 0) { + soroverflow_locked(sa); + sa = NULL; /* force mbuf reclaim below */ + } else + sorwakeup_locked(sa); /* XXX why does only one socket match? 
*/ - if (inp->inp_lport == nport) { - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } - sa = inp->inp_socket; - SOCKBUF_LOCK(&sa->so_rcv); - if (sbappendaddr_locked(&sa->so_rcv, - (struct sockaddr *)&divsrc, m, - (struct mbuf *)0) == 0) { - soroverflow_locked(sa); - sa = NULL; /* force mbuf reclaim below */ - } else - sorwakeup_locked(sa); - INP_RUNLOCK(inp); - break; - } + INP_RUNLOCK(inp); + break; } if (sa == NULL) { m_freem(m); @@ -603,14 +602,10 @@ div_attach(struct socket *so, int proto, struct thread *td) error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; - INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); - if (error) { - INP_INFO_WUNLOCK(&V_divcbinfo); + if (error) return error; - } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; @@ -625,11 +620,9 @@ div_detach(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); - INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_divcbinfo); } static int @@ -652,13 +645,11 @@ div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) if (nam->sa_len != sizeof(struct sockaddr_in)) return EINVAL; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; - INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_divcbinfo); return error; } @@ -697,8 +688,9 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, static int div_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -726,21 +718,18 @@ 
div_pcblist(SYSCTL_HANDLER_ARGS) if (error) return error; - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - } else - INP_RUNLOCK(inp); + if (error) { + INP_RUNLOCK(inp); + break; + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* diff --git a/sys/netinet/ip_gre.c b/sys/netinet/ip_gre.c index 6a2135fa32cd..a70452026642 100644 --- a/sys/netinet/ip_gre.c +++ b/sys/netinet/ip_gre.c @@ -223,25 +223,11 @@ static void in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { - struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; in_addr_t dst; - NET_EPOCH_ENTER(et); - /* - * udp_append() holds reference to inp, it is safe to check - * inp_flags2 without INP_RLOCK(). - * If socket was closed before we have entered NET_EPOCH section, - * INP_FREED flag should be set. Otherwise it should be safe to - * make access to ctx data, because gre_so will be freed by - * gre_sofree() via NET_EPOCH_CALL(). 
- */ - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - NET_EPOCH_EXIT(et); - m_freem(m); - return; - } + NET_EPOCH_ASSERT(); gs = (struct gre_socket *)ctx; dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr; @@ -251,11 +237,9 @@ in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); - NET_EPOCH_EXIT(et); return; } m_freem(m); - NET_EPOCH_EXIT(et); } static int diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 38ab5f4a8243..de4e6e851c32 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -87,10 +87,7 @@ SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); -VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); - -#define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* @@ -160,7 +157,7 @@ rip_inshash(struct inpcb *inp) struct inpcbhead *pcbhash; int hash; - INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && @@ -178,7 +175,7 @@ static void rip_delhash(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); CK_LIST_REMOVE(inp, inp_hash); @@ -212,8 +209,8 @@ void rip_init(void) { - in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE); + in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb", + rip_inpcb_init); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -230,47 +227,90 @@ VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #ifdef INET static int -rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, +rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m, struct sockaddr_in *ripsrc) { - int 
policyfail = 0; + struct socket *so = inp->inp_socket; + struct mbuf *n, *opts = NULL; - INP_LOCK_ASSERT(last); + INP_LOCK_ASSERT(inp); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* check AH/ESP integrity. */ - if (IPSEC_ENABLED(ipv4)) { - if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0) - policyfail = 1; - } + if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) + return (0); #endif /* IPSEC */ #ifdef MAC - if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) - policyfail = 1; + if (mac_inpcb_check_deliver(inp, m) != 0) + return (0); #endif /* Check the minimum TTL for socket. */ - if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) - policyfail = 1; - if (!policyfail) { - struct mbuf *opts = NULL; - struct socket *so; + if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) + return (0); - so = last->inp_socket; - if ((last->inp_flags & INP_CONTROLOPTS) || - (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) - ip_savecontrol(last, &opts, ip, n); - SOCKBUF_LOCK(&so->so_rcv); - if (sbappendaddr_locked(&so->so_rcv, - (struct sockaddr *)ripsrc, n, opts) == 0) { - soroverflow_locked(so); - m_freem(n); - if (opts) - m_freem(opts); - } else - sorwakeup_locked(so); - } else + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) + return (0); + + if ((inp->inp_flags & INP_CONTROLOPTS) || + (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) + ip_savecontrol(inp, &opts, ip, n); + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, + (struct sockaddr *)ripsrc, n, opts) == 0) { + soroverflow_locked(so); m_freem(n); - return (policyfail); + if (opts) + m_freem(opts); + return (0); + } + sorwakeup_locked(so); + + return (1); +} + +struct rip_inp_match_ctx { + struct ip *ip; + int proto; +}; + +static bool +rip_inp_match1(const struct inpcb *inp, void *v) +{ + struct rip_inp_match_ctx *ctx = v; + + if (inp->inp_ip_p != ctx->proto) + return (false); +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); 
+#endif + if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr) + return (false); + if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr) + return (false); + return (true); +} + +static bool +rip_inp_match2(const struct inpcb *inp, void *v) +{ + struct rip_inp_match_ctx *ctx = v; + + if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto) + return (false); +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); +#endif + if (!in_nullhost(inp->inp_laddr) && + !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst)) + return (false); + if (!in_nullhost(inp->inp_faddr) && + !in_hosteq(inp->inp_faddr, ctx->ip->ip_src)) + return (false); + return (true); } /* @@ -280,102 +320,57 @@ rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int rip_input(struct mbuf **mp, int *offp, int proto) { + struct rip_inp_match_ctx ctx = { + .ip = mtod(*mp, struct ip *), + .proto = proto, + }; + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx); struct ifnet *ifp; struct mbuf *m = *mp; - struct ip *ip = mtod(m, struct ip *); - struct inpcb *inp, *last; + struct inpcb *inp; struct sockaddr_in ripsrc; - int hash; - - NET_EPOCH_ASSERT(); + int appended; *mp = NULL; + appended = 0; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; - ripsrc.sin_addr = ip->ip_src; - last = NULL; + ripsrc.sin_addr = ctx.ip->ip_src; ifp = m->m_pkthdr.rcvif; - hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, - ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); - CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { - if (inp->inp_ip_p != proto) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) - continue; - if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) - continue; - if (last != NULL) { - struct mbuf *n; - - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (n != NULL) - (void) 
rip_append(last, ip, n, &ripsrc); - /* XXX count dropped packet */ - INP_RUNLOCK(last); - last = NULL; - } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_1; - if (jailed_without_vnet(inp->inp_cred)) { + inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr, + ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + if (jailed_without_vnet(inp->inp_cred) && + prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. */ - if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - goto skip_1; + continue; } - last = inp; - continue; - skip_1: - INP_RUNLOCK(inp); + appended += rip_append(inp, ctx.ip, m, &ripsrc); } - CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { - if (inp->inp_ip_p && inp->inp_ip_p != proto) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (!in_nullhost(inp->inp_laddr) && - !in_hosteq(inp->inp_laddr, ip->ip_dst)) - continue; - if (!in_nullhost(inp->inp_faddr) && - !in_hosteq(inp->inp_faddr, ip->ip_src)) - continue; - if (last != NULL) { - struct mbuf *n; - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (n != NULL) - (void) rip_append(last, ip, n, &ripsrc); - /* XXX count dropped packet */ - INP_RUNLOCK(last); - last = NULL; - } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_2; - if (jailed_without_vnet(inp->inp_cred)) { + inpi.hash = 0; + inpi.match = rip_inp_match2; + MPASS(inpi.inp == NULL); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + if (jailed_without_vnet(inp->inp_cred) && + !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) && + prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if 
so. */ - if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && - prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - goto skip_2; - } + continue; /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket @@ -383,7 +378,7 @@ rip_input(struct mbuf **mp, int *offp, int proto) * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && - IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. @@ -405,7 +400,7 @@ rip_input(struct mbuf **mp, int *offp, int proto) bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; - group.sin_addr = ip->ip_dst; + group.sin_addr = ctx.ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, @@ -415,27 +410,18 @@ rip_input(struct mbuf **mp, int *offp, int proto) if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); - goto skip_2; + continue; } } - last = inp; - continue; - skip_2: - INP_RUNLOCK(inp); - } - if (last != NULL) { - if (rip_append(last, ip, m, &ripsrc) != 0) - IPSTAT_INC(ips_delivered); - INP_RUNLOCK(last); - } else { - if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { - IPSTAT_INC(ips_noproto); - IPSTAT_DEC(ips_delivered); - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); - } else { - m_freem(m); - } + appended += rip_append(inp, ctx.ip, m, &ripsrc); } + if (appended == 0 && + inetsw[ip_protox[ctx.ip->ip_p]].pr_input == rip_input) { + IPSTAT_INC(ips_noproto); + IPSTAT_DEC(ips_delivered); + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); + } else + m_freem(m); return (IPPROTO_DONE); } @@ -898,18 +884,16 @@ rip_attach(struct socket *so, int proto, struct thread *td) error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); - INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); - if (error) { - 
INP_INFO_WUNLOCK(&V_ripcbinfo); + if (error) return (error); - } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; + INP_HASH_WLOCK(&V_ripcbinfo); rip_inshash(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); + INP_HASH_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } @@ -924,9 +908,10 @@ rip_detach(struct socket *so) KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); + INP_HASH_WUNLOCK(&V_ripcbinfo); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) @@ -935,7 +920,6 @@ rip_detach(struct socket *so) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -944,16 +928,16 @@ rip_dodisconnect(struct socket *so, struct inpcb *inp) struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(pcbinfo); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); + INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); } static void @@ -1019,13 +1003,13 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); + INP_HASH_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1045,14 +1029,14 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); + 
INP_HASH_WUNLOCK(&V_ripcbinfo); soisconnected(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1118,8 +1102,9 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, static int rip_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -1147,24 +1132,19 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - } else - INP_RUNLOCK(inp); + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 9ec092f703ec..764d46a1580b 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -579,28 +579,10 @@ tcp_input_lock(struct inpcb *inp) static void tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { - int32_t add_freed; int32_t ret; - if (inp->inp_flags2 & INP_FREED) { - /* - * Need to play a special trick so that in_pcbrele_wlocked - * does not return 1 when it really should have returned 0. 
- */ - add_freed = 1; - inp->inp_flags2 &= ~INP_FREED; - } else { - add_freed = 0; - } -#ifndef INP_REF_DEBUG ret = in_pcbrele_wlocked(inp); -#else - ret = __in_pcbrele_wlocked(inp, line); -#endif KASSERT(ret != 1, ("inpcb:%p release ret 1", inp)); - if (add_freed) { - inp->inp_flags2 |= INP_FREED; - } } static void @@ -1291,8 +1273,7 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) #ifdef VIMAGE CURVNET_SET(inp->inp_vnet); #endif - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { out: hpts->p_inp = NULL; if (in_pcbrele_wlocked(inp) == 0) { @@ -1593,8 +1574,7 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) hpts->p_inp = NULL; continue; } - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { out_now: KASSERT(mtx_owned(&hpts->p_mtx) == 0, ("Hpts:%p owns mtx prior-to lock line:%d", diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 3ebac68c7c48..ee514a11eef6 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -244,8 +244,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -VNET_DEFINE(struct inpcbhead, tcb); -#define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 08556009b8c4..0357056da1b1 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1310,8 +1310,7 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le) /* Check if the inp is dead, Jim. 
*/ if (tp == NULL || - (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || - (inp->inp_flags2 & INP_FREED)) { + (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { INP_WUNLOCK(inp); return (TCP_LRO_CANNOT); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 34cc291dc274..47fa8656a51d 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1376,6 +1376,8 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, * to the default stack. */ if (force && blk->tfb_refcnt) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); @@ -1385,22 +1387,14 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inp); - if (inp->inp_flags & INP_TIMEWAIT) { - INP_WUNLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { + if (inp->inp_flags & INP_TIMEWAIT) continue; - } tp = intotcpcb(inp); - if (tp == NULL || tp->t_fb != blk) { - INP_WUNLOCK(inp); + if (tp == NULL || tp->t_fb != blk) continue; - } tcp_switch_back_to_default(tp); - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -1488,8 +1482,8 @@ tcp_init(void) "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } - in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE); + in_pcbinfo_init(&V_tcbinfo, "tcp", hashsize, hashsize, + "tcp_inpcb", tcp_inpcb_init); /* * These have to be type stable for the benefit of the timers. @@ -1599,9 +1593,9 @@ tcp_destroy(void *unused __unused) * Sleep to let all tcpcb timers really disappear and cleanup. 
*/ for (;;) { - INP_LIST_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; - INP_LIST_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); @@ -2309,6 +2303,8 @@ tcp_ccalgounload(struct cc_algo *unload_algo) struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); /* * Check all active control blocks across all network stacks and change @@ -2318,17 +2314,12 @@ tcp_ccalgounload(struct cc_algo *unload_algo) VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); /* - * New connections already part way through being initialised - * with the CC algo we're removing will not race with this code - * because the INP_INFO_WLOCK is held during initialisation. We - * therefore don't enter the loop below until the connection - * list has stabilised. + * XXXGL: would new accept(2)d connections use algo being + * unloaded? */ newalgo = CC_DEFAULT_ALGO(); - CK_LIST_FOREACH(inp, &V_tcb, inp_list) { - INP_WLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { @@ -2362,7 +2353,6 @@ tcp_ccalgounload(struct cc_algo *unload_algo) * need to try again. 
*/ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); VNET_LIST_RUNLOCK(); return (err); @@ -2379,9 +2369,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) } } } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -2399,7 +2387,6 @@ tcp_drop(struct tcpcb *tp, int errno) struct socket *so = tp->t_inpcb->inp_socket; NET_EPOCH_ASSERT(); - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { @@ -2585,7 +2572,6 @@ tcp_close(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD @@ -2624,6 +2610,8 @@ tcp_close(struct tcpcb *tp) void tcp_drain(void) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) @@ -2643,13 +2631,9 @@ tcp_drain(void) * where we're really low on mbufs, this is potentially * useful. */ - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inpb); - if (inpb->inp_flags & INP_TIMEWAIT) { - INP_WUNLOCK(inpb); + while ((inpb = inp_next(&inpi)) != NULL) { + if (inpb->inp_flags & INP_TIMEWAIT) continue; - } if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); @@ -2664,9 +2648,7 @@ tcp_drain(void) } #endif } - INP_WUNLOCK(inpb); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -2685,7 +2667,6 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -2731,9 +2712,10 @@ tcp_notify(struct inpcb *inp, int error) static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { - struct epoch_tracker et; - struct inpcb *inp; + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; + struct inpcb *inp; int error; if (req->newptr 
!= NULL) @@ -2766,11 +2748,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen) { int crerr; @@ -2791,17 +2769,15 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) struct xtcpcb xt; tcp_inptoxtp(inp, &xt); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - else + } else continue; } } - INP_RUNLOCK(inp); } - NET_EPOCH_EXIT(et); if (!error) { /* diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 46d00914354f..57d7352b8f11 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -908,7 +908,6 @@ VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); -VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) @@ -917,7 +916,6 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) -#define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index efd5c77ca8c5..cd21b8640441 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -147,9 +147,7 @@ u_long udp_recvspace = 40 * (1024 + SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); -VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); -VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); 
VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) @@ -211,8 +209,8 @@ udp_init(void) * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ - in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE); + in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE, + "udp_inpcb", udp_inpcb_init); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); @@ -225,9 +223,8 @@ void udplite_init(void) { - in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, - UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, - IPI_HASHFIELDS_2TUPLE); + in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE, + UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init); } /* @@ -393,6 +390,123 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, return (0); } +static bool +udp_multi_match(const struct inpcb *inp, void *v) +{ + struct ip *ip = v; + struct udphdr *uh = (struct udphdr *)(ip + 1); + + if (inp->inp_lport != uh->uh_dport) + return (false); +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); +#endif + if (inp->inp_laddr.s_addr != INADDR_ANY && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + return (false); + if (inp->inp_faddr.s_addr != INADDR_ANY && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + return (false); + if (inp->inp_fport != 0 && + inp->inp_fport != uh->uh_sport) + return (false); + + return (true); +} + +static int +udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) +{ + struct ip *ip = mtod(m, struct ip *); + struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), + INPLOOKUP_RLOCKPCB, udp_multi_match, ip); + struct udphdr *uh = (struct udphdr *)(ip + 1); + struct inpcb *inp; + struct mbuf *n; + int appends = 0; + + MPASS(ip->ip_hl == sizeof(struct ip) >> 2); + + 
while ((inp = inp_next(&inpi)) != NULL) { + /* + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is held. + */ + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. [RFC3678] + */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct ip_moptions *imo; + struct sockaddr_in group; + int blocked; + + imo = inp->inp_moptions; + if (imo == NULL) + continue; + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in[0]); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + IPSTAT_INC(ips_notmember); + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) + UDPSTAT_INC(udps_filtermcast); + continue; + } + } + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); + else + UDP_PROBE(receive, NULL, inp, ip, inp, uh); + if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) { + INP_RUNLOCK(inp); + break; + } else + appends++; + } + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids + * searching through all pcbs in the common case of a + * non-shared port. It assumes that an application + * will never clear these options after setting them. + */ + if ((inp->inp_socket->so_options & + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { + INP_RUNLOCK(inp); + break; + } + } + m_freem(m); + + if (appends == 0) { + /* + * No matching pcb found; discard datagram. (No need + * to send an ICMP Port Unreachable for a broadcast + * or multicast datagram.) 
+ */ + UDPSTAT_INC(udps_noport); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + UDPSTAT_INC(udps_noportmcast); + else + UDPSTAT_INC(udps_noportbcast); + } + + return (IPPROTO_DONE); +} + int udp_input(struct mbuf **mp, int *offp, int proto) { @@ -519,140 +633,15 @@ udp_input(struct mbuf **mp, int *offp, int proto) } } - pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) { - struct inpcb *last; - struct inpcbhead *pcblist; + in_broadcast(ip->ip_dst, ifp)) + return (udp_multi_input(m, proto, udp_in)); - NET_EPOCH_ASSERT(); - - pcblist = udp_get_pcblist(proto); - last = NULL; - CK_LIST_FOREACH(inp, pcblist, inp_list) { - if (inp->inp_lport != uh->uh_dport) - continue; -#ifdef INET6 - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_laddr.s_addr != INADDR_ANY && - inp->inp_laddr.s_addr != ip->ip_dst.s_addr) - continue; - if (inp->inp_faddr.s_addr != INADDR_ANY && - inp->inp_faddr.s_addr != ip->ip_src.s_addr) - continue; - if (inp->inp_fport != 0 && - inp->inp_fport != uh->uh_sport) - continue; - - INP_RLOCK(inp); - - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } - - /* - * XXXRW: Because we weren't holding either the inpcb - * or the hash lock when we checked for a match - * before, we should probably recheck now that the - * inpcb lock is held. - */ - - /* - * Handle socket delivery policy for any-source - * and source-specific multicast. 
[RFC3678] - */ - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { - struct ip_moptions *imo; - struct sockaddr_in group; - int blocked; - - imo = inp->inp_moptions; - if (imo == NULL) { - INP_RUNLOCK(inp); - continue; - } - bzero(&group, sizeof(struct sockaddr_in)); - group.sin_len = sizeof(struct sockaddr_in); - group.sin_family = AF_INET; - group.sin_addr = ip->ip_dst; - - blocked = imo_multi_filter(imo, ifp, - (struct sockaddr *)&group, - (struct sockaddr *)&udp_in[0]); - if (blocked != MCAST_PASS) { - if (blocked == MCAST_NOTGMEMBER) - IPSTAT_INC(ips_notmember); - if (blocked == MCAST_NOTSMEMBER || - blocked == MCAST_MUTED) - UDPSTAT_INC(udps_filtermcast); - INP_RUNLOCK(inp); - continue; - } - } - if (last != NULL) { - struct mbuf *n; - - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != - NULL) { - if (proto == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip, - last, uh); - else - UDP_PROBE(receive, NULL, last, ip, last, - uh); - if (udp_append(last, ip, n, iphlen, - udp_in)) { - INP_RUNLOCK(inp); - goto badunlocked; - } - } - /* Release PCB lock taken on previous pass. */ - INP_RUNLOCK(last); - } - last = inp; - /* - * Don't look for additional matches if this one does - * not have either the SO_REUSEPORT or SO_REUSEADDR - * socket options set. This heuristic avoids - * searching through all pcbs in the common case of a - * non-shared port. It assumes that an application - * will never clear these options after setting them. - */ - if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) - break; - } - - if (last == NULL) { - /* - * No matching pcb found; discard datagram. (No need - * to send an ICMP Port Unreachable for a broadcast - * or multicast datgram.) 
- */ - UDPSTAT_INC(udps_noport); - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) - UDPSTAT_INC(udps_noportmcast); - else - UDPSTAT_INC(udps_noportbcast); - goto badunlocked; - } - if (proto == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip, last, uh); - else - UDP_PROBE(receive, NULL, last, ip, last, uh); - if (udp_append(last, ip, m, iphlen, udp_in) == 0) - INP_RUNLOCK(last); - return (IPPROTO_DONE); - } + pcbinfo = udp_get_inpcbinfo(proto); /* * Locate pcb for datagram. - */ - - /* + * * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && @@ -852,8 +841,9 @@ udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) static int udp_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -881,24 +871,19 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - } else - INP_RUNLOCK(inp); + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* @@ -1284,15 +1269,16 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } + INP_HASH_WLOCK(pcbinfo); error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); + INP_HASH_WUNLOCK(pcbinfo); if (error) goto release; } @@ 
-1335,12 +1321,14 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_HASH_LOCK_ASSERT(pcbinfo); + INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); - if (error) + if (error) { + INP_HASH_WUNLOCK(pcbinfo); goto release; + } /* * XXXRW: Why not commit the port if the address is @@ -1357,7 +1345,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; - INP_HASH_WLOCK(pcbinfo); error = in_pcbinshash(inp); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { @@ -1366,7 +1353,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, goto release; } inp->inp_flags |= INP_ANONPORT; - } + } else + INP_HASH_WUNLOCK(pcbinfo); } else { faddr = sin->sin_addr; fport = sin->sin_port; @@ -1560,12 +1548,9 @@ udp_attach(struct socket *so, int proto, struct thread *td) error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); - INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); - if (error) { - INP_INFO_WUNLOCK(pcbinfo); + if (error) return (error); - } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; @@ -1577,12 +1562,10 @@ udp_attach(struct socket *so, int proto, struct thread *td) if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); return (error); } - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); + return (0); } #endif /* INET */ @@ -1718,14 +1701,12 @@ udp_detach(struct socket *so) KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); - 
INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index eaafdb299233..9a15016b37e3 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -136,13 +136,9 @@ void kmod_udpstat_inc(int statnum); SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; -VNET_DECLARE(struct inpcbhead, udb); VNET_DECLARE(struct inpcbinfo, udbinfo); -VNET_DECLARE(struct inpcbhead, ulitecb); VNET_DECLARE(struct inpcbinfo, ulitecbinfo); -#define V_udb VNET(udb) #define V_udbinfo VNET(udbinfo) -#define V_ulitecb VNET(ulitecb) #define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; @@ -165,12 +161,6 @@ udp_get_inpcbinfo(int protocol) return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; } -static __inline struct inpcbhead * -udp_get_pcblist(int protocol) -{ - return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb; -} - int udp_newudpcb(struct inpcb *); void udp_discardcb(struct udpcb *); diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index f4a5574084fd..3632f9396be0 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -124,14 +124,12 @@ VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); -VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0; VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) -#define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) @@ -1875,21 +1873,39 @@ ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, return (copied); } +static bool +icmp6_rip6_match(const struct inpcb *inp, void *v) +{ + struct ip6_hdr *ip6 = v; + + if ((inp->inp_vflag & INP_IPV6) == 0) + return (false); + if (inp->inp_ip_p != IPPROTO_ICMPV6) + return 
(false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) + return (false); + return (true); +} + /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { - struct mbuf *m = *mp; + struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6); struct inpcb *inp; - struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; - - NET_EPOCH_ASSERT(); + int delivered = 0; /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); @@ -1908,125 +1924,64 @@ icmp6_rip6_input(struct mbuf **mp, int off) return (IPPROTO_DONE); } - CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (inp->inp_ip_p != IPPROTO_ICMPV6) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) - continue; - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } + while ((inp = inp_next(&inpi)) != NULL) { if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, - inp->in6p_icmp6filt)) { - INP_RUNLOCK(inp); + inp->in6p_icmp6filt)) continue; - } - if (last != NULL) { - struct mbuf *n = NULL; - - /* - * Recent network drivers tend to allocate a single - * mbuf cluster, rather than to make a couple of - * mbufs without clusters. 
Also, since the IPv6 code - * path tries to avoid m_pullup(), it is highly - * probable that we still have an mbuf cluster here - * even though the necessary length can be stored in an - * mbuf's internal buffer. - * Meanwhile, the default size of the receive socket - * buffer for raw sockets is not so large. This means - * the possibility of packet loss is relatively higher - * than before. To avoid this scenario, we copy the - * received data to a separate mbuf that does not use - * a cluster, if possible. - * XXX: it is better to copy the data after stripping - * intermediate headers. - */ - if ((m->m_flags & M_EXT) && m->m_next == NULL && - m->m_len <= MHLEN) { - n = m_get(M_NOWAIT, m->m_type); - if (n != NULL) { - if (m_dup_pkthdr(n, m, M_NOWAIT)) { - bcopy(m->m_data, n->m_data, - m->m_len); - n->m_len = m->m_len; - } else { - m_free(n); - n = NULL; - } - } - } - if (n != NULL || - (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { - if (last->inp_flags & INP_CONTROLOPTS) - ip6_savecontrol(last, n, &opts); - /* strip intermediate headers */ - m_adj(n, off); - SOCKBUF_LOCK(&last->inp_socket->so_rcv); - if (sbappendaddr_locked( - &last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, n, opts) - == 0) { - soroverflow_locked(last->inp_socket); - m_freem(n); - if (opts) { - m_freem(opts); - } - } else - sorwakeup_locked(last->inp_socket); - opts = NULL; - } - INP_RUNLOCK(last); - } - last = inp; - } - if (last != NULL) { - if (last->inp_flags & INP_CONTROLOPTS) - ip6_savecontrol(last, m, &opts); - /* strip intermediate headers */ - m_adj(m, off); - - /* avoid using mbuf clusters if possible (see above) */ + /* + * Recent network drivers tend to allocate a single + * mbuf cluster, rather than to make a couple of + * mbufs without clusters. Also, since the IPv6 code + * path tries to avoid m_pullup(), it is highly + * probable that we still have an mbuf cluster here + * even though the necessary length can be stored in an + * mbuf's internal buffer. 
+ * Meanwhile, the default size of the receive socket + * buffer for raw sockets is not so large. This means + * the possibility of packet loss is relatively higher + * than before. To avoid this scenario, we copy the + * received data to a separate mbuf that does not use + * a cluster, if possible. + * XXX: it is better to copy the data after stripping + * intermediate headers. + */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { - struct mbuf *n; - n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; - - m_freem(m); - m = n; } else { - m_freem(n); + m_free(n); n = NULL; } } - } - SOCKBUF_LOCK(&last->inp_socket->so_rcv); - if (sbappendaddr_locked(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, m, opts) == 0) { - m_freem(m); + } else + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (n == NULL) + continue; + if (inp->inp_flags & INP_CONTROLOPTS) + ip6_savecontrol(inp, n, &opts); + /* strip intermediate headers */ + m_adj(n, off); + SOCKBUF_LOCK(&inp->inp_socket->so_rcv); + if (sbappendaddr_locked(&inp->inp_socket->so_rcv, + (struct sockaddr *)&fromsa, n, opts) == 0) { + soroverflow_locked(inp->inp_socket); + m_freem(n); if (opts) m_freem(opts); - soroverflow_locked(last->inp_socket); - } else - sorwakeup_locked(last->inp_socket); - INP_RUNLOCK(last); - } else { - m_freem(m); - IP6STAT_DEC(ip6s_delivered); + } else { + sorwakeup_locked(inp->inp_socket); + delivered++; + } + opts = NULL; } + m_freem(m); *mp = NULL; + if (delivered == 0) + IP6STAT_DEC(ip6s_delivered); return (IPPROTO_DONE); } diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 629593bb365c..d6c6593f2adf 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -673,13 +673,21 @@ in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) * Call the protocol specific routine (if any) to report * any errors for each matching socket. 
*/ +static bool +inp_match6(const struct inpcb *inp, void *v __unused) +{ + + return ((inp->inp_vflag & INP_IPV6) != 0); +} void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { - struct inpcb *inp, *inp_temp; + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, + inp_match6, NULL); + struct inpcb *inp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; @@ -715,14 +723,8 @@ in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { - INP_WLOCK(inp); - if ((inp->inp_vflag & INP_IPV6) == 0) { - INP_WUNLOCK(inp); - continue; - } - + while ((inp = inp_next(&inpi)) != NULL) { + INP_WLOCK_ASSERT(inp); /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to @@ -754,18 +756,13 @@ in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { - INP_WUNLOCK(inp); continue; } do_notify: - if (notify) { - if ((*notify)(inp, errno)) - INP_WUNLOCK(inp); - } else - INP_WUNLOCK(inp); + if (notify) + (*notify)(inp, errno); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -866,49 +863,54 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, } } +static bool +in6_multi_match(const struct inpcb *inp, void *v __unused) +{ + + if ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL) + return (true); + else + return (false); +} + void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB, + in6_multi_match, NULL); struct inpcb *inp; struct in6_multi *inm; 
struct in6_mfilter *imf; struct ip6_moptions *im6o; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_WUNLOCK(inp); - continue; - } + IN6_MULTI_LOCK_ASSERT(); + + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + im6o = inp->in6p_moptions; - if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) { - /* - * Unselect the outgoing ifp for multicast if it - * is being detached. - */ - if (im6o->im6o_multicast_ifp == ifp) - im6o->im6o_multicast_ifp = NULL; - /* - * Drop multicast group membership if we joined - * through the interface being detached. - */ + /* + * Unselect the outgoing ifp for multicast if it + * is being detached. + */ + if (im6o->im6o_multicast_ifp == ifp) + im6o->im6o_multicast_ifp = NULL; + /* + * Drop multicast group membership if we joined + * through the interface being detached. + */ restart: - IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { - if ((inm = imf->im6f_in6m) == NULL) - continue; - if (inm->in6m_ifp != ifp) - continue; - ip6_mfilter_remove(&im6o->im6o_head, imf); - IN6_MULTI_LOCK_ASSERT(); - in6_leavegroup_locked(inm, NULL); - ip6_mfilter_free(imf); - goto restart; - } + IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { + if ((inm = imf->im6f_in6m) == NULL) + continue; + if (inm->in6m_ifp != ifp) + continue; + ip6_mfilter_remove(&im6o->im6o_head, imf); + in6_leavegroup_locked(inm, NULL); + ip6_mfilter_free(imf); + goto restart; } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -1124,20 +1126,16 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, { struct inpcb *inp; + smr_enter(pcbinfo->ipi_smr); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - } else if (lookupflags & INPLOOKUP_RLOCKPCB) { - INP_RLOCK(inp); - } else - panic("%s: 
locking bug", __func__); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_UNLOCK(inp); + if (__predict_false(inp_smr_lock(inp, + (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; - } - } + } else + smr_exit(pcbinfo->ipi_smr); + return (inp); } diff --git a/sys/netinet6/ip6_gre.c b/sys/netinet6/ip6_gre.c index 410d1bcf952c..eb3f92d55adc 100644 --- a/sys/netinet6/ip6_gre.c +++ b/sys/netinet6/ip6_gre.c @@ -216,30 +216,15 @@ static void in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { - struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; struct sockaddr_in6 dst; - NET_EPOCH_ENTER(et); - /* - * udp_append() holds reference to inp, it is safe to check - * inp_flags2 without INP_RLOCK(). - * If socket was closed before we have entered NET_EPOCH section, - * INP_FREED flag should be set. Otherwise it should be safe to - * make access to ctx data, because gre_so will be freed by - * gre_sofree() via NET_EPOCH_CALL(). - */ - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - NET_EPOCH_EXIT(et); - m_freem(m); - return; - } + NET_EPOCH_ASSERT(); gs = (struct gre_socket *)ctx; dst = *(const struct sockaddr_in6 *)sa; if (sa6_embedscope(&dst, 0)) { - NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -249,11 +234,9 @@ in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); - NET_EPOCH_EXIT(et); return; } m_freem(m); - NET_EPOCH_EXIT(et); } static int diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index aaba91c6d5e7..5b1790151009 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -119,9 +119,7 @@ __FBSDID("$FreeBSD$"); * Raw interface to IP6 protocol. 
*/ -VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); -#define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; @@ -153,6 +151,33 @@ int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); +struct rip6_inp_match_ctx { + struct ip6_hdr *ip6; + int proto; +}; + +static bool +rip6_inp_match(const struct inpcb *inp, void *v) +{ + struct rip6_inp_match_ctx *c = v; + struct ip6_hdr *ip6 = c->ip6; + int proto = c->proto; + + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + return (false); + if (inp->inp_ip_p && inp->inp_ip_p != proto) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) + return (false); + + return (true); +} + /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
@@ -161,12 +186,15 @@ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; - struct mbuf *m = *mp; + struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; - struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; + struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto }; + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx); + int delivered = 0; NET_EPOCH_ASSERT(); @@ -176,70 +204,27 @@ rip6_input(struct mbuf **mp, int *offp, int proto) ifp = m->m_pkthdr.rcvif; - CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (inp->inp_ip_p && - inp->inp_ip_p != proto) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) - continue; - if (last != NULL) { - struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); #if defined(IPSEC) || defined(IPSEC_SUPPORT) - /* - * Check AH/ESP integrity. - */ - if (IPSEC_ENABLED(ipv6)) { - if (n != NULL && - IPSEC_CHECK_POLICY(ipv6, n, last) != 0) { - m_freem(n); - /* Do not inject data into pcb. */ - n = NULL; - } - } -#endif /* IPSEC */ - if (n) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, n, &opts); - /* strip intermediate headers */ - m_adj(n, *offp); - if (sbappendaddr(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, - n, opts) == 0) { - soroverflow(last->inp_socket); - m_freem(n); - if (opts) - m_freem(opts); - RIP6STAT_INC(rip6s_fullsock); - } else - sorwakeup(last->inp_socket); - opts = NULL; - } - INP_RUNLOCK(last); - last = NULL; + /* + * Check AH/ESP integrity. 
+ */ + if (IPSEC_ENABLED(ipv6) && + IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { + /* Do not inject data into pcb. */ + continue; } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_2; - if (jailed_without_vnet(inp->inp_cred)) { +#endif /* IPSEC */ + if (jailed_without_vnet(inp->inp_cred) && + !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && + prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ - if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - prison_check_ip6(inp->inp_cred, - &ip6->ip6_dst) != 0) - goto skip_2; - } + continue; if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || @@ -251,8 +236,9 @@ rip6_input(struct mbuf **mp, int *offp, int proto) * ICMP6 message. Set proto to IPPROTO_NONE * to achieve that. */ + INP_RUNLOCK(inp); proto = IPPROTO_NONE; - goto skip_2; + break; } } /* @@ -298,43 +284,30 @@ rip6_input(struct mbuf **mp, int *offp, int proto) } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); - goto skip_2; + continue; } } - last = inp; - continue; -skip_2: - INP_RUNLOCK(inp); - } -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - /* - * Check AH/ESP integrity. - */ - if (IPSEC_ENABLED(ipv6) && last != NULL && - IPSEC_CHECK_POLICY(ipv6, m, last) != 0) { - m_freem(m); - IP6STAT_DEC(ip6s_delivered); - /* Do not inject data into pcb. */ - INP_RUNLOCK(last); - } else -#endif /* IPSEC */ - if (last != NULL) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, m, &opts); - /* Strip intermediate headers. 
*/ - m_adj(m, *offp); - if (sbappendaddr(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, m, opts) == 0) { - soroverflow(last->inp_socket); - m_freem(m); + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) + continue; + if (inp->inp_flags & INP_CONTROLOPTS || + inp->inp_socket->so_options & SO_TIMESTAMP) + ip6_savecontrol(inp, n, &opts); + /* strip intermediate headers */ + m_adj(n, *offp); + if (sbappendaddr(&inp->inp_socket->so_rcv, + (struct sockaddr *)&fromsa, n, opts) == 0) { + soroverflow(inp->inp_socket); + m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); - } else - sorwakeup(last->inp_socket); - INP_RUNLOCK(last); - } else { + } else { + sorwakeup(inp->inp_socket); + delivered++; + } + opts = NULL; + } + if (delivered == 0) { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); @@ -345,7 +318,8 @@ rip6_input(struct mbuf **mp, int *offp, int proto) ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); - } + } else + m_freem(m); return (IPPROTO_DONE); } @@ -678,15 +652,12 @@ rip6_attach(struct socket *so, int proto, struct thread *td) filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); - INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ @@ -708,12 +679,10 @@ rip6_detach(struct socket *so) if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. 
*/ diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index 5ce3a1fd1a78..f7c08f8fc96e 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -207,6 +207,137 @@ udp6_append(struct inpcb *inp, struct mbuf *n, int off, return (0); } +struct udp6_multi_match_ctx { + struct ip6_hdr *ip6; + struct udphdr *uh; +}; + +static bool +udp6_multi_match(const struct inpcb *inp, void *v) +{ + struct udp6_multi_match_ctx *ctx = v; + + if ((inp->inp_vflag & INP_IPV6) == 0) + return (false); + if (inp->inp_lport != ctx->uh->uh_dport) + return (false); + if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) || + inp->inp_fport != ctx->uh->uh_sport)) + return (false); + + return (true); +} + +static int +udp6_multi_input(struct mbuf *m, int off, int proto, + struct sockaddr_in6 *fromsa) +{ + struct udp6_multi_match_ctx ctx; + struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), + INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx); + struct inpcb *inp; + struct ip6_moptions *imo; + struct mbuf *n; + int appends = 0; + + /* + * In the event that laddr should be set to the link-local + * address (this happens in RIPng), the multicast address + * specified in the received packet will not match laddr. To + * handle this situation, matching is relaxed if the + * receiving interface is the same as one specified in the + * socket and if the destination multicast address matches + * one of the multicast groups specified in the socket. + */ + + /* + * KAME note: traditionally we dropped udpiphdr from mbuf + * here. We need udphdr for IPsec processing so we do that + * later. 
+ */ + ctx.ip6 = mtod(m, struct ip6_hdr *); + ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + /* + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is (supposed to be) held. + */ + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. [RFC3678] + */ + if ((imo = inp->in6p_moptions) != NULL) { + struct sockaddr_in6 mcaddr; + int blocked; + + bzero(&mcaddr, sizeof(struct sockaddr_in6)); + mcaddr.sin6_len = sizeof(struct sockaddr_in6); + mcaddr.sin6_family = AF_INET6; + mcaddr.sin6_addr = ctx.ip6->ip6_dst; + + blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&mcaddr, + (struct sockaddr *)&fromsa[0]); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + IP6STAT_INC(ip6s_notmember); + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) + UDPSTAT_INC(udps_filtermcast); + continue; + } + } + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ctx.ip6, + inp, ctx.uh); + else + UDP_PROBE(receive, NULL, inp, ctx.ip6, inp, + ctx.uh); + if (udp6_append(inp, n, off, fromsa)) { + /* Nonzero return: udp6_append() has already dropped the inp lock (the code this replaces unlocked only after a zero return), so do not unlock again */ + break; + } else + appends++; + } + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids + * searching through all pcbs in the common case of a + * non-shared port. It assumes that an application + * will never clear these options after setting them. 
+ */ + if ((inp->inp_socket->so_options & + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { + INP_RUNLOCK(inp); + break; + } + } + m_freem(m); + + if (appends == 0) { + /* + * No matching pcb found; discard datagram. 
(No need + * to send an ICMP Port Unreachable for a broadcast + * or multicast datagram.) + */ + UDPSTAT_INC(udps_noport); + UDPSTAT_INC(udps_noportmcast); + } + + return (IPPROTO_DONE); +} + int udp6_input(struct mbuf **mp, int *offp, int proto) { @@ -311,144 +442,11 @@ udp6_input(struct mbuf **mp, int *offp, int proto) fromsa[1].sin6_port = uh->uh_dport; pcbinfo = udp_get_inpcbinfo(nxt); - if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - struct inpcb *last; - struct inpcbhead *pcblist; - struct ip6_moptions *imo; - - /* - * In the event that laddr should be set to the link-local - * address (this happens in RIPng), the multicast address - * specified in the received packet will not match laddr. To - * handle this situation, matching is relaxed if the - * receiving interface is the same as one specified in the - * socket and if the destination multicast address matches - * one of the multicast groups specified in the socket. - */ - - /* - * KAME note: traditionally we dropped udpiphdr from mbuf - * here. We need udphdr for IPsec processing so we do that - * later. 
- */ - - /* - * Handle socket delivery policy for any-source - * and source-specific multicast. [RFC3678] - */ - imo = inp->in6p_moptions; - if (imo != NULL) { - struct sockaddr_in6 mcaddr; - int blocked; - - bzero(&mcaddr, sizeof(struct sockaddr_in6)); - mcaddr.sin6_len = sizeof(struct sockaddr_in6); - mcaddr.sin6_family = AF_INET6; - mcaddr.sin6_addr = ip6->ip6_dst; - - blocked = im6o_mc_filter(imo, ifp, - (struct sockaddr *)&mcaddr, - (struct sockaddr *)&fromsa[0]); - if (blocked != MCAST_PASS) { - if (blocked == MCAST_NOTGMEMBER) - IP6STAT_INC(ip6s_notmember); - if (blocked == MCAST_NOTSMEMBER || - blocked == MCAST_MUTED) - UDPSTAT_INC(udps_filtermcast); - INP_RUNLOCK(inp); - continue; - } - } - - if (last != NULL) { - struct mbuf *n; - - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != - NULL) { - if (nxt == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, - last, ip6, last, uh); - else - UDP_PROBE(receive, NULL, last, - ip6, last, uh); - if (udp6_append(last, n, off, - fromsa)) { - INP_RUNLOCK(inp); - goto badunlocked; - } - } - /* Release PCB lock taken on previous pass. */ - INP_RUNLOCK(last); - } - last = inp; - /* - * Don't look for additional matches if this one does - * not have either the SO_REUSEPORT or SO_REUSEADDR - * socket options set. This heuristic avoids - * searching through all pcbs in the common case of a - * non-shared port. It assumes that an application - * will never clear these options after setting them. - */ - if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) - break; - } - - if (last == NULL) { - /* - * No matching pcb found; discard datagram. (No need - * to send an ICMP Port Unreachable for a broadcast - * or multicast datgram.) 
- */ - UDPSTAT_INC(udps_noport); - UDPSTAT_INC(udps_noportmcast); - goto badunlocked; - } - - if (nxt == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip6, last, uh); - else - UDP_PROBE(receive, NULL, last, ip6, last, uh); - if (udp6_append(last, m, off, fromsa) == 0) - INP_RUNLOCK(last); + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { *mp = NULL; - return (IPPROTO_DONE); + return (udp6_multi_input(m, off, proto, fromsa)); } + /* * Locate pcb for datagram. */ @@ -1043,12 +1041,9 @@ udp6_attach(struct socket *so, int proto, struct thread *td) if (error) return (error); } - INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); - if (error) { - INP_INFO_WUNLOCK(pcbinfo); + if (error) return (error); - } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) @@ -1067,11 +1062,9 @@ udp6_attach(struct socket *so, int proto, struct thread *td) if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); return (0); } @@ -1275,13 +1268,11 @@ udp6_detach(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); }