From d915b2801534ecb14fceb9ac894c9aa3fb65d6c8 Mon Sep 17 00:00:00 2001 From: Stephan Uphoff Date: Tue, 18 Jul 2006 22:34:27 +0000 Subject: [PATCH] Fix race conditions on enumerating pcb lists by moving the initialization ( and where appropriate the destruction) of the pcb mutex to the init/finit functions of the pcb zones. This allows locking of the pcb entries and race condition free comparison of the generation count. Rearrange locking a bit to avoid extra locking operation to update the generation count in in_pcballoc(). (in_pcballoc now returns the pcb locked) I am planning to convert pcb list handling from a type safe to a reference count model soon. ( As this allows really freeing the PCBs) Reviewed by: rwatson@, mohans@ MFC after: 1 week --- sys/netinet/in_pcb.c | 14 +++++++++----- sys/netinet/in_pcb.h | 3 ++- sys/netinet/ip_divert.c | 26 ++++++++++++++++++++++---- sys/netinet/raw_ip.c | 18 ++++++++++++++---- sys/netinet/tcp_subr.c | 16 ++++++++++++++-- sys/netinet/tcp_timewait.c | 16 ++++++++++++++-- sys/netinet/tcp_usrreq.c | 3 +-- sys/netinet/udp_usrreq.c | 18 ++++++++++++++---- sys/netinet6/in6_pcb.c | 2 +- sys/netinet6/raw_ip6.c | 3 +-- sys/netinet6/udp6_usrreq.c | 3 +-- 11 files changed, 93 insertions(+), 29 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 37386d7c38a4..ddb9e3cf355d 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -167,19 +167,20 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, /* * Allocate a PCB and associate it with the socket. + * On success return with the PCB locked. */ int -in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, const char *type) +in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; - inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT | M_ZERO); + inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + bzero(inp,inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; #ifdef MAC @@ -209,11 +210,13 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, const char *type) LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; - INP_LOCK_INIT(inp, "inp", type); #ifdef INET6 if (ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif + INP_LOCK(inp); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + #if defined(IPSEC) || defined(FAST_IPSEC) || defined(MAC) out: if (error != 0) @@ -721,10 +724,11 @@ in_pcbfree(struct inpcb *inp) (void)m_free(inp->inp_options); ip_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; - INP_LOCK_DESTROY(inp); + #ifdef MAC mac_destroy_inpcb(inp); #endif + INP_UNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 80ae9e98dfa8..64de94ba923f 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -166,6 +166,7 @@ struct inpcb { } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; struct inpcbport *inp_phd; /* head of this list */ +#define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* generation count of this instance */ struct mtx inp_mtx; @@ -342,7 +343,7 @@ extern int ipport_hilastauto; extern struct callout ipport_tick_callout; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); -int in_pcballoc(struct socket *, struct inpcbinfo *, const char *); +int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 7136644a9661..c620e7390d29 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -123,6 +123,22 @@ div_zone_change(void *tag) uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); } +static int +div_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_INIT(inp, "inp", "divinp"); + return (0); +} + +static void +div_inpcb_fini(void *mem, int size) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_DESTROY(inp); +} + + void div_init(void) { @@ -137,7 +153,7 @@ div_init(void) divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(divcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change, NULL, EVENTHANDLER_PRI_ANY); @@ -409,13 +425,12 @@ div_attach(struct socket *so, int proto, struct thread *td) if (error) return error; INP_INFO_WLOCK(&divcbinfo); - error = in_pcballoc(so, &divcbinfo, "divinp"); + error = in_pcballoc(so, &divcbinfo); if (error) { INP_INFO_WUNLOCK(&divcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_LOCK(inp); INP_INFO_WUNLOCK(&divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; @@ -567,6 +582,7 @@ div_pcblist(SYSCTL_HANDLER_ARGS) error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; + INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); @@ -575,8 +591,10 @@ div_pcblist(SYSCTL_HANDLER_ARGS) bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); + INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - } + } else + INP_UNLOCK(inp); } if (!error) { /* diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index b311c748493d..f635c8959c34 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -123,6 +123,14 @@ rip_zone_change(void *tag) uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); } +static int +rip_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_INIT(inp, "inp", "rawinp"); + return (0); +} + void rip_init() { @@ -137,7 +145,7 @@ rip_init() ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(ripcbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); @@ -599,13 +607,12 @@ rip_attach(struct socket *so, int proto, struct thread *td) if (error) return error; INP_INFO_WLOCK(&ripcbinfo); - error = in_pcballoc(so, &ripcbinfo, "rawinp"); + error = in_pcballoc(so, &ripcbinfo); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_LOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; @@ -836,6 +843,7 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; + INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); @@ -844,8 +852,10 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); + INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - } + } else + INP_UNLOCK(inp); } if (!error) { /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 89bd41ca5768..676b22e15d94 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -262,6 +262,14 @@ tcp_zone_change(void *tag) uma_zone_set_max(tcptw_zone, maxsockets / 5); } +static int +tcp_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_INIT(inp, "inp", "tcpinp"); + return (0); +} + void tcp_init(void) { @@ -290,7 +298,7 @@ tcp_init(void) tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) @@ -989,6 +997,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; + INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; @@ -1012,8 +1021,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; + INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); - } + } else + INP_UNLOCK(inp); + } if (!error) { /* diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 89bd41ca5768..676b22e15d94 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -262,6 +262,14 @@ tcp_zone_change(void *tag) uma_zone_set_max(tcptw_zone, maxsockets / 5); } +static int +tcp_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_INIT(inp, "inp", "tcpinp"); + return (0); +} + void tcp_init(void) { @@ -290,7 +298,7 @@ tcp_init(void) tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) @@ -989,6 +997,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; + INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; @@ -1012,8 +1021,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; + INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); - } + } else + INP_UNLOCK(inp); + } if (!error) { /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index e6a31fae7416..111a9830f809 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1401,13 +1401,12 @@ tcp_attach(so) return (error); } INP_INFO_WLOCK(&tcbinfo); - error = in_pcballoc(so, &tcbinfo, "tcpinp"); + error = in_pcballoc(so, &tcbinfo); if (error) { INP_INFO_WUNLOCK(&tcbinfo); return (error); } inp = sotoinpcb(so); - INP_LOCK(inp); #ifdef INET6 if (isipv6) { inp->inp_vflag |= INP_IPV6; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 84c5521f6592..7fff2b62507e 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -137,6 +137,14 @@ udp_zone_change(void *tag) uma_zone_set_max(udbinfo.ipi_zone, maxsockets); } +static int +udp_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = (struct inpcb *) mem; + INP_LOCK_INIT(inp, "inp", "udpinp"); + return (0); +} + void udp_init() { @@ -147,7 +155,7 @@ udp_init() udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.porthashmask); udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL, - NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(udbinfo.ipi_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); @@ -633,6 +641,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; + INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); @@ -642,8 +651,10 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; + INP_UNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - } + } else + INP_UNLOCK(inp); } if (!error) { /* @@ -966,14 +977,13 @@ udp_attach(struct socket *so, int proto, struct thread *td) if (error) return error; INP_INFO_WLOCK(&udbinfo); - error = in_pcballoc(so, &udbinfo, "udpinp"); + error = in_pcballoc(so, &udbinfo); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index a2527888e6b7..9305ed97cdd2 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -455,7 +455,7 @@ in6_pcbfree(struct inpcb *inp) (void)m_free(inp->inp_options); ip_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; - INP_LOCK_DESTROY(inp); + INP_UNLOCK(inp); uma_zfree(ipi->ipi_zone, inp); } diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 111f52539451..86ff948f7902 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -561,14 +561,13 @@ rip6_attach(struct socket *so, int proto, struct thread *td) if (filter == NULL) return ENOMEM; INP_INFO_WLOCK(&ripcbinfo); - error = in_pcballoc(so, &ripcbinfo, "raw6inp"); + error = in_pcballoc(so, &ripcbinfo); if (error) { INP_INFO_WUNLOCK(&ripcbinfo); FREE(filter, M_PCB); return error; } inp = (struct inpcb *)so->so_pcb; - INP_LOCK(inp); INP_INFO_WUNLOCK(&ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->in6p_ip6_nxt = (long)proto; diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c index 18f0c814c5b6..f031b9690eb4 100644 --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -503,13 +503,12 @@ udp6_attach(struct socket *so, int proto, struct thread *td) return error; } INP_INFO_WLOCK(&udbinfo); - error = in_pcballoc(so, &udbinfo, "udp6inp"); + error = in_pcballoc(so, &udbinfo); if (error) { INP_INFO_WUNLOCK(&udbinfo); return error; } inp = (struct inpcb *)so->so_pcb; - INP_LOCK(inp); INP_INFO_WUNLOCK(&udbinfo); inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)