From 6aba400a7055ed2427c6aa297774fb91f5d8db79 Mon Sep 17 00:00:00 2001 From: Attilio Rao Date: Thu, 25 Aug 2011 15:51:54 +0000 Subject: [PATCH] Fix a deficiency in the selinfo interface: If a selinfo object is recorded (via selrecord()) and then quickly destroyed before the waiters have had the opportunity to wake up, at the next iteration they will find the selinfo object destroyed, causing a page fault (PF#). That happens because the selinfo interface has no way to drain the waiters before destroying the registered selinfo object. This race is quite rare in practice, because it requires a selrecord(), a poll request by another thread and a quick destruction of the selrecord()'ed selinfo object. Fix this by adding the seldrain() routine, which should be called before destroying selinfo objects (in order to avoid such a case), and fix the present cases where it should already have been called. Sometimes the context is safe enough to prevent this type of race, as happens in device drivers which install selinfo objects on poll callbacks. There, the destruction of the selinfo object happens at driver detach time, when all the file descriptors should already be closed, thus there cannot be a race. For this case, the mfi(4) device driver can be taken as an example, as it implements fully correct logic for preventing this from happening. 
Sponsored by: Sandvine Incorporated Reported by: rstone Tested by: pluknet Reviewed by: jhb, kib Approved by: re (bz) MFC after: 3 weeks --- share/man/man9/Makefile | 2 +- share/man/man9/selrecord.9 | 21 +++++++++++++++++++-- sys/kern/kern_event.c | 1 + sys/kern/sys_generic.c | 17 +++++++++++++++++ sys/kern/sys_pipe.c | 1 + sys/kern/tty.c | 2 ++ sys/kern/tty_pts.c | 2 ++ sys/kern/uipc_mqueue.c | 2 ++ sys/kern/uipc_socket.c | 2 ++ sys/kern/vfs_subr.c | 1 + sys/net/bpf.c | 2 +- sys/net/if_tap.c | 1 + sys/net/if_tun.c | 1 + sys/security/audit/audit_pipe.c | 1 + sys/sys/selinfo.h | 1 + sys/x86/acpica/acpi_apm.c | 1 + 16 files changed, 54 insertions(+), 4 deletions(-) diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index a04cb684a67f..05bab57581c2 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -1075,7 +1075,7 @@ MLINKS+=scheduler.9 curpriority_cmp.9 \ scheduler.9 setrunnable.9 \ scheduler.9 updatepri.9 MLINKS+=securelevel_gt.9 securelevel_ge.9 -MLINKS+=selrecord.9 selwakeup.9 +MLINKS+=seldrain.9 selrecord.9 selwakeup.9 MLINKS+=sema.9 sema_destroy.9 \ sema.9 sema_init.9 \ sema.9 sema_post.9 \ diff --git a/share/man/man9/selrecord.9 b/share/man/man9/selrecord.9 index 15c5a18cf444..f7a0113fc413 100644 --- a/share/man/man9/selrecord.9 +++ b/share/man/man9/selrecord.9 @@ -26,10 +26,11 @@ .\" .\" $FreeBSD$ .\" -.Dd June 13, 2007 +.Dd August 25, 2011 .Dt SELRECORD 9 .Os .Sh NAME +.Nm seldrain , .Nm selrecord , .Nm selwakeup .Nd "record and wakeup select requests" @@ -37,14 +38,17 @@ .In sys/param.h .In sys/selinfo.h .Ft void +.Fn seldrain "struct selinfo *sip" +.Ft void .Fn selrecord "struct thread *td" "struct selinfo *sip" .Ft void .Fn selwakeup "struct selinfo *sip" .Sh DESCRIPTION +.Fn seldrain , .Fn selrecord and .Fn selwakeup -are the two central functions used by +are the three central functions used by .Xr select 2 , .Xr poll 2 and the objects that are being selected on. @@ -86,6 +90,15 @@ and .Xr poll 2 when they wake up. 
.Pp +.Fn seldrain +will flush the waiters queue on a specified object before its +destruction. +The object handling code must ensure that +.Fa *sip +cannot be used once +.Fn seldrain +has been called. +.Pp The contents of .Fa *sip must be zeroed, such as by softc initialization, before any call to @@ -98,6 +111,10 @@ acquires and releases .Va sellock and may acquire and release .Va sched_lock . +.Fn seldrain +could usually be just a wrapper for +.Fn selwakeup , +but consumers should not generally rely on this feature. .Sh SEE ALSO .Xr poll 2 , .Xr select 2 diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index c512b0ac734c..dc11411fd5df 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1704,6 +1704,7 @@ kqueue_close(struct file *fp, struct thread *td) SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list); FILEDESC_XUNLOCK(fdp); + seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); kq->kq_fdp = NULL; diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 6edd4fbdeeee..7b45efa2ff51 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1490,6 +1490,23 @@ selfdfree(struct seltd *stp, struct selfd *sfp) uma_zfree(selfd_zone, sfp); } +/* Drain the waiters tied to all the selfd belonging the specified selinfo. */ +void +seldrain(sip) + struct selinfo *sip; +{ + + /* + * This feature is already provided by doselwakeup(), thus it is + * enough to go for it. + * Eventually, the context, should take care to avoid races + * between thread calling select()/poll() and file descriptor + * detaching, but, again, the races are just the same as + * selwakeup(). + */ + doselwakeup(sip, -1); +} + /* * Record a select request. 
*/ diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index 14e12075f1d0..c44a2c964e49 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1517,6 +1517,7 @@ pipeclose(cpipe) */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; + seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 77c02dd57dd7..ce49f972b2fc 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -1022,6 +1022,8 @@ tty_dealloc(void *arg) MPASS(ttyinq_getsize(&tp->t_inq) == 0); MPASS(ttyoutq_getsize(&tp->t_outq) == 0); + seldrain(&tp->t_inpoll); + seldrain(&tp->t_outpoll); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index cf9f94d375d4..f2f5c4e71bd8 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -688,6 +688,8 @@ ptsdrv_free(void *softc) racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); + seldrain(&psc->pts_inpoll); + seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index fbd78c16a288..b91b890e8a27 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1562,6 +1562,8 @@ mqueue_free(struct mqueue *mq) } mtx_destroy(&mq->mq_mutex); + seldrain(&mq->mq_rsel); + seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 990c6bad11f8..bbd4fad38749 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -661,6 +661,8 @@ sofree(struct socket *so) */ sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); + seldrain(&so->so_snd.sb_sel); + seldrain(&so->so_rcv.sb_sel); knlist_destroy(&so->so_rcv.sb_sel.si_note); knlist_destroy(&so->so_snd.sb_sel.si_note); sodealloc(so); diff --git 
a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a9fe8d10773d..325ca99deb32 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -3312,6 +3312,7 @@ vbusy(struct vnode *vp) static void destroy_vpollinfo(struct vpollinfo *vi) { + seldrain(&vi->vpi_selinfo); knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); diff --git a/sys/net/bpf.c b/sys/net/bpf.c index e5165731dae7..79c77a93278c 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -652,10 +652,10 @@ bpf_dtor(void *data) if (d->bd_bif) bpf_detachd(d); mtx_unlock(&bpf_mtx); - selwakeuppri(&d->bd_sel, PRINET); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ + seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpf_freed(d); diff --git a/sys/net/if_tap.c b/sys/net/if_tap.c index ad29da044997..08c669ab3b89 100644 --- a/sys/net/if_tap.c +++ b/sys/net/if_tap.c @@ -214,6 +214,7 @@ tap_destroy(struct tap_softc *tp) KASSERT(!(tp->tap_flags & TAP_OPEN), ("%s flags is out of sync", ifp->if_xname)); + seldrain(&tp->tap_rsel); knlist_destroy(&tp->tap_rsel.si_note); destroy_dev(tp->tap_dev); ether_ifdetach(ifp); diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index d74c9fec2852..c5328848c7b7 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -259,6 +259,7 @@ tun_destroy(struct tun_softc *tp) if_detach(TUN2IFP(tp)); if_free(TUN2IFP(tp)); destroy_dev(dev); + seldrain(&tp->tun_rsel); knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); diff --git a/sys/security/audit/audit_pipe.c b/sys/security/audit/audit_pipe.c index a8db1135db76..a953eb0200da 100644 --- a/sys/security/audit/audit_pipe.c +++ b/sys/security/audit/audit_pipe.c @@ -646,6 +646,7 @@ audit_pipe_free(struct audit_pipe *ap) cv_destroy(&ap->ap_cv); AUDIT_PIPE_SX_LOCK_DESTROY(ap); AUDIT_PIPE_LOCK_DESTROY(ap); + seldrain(&ap->ap_selinfo); knlist_destroy(&ap->ap_selinfo.si_note); TAILQ_REMOVE(&audit_pipe_list, ap, ap_list); free(ap, 
M_AUDIT_PIPE); diff --git a/sys/sys/selinfo.h b/sys/sys/selinfo.h index 2d2f8485e772..590d184ad17c 100644 --- a/sys/sys/selinfo.h +++ b/sys/sys/selinfo.h @@ -51,6 +51,7 @@ struct selinfo { #define SEL_WAITING(si) (!TAILQ_EMPTY(&(si)->si_tdlist)) #ifdef _KERNEL +void seldrain(struct selinfo *sip); void selrecord(struct thread *selector, struct selinfo *sip); void selwakeup(struct selinfo *sip); void selwakeuppri(struct selinfo *sip, int pri); diff --git a/sys/x86/acpica/acpi_apm.c b/sys/x86/acpica/acpi_apm.c index 02be6e01fe5d..776b1be602a1 100644 --- a/sys/x86/acpica/acpi_apm.c +++ b/sys/x86/acpica/acpi_apm.c @@ -297,6 +297,7 @@ apmclose(struct cdev *dev, int flag, int fmt, struct thread *td) /* Remove this clone's data from the list and free it. */ ACPI_LOCK(acpi); STAILQ_REMOVE(&acpi_sc->apm_cdevs, clone, apm_clone_data, entries); + seldrain(&clone->sel_read); knlist_destroy(&clone->sel_read.si_note); ACPI_UNLOCK(acpi); free(clone, M_APMDEV);