Refactor select to reduce contention and hide internal implementation

details from consumers.

 - Track individual selecters on a per-descriptor basis such that there
   are no longer collisions and after sleeping for events only those
   descriptors which triggered events must be rescaned.
 - Protect the selinfo (per descriptor) structure with a mtx pool mutex.
   mtx pool mutexes were chosen to preserve api compatibility with
   existing code which does nothing but bzero() to setup selinfo
   structures.
 - Use a per-thread wait channel rather than a global wait channel.
 - Hide select implementation details in a seltd structure which is
   opaque to the rest of the kernel.
 - Provide a 'selsocket' interface for those kernel consumers who wish to
   select on a socket when they have no fd so they no longer have to
   be aware of select implementation details.

Tested by:	kris
Reviewed on:	arch
This commit is contained in:
Jeff Roberson 2007-12-16 06:21:20 +00:00
parent 53bfc2ecaf
commit ace8398da0
13 changed files with 452 additions and 380 deletions

View File

@ -1400,7 +1400,8 @@ kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
revents |= events & (POLLIN | POLLRDNORM); revents |= events & (POLLIN | POLLRDNORM);
} else { } else {
selrecord(td, &kq->kq_sel); selrecord(td, &kq->kq_sel);
kq->kq_state |= KQ_SEL; if (SEL_WAITING(&kq->kq_sel))
kq->kq_state |= KQ_SEL;
} }
} }
kqueue_release(kq, 1); kqueue_release(kq, 1);
@ -1486,8 +1487,9 @@ kqueue_close(struct file *fp, struct thread *td)
} }
if ((kq->kq_state & KQ_SEL) == KQ_SEL) { if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
selwakeuppri(&kq->kq_sel, PSOCK); selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
} }
KQ_UNLOCK(kq); KQ_UNLOCK(kq);
@ -1522,8 +1524,9 @@ kqueue_wakeup(struct kqueue *kq)
wakeup(kq); wakeup(kq);
} }
if ((kq->kq_state & KQ_SEL) == KQ_SEL) { if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
selwakeuppri(&kq->kq_sel, PSOCK); selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
} }
if (!knlist_empty(&kq->kq_sel.si_note)) if (!knlist_empty(&kq->kq_sel.si_note))
kqueue_schedtask(kq); kqueue_schedtask(kq);

View File

@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h> #include <sys/sysctl.h>
#include <sys/sched.h> #include <sys/sched.h>
#include <sys/sleepqueue.h> #include <sys/sleepqueue.h>
#include <sys/selinfo.h>
#include <sys/turnstile.h> #include <sys/turnstile.h>
#include <sys/ktr.h> #include <sys/ktr.h>
#include <sys/umtx.h> #include <sys/umtx.h>
@ -206,6 +207,7 @@ thread_fini(void *mem, int size)
turnstile_free(td->td_turnstile); turnstile_free(td->td_turnstile);
sleepq_free(td->td_sleepqueue); sleepq_free(td->td_sleepqueue);
umtx_thread_fini(td); umtx_thread_fini(td);
seltdfini(td);
} }
/* /*

View File

@ -69,17 +69,59 @@ __FBSDID("$FreeBSD$");
#include <sys/ktrace.h> #include <sys/ktrace.h>
#endif #endif
#include <sys/ktr.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's"); MALLOC_DEFINE(M_IOV, "iov", "large iov's");
static int pollscan(struct thread *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int);
static int pollrescan(struct thread *);
static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selscan(struct thread *, fd_mask **, fd_mask **, int);
static int selrescan(struct thread *, fd_mask **, fd_mask **);
static void selfdalloc(struct thread *, void *);
static void selfdfree(struct seltd *, struct selfd *);
static int dofileread(struct thread *, int, struct file *, struct uio *, static int dofileread(struct thread *, int, struct file *, struct uio *,
off_t, int); off_t, int);
static int dofilewrite(struct thread *, int, struct file *, struct uio *, static int dofilewrite(struct thread *, int, struct file *, struct uio *,
off_t, int); off_t, int);
static void doselwakeup(struct selinfo *, int); static void doselwakeup(struct selinfo *, int);
static void seltdinit(struct thread *);
static int seltdwait(struct thread *, int);
static void seltdclear(struct thread *);
/*
* One seltd per-thread allocated on demand as needed.
*
* t - protected by st_mtx
* k - Only accessed by curthread or read-only
*/
struct seltd {
STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */
struct selfd *st_free1; /* (k) free fd for read set. */
struct selfd *st_free2; /* (k) free fd for write set. */
struct mtx st_mtx; /* Protects struct seltd */
struct cv st_wait; /* (t) Wait channel. */
int st_flags; /* (t) SELTD_ flags. */
};
#define SELTD_PENDING 0x0001 /* We have pending events. */
#define SELTD_RESCAN 0x0002 /* Doing a rescan. */
/*
* One selfd allocated per-thread per-file-descriptor.
* f - protected by sf_mtx
*/
struct selfd {
STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */
TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */
struct selinfo *sf_si; /* (f) selinfo when linked. */
struct mtx *sf_mtx; /* Pointer to selinfo mtx. */
struct seltd *sf_td; /* (k) owning seltd. */
void *sf_cookie; /* (k) fd or pollfd. */
};
static uma_zone_t selfd_zone;
#ifndef _SYS_SYSPROTO_H_ #ifndef _SYS_SYSPROTO_H_
struct read_args { struct read_args {
@ -629,14 +671,6 @@ out:
return (error); return (error);
} }
/*
* sellock and selwait are initialized in selectinit() via SYSINIT.
*/
struct mtx sellock;
struct cv selwait;
u_int nselcoll; /* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
#ifndef _SYS_SYSPROTO_H_ #ifndef _SYS_SYSPROTO_H_
struct select_args { struct select_args {
int nd; int nd;
@ -678,7 +712,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
fd_mask *ibits[3], *obits[3], *selbits, *sbp; fd_mask *ibits[3], *obits[3], *selbits, *sbp;
struct timeval atv, rtv, ttv; struct timeval atv, rtv, ttv;
int error, timo; int error, timo;
u_int ncoll, nbufbytes, ncpbytes, nfdbits; u_int nbufbytes, ncpbytes, nfdbits;
if (nd < 0) if (nd < 0)
return (EINVAL); return (EINVAL);
@ -723,7 +757,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
sbp += ncpbytes / sizeof *sbp; \ sbp += ncpbytes / sizeof *sbp; \
error = copyin(name, ibits[x], ncpbytes); \ error = copyin(name, ibits[x], ncpbytes); \
if (error != 0) \ if (error != 0) \
goto done_nosellock; \ goto done; \
} \ } \
} while (0) } while (0)
getbits(fd_in, 0); getbits(fd_in, 0);
@ -737,7 +771,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
atv = *tvp; atv = *tvp;
if (itimerfix(&atv)) { if (itimerfix(&atv)) {
error = EINVAL; error = EINVAL;
goto done_nosellock; goto done;
} }
getmicrouptime(&rtv); getmicrouptime(&rtv);
timevaladd(&atv, &rtv); timevaladd(&atv, &rtv);
@ -746,58 +780,31 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
atv.tv_usec = 0; atv.tv_usec = 0;
} }
timo = 0; timo = 0;
TAILQ_INIT(&td->td_selq); seltdinit(td);
mtx_lock(&sellock); /* Iterate until the timeout expires or descriptors become ready. */
retry: for (;;) {
ncoll = nselcoll; error = selscan(td, ibits, obits, nd);
thread_lock(td); if (error || td->td_retval[0] != 0)
td->td_flags |= TDF_SELECT; break;
thread_unlock(td); if (atv.tv_sec || atv.tv_usec) {
mtx_unlock(&sellock); getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
error = selscan(td, ibits, obits, nd); break;
mtx_lock(&sellock); ttv = atv;
if (error || td->td_retval[0]) timevalsub(&ttv, &rtv);
goto done; timo = ttv.tv_sec > 24 * 60 * 60 ?
if (atv.tv_sec || atv.tv_usec) { 24 * 60 * 60 * hz : tvtohz(&ttv);
getmicrouptime(&rtv); }
if (timevalcmp(&rtv, &atv, >=)) error = seltdwait(td, timo);
goto done; if (error)
ttv = atv; break;
timevalsub(&ttv, &rtv); error = selrescan(td, ibits, obits);
timo = ttv.tv_sec > 24 * 60 * 60 ? if (error || td->td_retval[0] != 0)
24 * 60 * 60 * hz : tvtohz(&ttv); break;
} }
seltdclear(td);
/*
* An event of interest may occur while we do not hold
* sellock, so check TDF_SELECT and the number of
* collisions and rescan the file descriptors if
* necessary.
*/
thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
thread_unlock(td);
goto retry;
}
thread_unlock(td);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
else
error = cv_wait_sig(&selwait, &sellock);
if (error == 0)
goto retry;
done: done:
clear_selinfo_list(td);
thread_lock(td);
td->td_flags &= ~TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
done_nosellock:
/* select is not restarted after signals... */ /* select is not restarted after signals... */
if (error == ERESTART) if (error == ERESTART)
error = EINTR; error = EINTR;
@ -820,6 +827,60 @@ done_nosellock:
return (error); return (error);
} }
/*
* Traverse the list of fds attached to this thread's seltd and check for
* completion.
*/
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct selinfo *si;
struct file *fp;
int msk, fd;
int n = 0;
/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
struct filedesc *fdp = td->td_proc->p_fd;
stp = td->td_sel;
FILEDESC_SLOCK(fdp);
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (int)(uintptr_t)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
if ((fp = fget_locked(fdp, fd)) == NULL) {
FILEDESC_SUNLOCK(fdp);
return (EBADF);
}
for (msk = 0; msk < 3; msk++) {
if (ibits[msk] == NULL)
continue;
if ((ibits[msk][fd/NFDBITS] &
((fd_mask) 1 << (fd % NFDBITS))) == 0)
continue;
if (fo_poll(fp, flag[msk], td->td_ucred, td)) {
obits[msk][(fd)/NFDBITS] |=
((fd_mask)1 << ((fd) % NFDBITS));
n++;
}
}
}
FILEDESC_SUNLOCK(fdp);
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
/*
* Perform the initial filedescriptor scan and register ourselves with
* each selinfo.
*/
static int static int
selscan(td, ibits, obits, nfd) selscan(td, ibits, obits, nfd)
struct thread *td; struct thread *td;
@ -848,6 +909,7 @@ selscan(td, ibits, obits, nfd)
FILEDESC_SUNLOCK(fdp); FILEDESC_SUNLOCK(fdp);
return (EBADF); return (EBADF);
} }
selfdalloc(td, (void *)(uintptr_t)fd);
if (fo_poll(fp, flag[msk], td->td_ucred, if (fo_poll(fp, flag[msk], td->td_ucred,
td)) { td)) {
obits[msk][(fd)/NFDBITS] |= obits[msk][(fd)/NFDBITS] |=
@ -878,7 +940,7 @@ poll(td, uap)
struct pollfd smallbits[32]; struct pollfd smallbits[32];
struct timeval atv, rtv, ttv; struct timeval atv, rtv, ttv;
int error = 0, timo; int error = 0, timo;
u_int ncoll, nfds; u_int nfds;
size_t ni; size_t ni;
nfds = uap->nfds; nfds = uap->nfds;
@ -894,8 +956,7 @@ poll(td, uap)
if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
(nfds > FD_SETSIZE)) { (nfds > FD_SETSIZE)) {
PROC_UNLOCK(td->td_proc); PROC_UNLOCK(td->td_proc);
error = EINVAL; return (EINVAL);
goto done2;
} }
PROC_UNLOCK(td->td_proc); PROC_UNLOCK(td->td_proc);
ni = nfds * sizeof(struct pollfd); ni = nfds * sizeof(struct pollfd);
@ -905,13 +966,13 @@ poll(td, uap)
bits = smallbits; bits = smallbits;
error = copyin(uap->fds, bits, ni); error = copyin(uap->fds, bits, ni);
if (error) if (error)
goto done_nosellock; goto done;
if (uap->timeout != INFTIM) { if (uap->timeout != INFTIM) {
atv.tv_sec = uap->timeout / 1000; atv.tv_sec = uap->timeout / 1000;
atv.tv_usec = (uap->timeout % 1000) * 1000; atv.tv_usec = (uap->timeout % 1000) * 1000;
if (itimerfix(&atv)) { if (itimerfix(&atv)) {
error = EINVAL; error = EINVAL;
goto done_nosellock; goto done;
} }
getmicrouptime(&rtv); getmicrouptime(&rtv);
timevaladd(&atv, &rtv); timevaladd(&atv, &rtv);
@ -920,56 +981,31 @@ poll(td, uap)
atv.tv_usec = 0; atv.tv_usec = 0;
} }
timo = 0; timo = 0;
TAILQ_INIT(&td->td_selq); seltdinit(td);
mtx_lock(&sellock); /* Iterate until the timeout expires or descriptors become ready. */
retry: for (;;) {
ncoll = nselcoll; error = pollscan(td, bits, nfds);
thread_lock(td); if (error || td->td_retval[0] != 0)
td->td_flags |= TDF_SELECT; break;
thread_unlock(td); if (atv.tv_sec || atv.tv_usec) {
mtx_unlock(&sellock); getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
error = pollscan(td, bits, nfds); break;
mtx_lock(&sellock); ttv = atv;
if (error || td->td_retval[0]) timevalsub(&ttv, &rtv);
goto done; timo = ttv.tv_sec > 24 * 60 * 60 ?
if (atv.tv_sec || atv.tv_usec) { 24 * 60 * 60 * hz : tvtohz(&ttv);
getmicrouptime(&rtv); }
if (timevalcmp(&rtv, &atv, >=)) error = seltdwait(td, timo);
goto done; if (error)
ttv = atv; break;
timevalsub(&ttv, &rtv); error = pollrescan(td);
timo = ttv.tv_sec > 24 * 60 * 60 ? if (error || td->td_retval[0] != 0)
24 * 60 * 60 * hz : tvtohz(&ttv); break;
} }
/* seltdclear(td);
* An event of interest may occur while we do not hold
* sellock, so check TDF_SELECT and the number of collisions
* and rescan the file descriptors if necessary.
*/
thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
thread_unlock(td);
goto retry;
}
thread_unlock(td);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
else
error = cv_wait_sig(&selwait, &sellock);
if (error == 0)
goto retry;
done: done:
clear_selinfo_list(td);
thread_lock(td);
td->td_flags &= ~TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
done_nosellock:
/* poll is not restarted after signals... */ /* poll is not restarted after signals... */
if (error == ERESTART) if (error == ERESTART)
error = EINTR; error = EINTR;
@ -983,17 +1019,60 @@ done_nosellock:
out: out:
if (ni > sizeof(smallbits)) if (ni > sizeof(smallbits))
free(bits, M_TEMP); free(bits, M_TEMP);
done2:
return (error); return (error);
} }
static int
pollrescan(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct selinfo *si;
struct filedesc *fdp;
struct file *fp;
struct pollfd *fd;
int n;
n = 0;
fdp = td->td_proc->p_fd;
stp = td->td_sel;
FILEDESC_SLOCK(fdp);
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (struct pollfd *)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
fp = fdp->fd_ofiles[fd->fd];
if (fp == NULL) {
fd->revents = POLLNVAL;
n++;
continue;
}
/*
* Note: backend also returns POLLHUP and
* POLLERR if appropriate.
*/
fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
if (fd->revents != 0)
n++;
}
FILEDESC_SUNLOCK(fdp);
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
static int static int
pollscan(td, fds, nfd) pollscan(td, fds, nfd)
struct thread *td; struct thread *td;
struct pollfd *fds; struct pollfd *fds;
u_int nfd; u_int nfd;
{ {
register struct filedesc *fdp = td->td_proc->p_fd; struct filedesc *fdp = td->td_proc->p_fd;
int i; int i;
struct file *fp; struct file *fp;
int n = 0; int n = 0;
@ -1015,6 +1094,7 @@ pollscan(td, fds, nfd)
* Note: backend also returns POLLHUP and * Note: backend also returns POLLHUP and
* POLLERR if appropriate. * POLLERR if appropriate.
*/ */
selfdalloc(td, fds);
fds->revents = fo_poll(fp, fds->events, fds->revents = fo_poll(fp, fds->events,
td->td_ucred, td); td->td_ucred, td);
if (fds->revents != 0) if (fds->revents != 0)
@ -1048,23 +1128,90 @@ openbsd_poll(td, uap)
} }
/* /*
* Remove the references to the thread from all of the objects we were * XXX This was created specifically to support netncp and netsmb. This
* polling. * allows the caller to specify a socket to wait for events on. It returns
* * 0 if any events matched and an error otherwise. There is no way to
* This code assumes that the underlying owner of the selinfo structure will * determine which events fired.
* hold sellock before it changes it, and that it will unlink itself from our
* list if it goes away.
*/ */
void int
clear_selinfo_list(td) selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
struct thread *td;
{ {
struct selinfo *si; struct timeval atv, rtv, ttv;
int error, timo;
mtx_assert(&sellock, MA_OWNED); if (tvp != NULL) {
TAILQ_FOREACH(si, &td->td_selq, si_thrlist) atv = *tvp;
si->si_thread = NULL; if (itimerfix(&atv))
TAILQ_INIT(&td->td_selq); return (EINVAL);
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/*
* Iterate until the timeout expires or the socket becomes ready.
*/
for (;;) {
selfdalloc(td, NULL);
error = sopoll(so, events, NULL, td);
/* error here is actually the ready events. */
if (error)
return (0);
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=)) {
seltdclear(td);
return (EWOULDBLOCK);
}
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
seltdclear(td);
if (error)
break;
}
/* XXX Duplicates ncp/smb behavior. */
if (error == ERESTART)
error = 0;
return (error);
}
/*
* Preallocate two selfds associated with 'cookie'. Some fo_poll routines
* have two select sets, one for read and another for write.
*/
static void
selfdalloc(struct thread *td, void *cookie)
{
struct seltd *stp;
stp = td->td_sel;
if (stp->st_free1 == NULL)
stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free1->sf_td = stp;
stp->st_free1->sf_cookie = cookie;
if (stp->st_free2 == NULL)
stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free2->sf_td = stp;
stp->st_free2->sf_cookie = cookie;
}
static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
mtx_lock(sfp->sf_mtx);
if (sfp->sf_si)
TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
mtx_unlock(sfp->sf_mtx);
uma_zfree(selfd_zone, sfp);
} }
/* /*
@ -1075,26 +1222,46 @@ selrecord(selector, sip)
struct thread *selector; struct thread *selector;
struct selinfo *sip; struct selinfo *sip;
{ {
struct selfd *sfp;
struct seltd *stp;
struct mtx *mtxp;
mtx_lock(&sellock); stp = selector->td_sel;
/* /*
* If the selinfo's thread pointer is NULL then take ownership of it. * Don't record when doing a rescan.
*
* If the thread pointer is not NULL and it points to another
* thread, then we have a collision.
*
* If the thread pointer is not NULL and points back to us then leave
* it alone as we've already added pointed it at us and added it to
* our list.
*/ */
if (sip->si_thread == NULL) { if (stp->st_flags & SELTD_RESCAN)
sip->si_thread = selector; return;
TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); /*
} else if (sip->si_thread != selector) { * Grab one of the preallocated descriptors.
sip->si_flags |= SI_COLL; */
sfp = NULL;
if ((sfp = stp->st_free1) != NULL)
stp->st_free1 = NULL;
else if ((sfp = stp->st_free2) != NULL)
stp->st_free2 = NULL;
else
panic("selrecord: No free selfd on selq");
mtxp = mtx_pool_find(mtxpool_sleep, sip);
/*
* Initialize the sfp and queue it in the thread.
*/
sfp->sf_si = sip;
sfp->sf_mtx = mtxp;
STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
/*
* Now that we've locked the sip, check for initialization.
*/
mtx_lock(mtxp);
if (sip->si_mtx == NULL) {
sip->si_mtx = mtxp;
TAILQ_INIT(&sip->si_tdlist);
} }
/*
mtx_unlock(&sellock); * Add this thread to the list of selfds listening on this selinfo.
*/
TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
mtx_unlock(sip->si_mtx);
} }
/* Wake up a selecting thread. */ /* Wake up a selecting thread. */
@ -1122,36 +1289,115 @@ doselwakeup(sip, pri)
struct selinfo *sip; struct selinfo *sip;
int pri; int pri;
{ {
struct thread *td; struct selfd *sfp;
struct selfd *sfn;
struct seltd *stp;
mtx_lock(&sellock); /* If it's not initialized there can't be any waiters. */
td = sip->si_thread; if (sip->si_mtx == NULL)
if ((sip->si_flags & SI_COLL) != 0) {
nselcoll++;
sip->si_flags &= ~SI_COLL;
cv_broadcastpri(&selwait, pri);
}
if (td == NULL) {
mtx_unlock(&sellock);
return; return;
/*
* Locking the selinfo locks all selfds associated with it.
*/
mtx_lock(sip->si_mtx);
TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
/*
* Once we remove this sfp from the list and clear the
* sf_si seltdclear will know to ignore this si.
*/
TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
sfp->sf_si = NULL;
stp = sfp->sf_td;
mtx_lock(&stp->st_mtx);
stp->st_flags |= SELTD_PENDING;
cv_broadcastpri(&stp->st_wait, pri);
mtx_unlock(&stp->st_mtx);
} }
TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); mtx_unlock(sip->si_mtx);
sip->si_thread = NULL; }
thread_lock(td);
td->td_flags &= ~TDF_SELECT; static void
thread_unlock(td); seltdinit(struct thread *td)
sleepq_remove(td, &selwait); {
mtx_unlock(&sellock); struct seltd *stp;
if ((stp = td->td_sel) != NULL)
goto out;
td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
cv_init(&stp->st_wait, "select");
out:
stp->st_flags = 0;
STAILQ_INIT(&stp->st_selq);
}
static int
seltdwait(struct thread *td, int timo)
{
struct seltd *stp;
int error;
stp = td->td_sel;
/*
* An event of interest may occur while we do not hold the seltd
* locked so check the pending flag before we sleep.
*/
mtx_lock(&stp->st_mtx);
/*
* Any further calls to selrecord will be a rescan.
*/
stp->st_flags |= SELTD_RESCAN;
if (stp->st_flags & SELTD_PENDING) {
mtx_unlock(&stp->st_mtx);
return (0);
}
if (timo > 0)
error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
else
error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
mtx_unlock(&stp->st_mtx);
return (error);
}
void
seltdfini(struct thread *td)
{
struct seltd *stp;
stp = td->td_sel;
if (stp == NULL)
return;
if (stp->st_free1)
uma_zfree(selfd_zone, stp->st_free1);
if (stp->st_free2)
uma_zfree(selfd_zone, stp->st_free2);
td->td_sel = NULL;
free(stp, M_SELECT);
}
/*
* Remove the references to the thread from all of the objects we were
* polling.
*/
static void
seltdclear(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
stp = td->td_sel;
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
selfdfree(stp, sfp);
stp->st_flags = 0;
} }
static void selectinit(void *); static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
/* ARGSUSED*/
static void static void
selectinit(dummy) selectinit(void *dummy __unused)
void *dummy;
{ {
cv_init(&selwait, "select"); selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
mtx_init(&sellock, "sellck", NULL, MTX_DEF); NULL, NULL, UMA_ALIGN_PTR, 0);
} }

View File

@ -524,8 +524,9 @@ pipeselwakeup(cpipe)
PIPE_LOCK_ASSERT(cpipe, MA_OWNED); PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
if (cpipe->pipe_state & PIPE_SEL) { if (cpipe->pipe_state & PIPE_SEL) {
cpipe->pipe_state &= ~PIPE_SEL;
selwakeuppri(&cpipe->pipe_sel, PSOCK); selwakeuppri(&cpipe->pipe_sel, PSOCK);
if (!SEL_WAITING(&cpipe->pipe_sel))
cpipe->pipe_state &= ~PIPE_SEL;
} }
if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
pgsigio(&cpipe->pipe_sigio, SIGIO, 0); pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
@ -1354,12 +1355,14 @@ pipe_poll(fp, events, active_cred, td)
if (revents == 0) { if (revents == 0) {
if (events & (POLLIN | POLLRDNORM)) { if (events & (POLLIN | POLLRDNORM)) {
selrecord(td, &rpipe->pipe_sel); selrecord(td, &rpipe->pipe_sel);
rpipe->pipe_state |= PIPE_SEL; if (SEL_WAITING(&rpipe->pipe_sel))
rpipe->pipe_state |= PIPE_SEL;
} }
if (events & (POLLOUT | POLLWRNORM)) { if (events & (POLLOUT | POLLWRNORM)) {
selrecord(td, &wpipe->pipe_sel); selrecord(td, &wpipe->pipe_sel);
wpipe->pipe_state |= PIPE_SEL; if (SEL_WAITING(&wpipe->pipe_sel))
wpipe->pipe_state |= PIPE_SEL;
} }
} }
#ifdef MAC #ifdef MAC

View File

@ -176,7 +176,8 @@ sowakeup(struct socket *so, struct sockbuf *sb)
SOCKBUF_LOCK_ASSERT(sb); SOCKBUF_LOCK_ASSERT(sb);
selwakeuppri(&sb->sb_sel, PSOCK); selwakeuppri(&sb->sb_sel, PSOCK);
sb->sb_flags &= ~SB_SEL; if (!SEL_WAITING(&sb->sb_sel))
sb->sb_flags &= ~SB_SEL;
if (sb->sb_flags & SB_WAIT) { if (sb->sb_flags & SB_WAIT) {
sb->sb_flags &= ~SB_WAIT; sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_cc); wakeup(&sb->sb_cc);

View File

@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h> #include <sys/mbuf.h>
#include <sys/poll.h> #include <sys/poll.h>
#include <sys/proc.h> #include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h> #include <sys/uio.h>
#include <netncp/ncp.h> #include <netncp/ncp.h>
@ -274,7 +276,9 @@ ncp_request_int(struct ncp_rq *rqp)
/* /*
* Flush out replies on previous reqs * Flush out replies on previous reqs
*/ */
while (ncp_poll(so, POLLIN) != 0) { tv.tv_sec = 0;
tv.tv_usec = 0;
while (selsocket(so, POLLIN, &tv, td) == 0) {
if (ncp_sock_recv(so, &m, &len) != 0) if (ncp_sock_recv(so, &m, &len) != 0)
break; break;
m_freem(m); m_freem(m);
@ -319,7 +323,7 @@ ncp_request_int(struct ncp_rq *rqp)
} }
tv.tv_sec = conn->li.timeout; tv.tv_sec = conn->li.timeout;
tv.tv_usec = 0; tv.tv_usec = 0;
error = ncp_sock_rselect(so, td, &tv, POLLIN); error = selsocket(so, POLLIN, &tv, td);
if (error == EWOULDBLOCK ) /* timeout expired */ if (error == EWOULDBLOCK ) /* timeout expired */
continue; continue;
error = ncp_chkintr(conn, td); error = ncp_chkintr(conn, td);
@ -335,7 +339,9 @@ ncp_request_int(struct ncp_rq *rqp)
dosend = 1; /* resend rq if error */ dosend = 1; /* resend rq if error */
for (;;) { for (;;) {
error = 0; error = 0;
if (ncp_poll(so, POLLIN) == 0) tv.tv_sec = 0;
tv.tv_usec = 0;
if (selsocket(so, POLLIN, &tv, td) != 0)
break; break;
/* if (so->so_rcv.sb_cc == 0) { /* if (so->so_rcv.sb_cc == 0) {
break; break;

View File

@ -65,7 +65,6 @@ __FBSDID("$FreeBSD$");
#define ipx_setnullhost(x) ((x).x_host.s_host[0] = 0); \ #define ipx_setnullhost(x) ((x).x_host.s_host[0] = 0); \
((x).x_host.s_host[1] = 0); ((x).x_host.s_host[2] = 0); ((x).x_host.s_host[1] = 0); ((x).x_host.s_host[2] = 0);
/*int ncp_poll(struct socket *so, int events);*/
/*static int ncp_getsockname(struct socket *so, caddr_t asa, int *alen);*/ /*static int ncp_getsockname(struct socket *so, caddr_t asa, int *alen);*/
static int ncp_soconnect(struct socket *so, struct sockaddr *target, static int ncp_soconnect(struct socket *so, struct sockaddr *target,
struct thread *td); struct thread *td);
@ -181,110 +180,6 @@ ncp_sock_send(struct socket *so, struct mbuf *top, struct ncp_rq *rqp)
return error; return error;
} }
int
ncp_poll(struct socket *so, int events)
{
struct thread *td = curthread;
int revents;
/* Fake up enough state to look like we are in poll(2). */
mtx_lock(&sellock);
thread_lock(td);
td->td_flags |= TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
TAILQ_INIT(&td->td_selq);
revents = sopoll(so, events, NULL, td);
/* Tear down the fake poll(2) state. */
mtx_lock(&sellock);
clear_selinfo_list(td);
thread_lock(td);
td->td_flags &= ~TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
return (revents);
}
int
ncp_sock_rselect(struct socket *so, struct thread *td, struct timeval *tv,
int events)
{
struct timeval atv, rtv, ttv;
int ncoll, timo, error, revents;
if (tv) {
atv = *tv;
if (itimerfix(&atv)) {
error = EINVAL;
goto done_noproclock;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
}
timo = 0;
mtx_lock(&sellock);
retry:
ncoll = nselcoll;
thread_lock(td);
td->td_flags |= TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
TAILQ_INIT(&td->td_selq);
revents = sopoll(so, events, NULL, td);
mtx_lock(&sellock);
if (revents) {
error = 0;
goto done;
}
if (tv) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=)) {
error = EWOULDBLOCK;
goto done;
}
ttv = atv;
timevalsub(&ttv, &rtv);
timo = tvtohz(&ttv);
}
/*
* An event of our interest may occur during locking a thread.
* In order to avoid missing the event that occurred during locking
* the process, test TDF_SELECT and rescan file descriptors if
* necessary.
*/
thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
thread_unlock(td);
goto retry;
}
thread_unlock(td);
if (timo > 0)
error = cv_timedwait(&selwait, &sellock, timo);
else {
cv_wait(&selwait, &sellock);
error = 0;
}
done:
clear_selinfo_list(td);
thread_lock(td);
td->td_flags &= ~TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
done_noproclock:
if (error == ERESTART)
error = 0;
return (error);
}
/* /*
* Connect to specified server via IPX * Connect to specified server via IPX
*/ */

View File

@ -45,9 +45,6 @@ int ncp_sock_connect(struct ncp_conn *ncp);
int ncp_sock_recv(struct socket *so, struct mbuf **mp, int *rlen); int ncp_sock_recv(struct socket *so, struct mbuf **mp, int *rlen);
int ncp_sock_send(struct socket *so, struct mbuf *data, struct ncp_rq *rqp); int ncp_sock_send(struct socket *so, struct mbuf *data, struct ncp_rq *rqp);
int ncp_sock_disconnect(struct ncp_conn *conn); int ncp_sock_disconnect(struct ncp_conn *conn);
int ncp_poll(struct socket *so, int events);
int ncp_sock_rselect(struct socket *so, struct thread *td, struct timeval *tv,
int events);
int ncp_sock_checksum(struct ncp_conn *conn, int enable); int ncp_sock_checksum(struct ncp_conn *conn, int enable);
void ncp_check_rq(struct ncp_conn *conn); void ncp_check_rq(struct ncp_conn *conn);

View File

@ -94,84 +94,6 @@ nb_setsockopt_int(struct socket *so, int level, int name, int val)
return sosetopt(so, &sopt); return sosetopt(so, &sopt);
} }
static int
nbssn_rselect(struct nbpcb *nbp, struct timeval *tv, int events,
struct thread *td)
{
struct timeval atv, rtv, ttv;
int ncoll, timo, error, revents;
if (tv) {
atv = *tv;
if (itimerfix(&atv)) {
error = EINVAL;
goto done_noproclock;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
}
timo = 0;
mtx_lock(&sellock);
retry:
ncoll = nselcoll;
thread_lock(td);
td->td_flags |= TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
/* XXX: Should be done when the thread is initialized. */
TAILQ_INIT(&td->td_selq);
revents = sopoll(nbp->nbp_tso, events, NULL, td);
mtx_lock(&sellock);
if (revents) {
error = 0;
goto done;
}
if (tv) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=)) {
error = EWOULDBLOCK;
goto done;
}
ttv = atv;
timevalsub(&ttv, &rtv);
timo = tvtohz(&ttv);
}
/*
* An event of our interest may occur during locking a process.
* In order to avoid missing the event that occurred during locking
* the process, test P_SELECT and rescan file descriptors if
* necessary.
*/
thread_lock(td);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
thread_unlock(td);
goto retry;
}
thread_unlock(td);
if (timo > 0)
error = cv_timedwait(&selwait, &sellock, timo);
else {
cv_wait(&selwait, &sellock);
error = 0;
}
done:
clear_selinfo_list(td);
thread_lock(td);
td->td_flags &= ~TDF_SELECT;
thread_unlock(td);
mtx_unlock(&sellock);
done_noproclock:
if (error == ERESTART)
return 0;
return error;
}
static int static int
nb_intr(struct nbpcb *nbp, struct proc *p) nb_intr(struct nbpcb *nbp, struct proc *p)
{ {
@ -302,7 +224,7 @@ nbssn_rq_request(struct nbpcb *nbp, struct thread *td)
if (error) if (error)
return error; return error;
TIMESPEC_TO_TIMEVAL(&tv, &nbp->nbp_timo); TIMESPEC_TO_TIMEVAL(&tv, &nbp->nbp_timo);
error = nbssn_rselect(nbp, &tv, POLLIN, td); error = selsocket(nbp->nbp_tso, POLLIN, &tv, td);
if (error == EWOULDBLOCK) { /* Timeout */ if (error == EWOULDBLOCK) { /* Timeout */
NBDEBUG("initial request timeout\n"); NBDEBUG("initial request timeout\n");
return ETIMEDOUT; return ETIMEDOUT;

View File

@ -142,7 +142,6 @@ struct pargs {
* m - Giant * m - Giant
* n - not locked, lazy * n - not locked, lazy
* o - ktrace lock * o - ktrace lock
* p - select lock (sellock)
* q - td_contested lock * q - td_contested lock
* r - p_peers lock * r - p_peers lock
* t - thread lock * t - thread lock
@ -210,7 +209,7 @@ struct thread {
TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */
TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */
TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct seltd *td_sel; /* Select queue/channel. */
struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */
struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */
@ -322,7 +321,7 @@ do { \
#define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */
#define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */
#define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */
#define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_UNUSEDx40 0x00000040 /* --available-- */
#define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */
#define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */
#define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */

View File

@ -35,26 +35,26 @@
#include <sys/event.h> /* for struct klist */ #include <sys/event.h> /* for struct klist */
struct selfd;
TAILQ_HEAD(selfdlist, selfd);
/* /*
* Used to maintain information about processes that wish to be * Used to maintain information about processes that wish to be
* notified when I/O becomes possible. * notified when I/O becomes possible.
*/ */
struct selinfo { struct selinfo {
TAILQ_ENTRY(selinfo) si_thrlist; /* list hung off of thread */ struct selfdlist si_tdlist; /* List of sleeping threads. */
struct thread *si_thread; /* thread waiting */ struct knlist si_note; /* kernel note list */
struct knlist si_note; /* kernel note list */ struct mtx *si_mtx; /* Lock for tdlist. */
short si_flags; /* see below */
}; };
#define SI_COLL 0x0001 /* collision occurred */
#define SEL_WAITING(si) \ #define SEL_WAITING(si) (!TAILQ_EMPTY(&(si)->si_tdlist))
((si)->si_thread != NULL || ((si)->si_flags & SI_COLL) != 0)
#ifdef _KERNEL #ifdef _KERNEL
void clear_selinfo_list(struct thread *td);
void selrecord(struct thread *selector, struct selinfo *sip); void selrecord(struct thread *selector, struct selinfo *sip);
void selwakeup(struct selinfo *sip); void selwakeup(struct selinfo *sip);
void selwakeuppri(struct selinfo *sip, int pri); void selwakeuppri(struct selinfo *sip, int pri);
void seltdfini(struct thread *td);
#endif #endif
#endif /* !_SYS_SELINFO_H_ */ #endif /* !_SYS_SELINFO_H_ */

View File

@ -546,6 +546,8 @@ int sosetopt(struct socket *so, struct sockopt *sopt);
int soshutdown(struct socket *so, int how); int soshutdown(struct socket *so, int how);
void sotoxsocket(struct socket *so, struct xsocket *xso); void sotoxsocket(struct socket *so, struct xsocket *xso);
void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup(struct socket *so, struct sockbuf *sb);
int selsocket(struct socket *so, int events, struct timeval *tv,
struct thread *td);
#ifdef SOCKBUF_DEBUG #ifdef SOCKBUF_DEBUG
void sblastrecordchk(struct sockbuf *, const char *, int); void sblastrecordchk(struct sockbuf *, const char *, int);

View File

@ -56,10 +56,6 @@ extern int kstack_pages; /* number of kernel stack pages */
extern int nswap; /* size of swap space */ extern int nswap; /* size of swap space */
extern u_int nselcoll; /* select collisions since boot */
extern struct mtx sellock; /* select lock variable */
extern struct cv selwait; /* select conditional variable */
extern long physmem; /* physical memory */ extern long physmem; /* physical memory */
extern long realmem; /* 'real' memory */ extern long realmem; /* 'real' memory */