freebsd-skq/sys/kern/sys_generic.c
John Baldwin 91d5354a2c Locking for the per-process resource limits structure.
- struct plimit includes a mutex to protect a reference count.  The plimit
  structure is treated similarly to struct ucred in that it is always copy
  on write, so having a reference to a structure is sufficient to read from
  it without needing a further lock.
- The proc lock protects the p_limit pointer and must be held while reading
  limits from a process to keep the limit structure from changing out from
  under you while reading from it.
- Various global limits that are ints are not protected by a lock since
  int writes are atomic on all the archs we support and thus a lock
  wouldn't buy us anything.
- All accesses to individual resource limits from a process are abstracted
  behind a simple lim_rlimit(), lim_max(), and lim_cur() API that return
  either an rlimit, or the current or max individual limit of the specified
  resource from a process.
- dosetrlimit() was renamed to kern_setrlimit() to match existing style of
  other similar syscall helper functions.
- The alpha OSF/1 compat layer no longer calls getrlimit() and setrlimit()
  (it didn't use the stackgap when it should have) but uses lim_rlimit()
  and kern_setrlimit() instead.
- The svr4 compat no longer uses the stackgap for resource limits calls,
  but uses lim_rlimit() and kern_setrlimit() instead.
- The ibcs2 compat no longer uses the stackgap for resource limits.  It
  also no longer uses the stackgap for accessing sysctl's for the
  ibcs2_sysconf() syscall but uses kernel_sysctl() instead.  As a result,
  ibcs2_sysconf() no longer needs Giant.
- The p_rlimit macro no longer exists.

Submitted by:	mtm (mostly, I only did a few cleanups and catchups)
Tested on:	i386
Compiled on:	alpha, amd64
2004-02-04 21:52:57 +00:00

1238 lines
26 KiB
C

/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>
/*
 * Malloc types used in this file:
 *   M_IOCTLOPS - ioctl() parameter buffers too large for its stack buffer.
 *   M_SELECT   - kern_select() bit buffers too large for its stack buffer.
 *   M_IOV      - iovec arrays in readv()/writev() with more than
 *                UIO_SMALLIOV entries.  Unlike the other two it is not
 *                static, so it is visible outside this file.
 */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");
/* Forward declarations of the file-local helpers defined further down. */
static int pollscan(struct thread *, struct pollfd *, u_int);
static int selscan(struct thread *, fd_mask **, fd_mask **, int);
static int dofileread(struct thread *, struct file *, int, void *,
size_t, off_t, int);
static int dofilewrite(struct thread *, struct file *, int,
const void *, size_t, off_t, int);
static void doselwakeup(struct selinfo *, int);
/*
* Read system call.
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by read().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>, which would otherwise supply the structure).
 */
struct read_args {
int fd;		/* descriptor to read from */
void *buf;	/* user-space destination buffer */
size_t nbyte;	/* number of bytes requested */
};
#endif
/*
* MPSAFE
*/
int
read(td, uap)
struct thread *td;
struct read_args *uap;
{
struct file *fp;
int error;
if ((error = fget_read(td, uap->fd, &fp)) == 0) {
error = dofileread(td, fp, uap->fd, uap->buf,
uap->nbyte, (off_t)-1, 0);
fdrop(fp, td);
}
return(error);
}
/*
* Pread system call
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by pread().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct pread_args {
int fd;		/* descriptor to read from */
void *buf;	/* user-space destination buffer */
size_t nbyte;	/* number of bytes requested */
int pad;	/* not read by pread(); presumably alignment padding */
off_t offset;	/* file offset to read at */
};
#endif
/*
* MPSAFE
*/
int
pread(td, uap)
struct thread *td;
struct pread_args *uap;
{
struct file *fp;
int error;
if ((error = fget_read(td, uap->fd, &fp)) != 0)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else {
error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
uap->offset, FOF_OFFSET);
}
fdrop(fp, td);
return(error);
}
/*
* Code common for read and pread
*/
/*
 * Shared back end of read() and pread().
 *
 * Wraps the single user buffer in a one-element uio and calls fo_read().
 * Requests larger than INT_MAX are rejected with EINVAL.  If the read
 * fails with ERESTART, EINTR or EWOULDBLOCK after some data was already
 * transferred, the error is suppressed so the partial count is reported.
 *
 * The number of bytes transferred is stored in td->td_retval[0] (also
 * when an error is returned).  Returns 0 or an errno value.
 */
static int
dofileread(struct thread *td, struct file *fp, int fd, void *buf,
    size_t nbyte, off_t offset, int flags)
{
	struct iovec aiov;
	struct uio auio;
	long want, error;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int tracing = 0;
#endif

	if (nbyte > INT_MAX)
		return (EINVAL);

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* When tracing, keep pristine copies of the iovec and uio. */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		tracing = 1;
	}
#endif
	want = nbyte;
	error = fo_read(fp, &auio, td->td_ucred, flags, td);
	if (error != 0 && auio.uio_resid != want &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	/* From here on "want" holds the number of bytes actually moved. */
	want -= auio.uio_resid;
#ifdef KTRACE
	if (tracing && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = want;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = want;
	return (error);
}
/*
* Scatter read system call.
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by readv().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct readv_args {
int fd;			/* descriptor to read from */
struct iovec *iovp;	/* user-space array of iovecs */
u_int iovcnt;		/* number of entries in iovp */
};
#endif
/*
* MPSAFE
*/
int
readv(td, uap)
struct thread *td;
struct readv_args *uap;
{
struct file *fp;
struct uio auio;
struct iovec *iov;
struct iovec *needfree;
struct iovec aiov[UIO_SMALLIOV];
long i, cnt;
int error;
u_int iovlen;
#ifdef KTRACE
struct iovec *ktriov = NULL;
struct uio ktruio;
#endif
if ((error = fget_read(td, uap->fd, &fp)) != 0)
return (error);
needfree = NULL;
/* note: can't use iovlen until iovcnt is validated */
iovlen = uap->iovcnt * sizeof (struct iovec);
if (uap->iovcnt > UIO_SMALLIOV) {
if (uap->iovcnt > UIO_MAXIOV) {
error = EINVAL;
goto done;
}
MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
needfree = iov;
} else
iov = aiov;
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovcnt;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_offset = -1;
if ((error = copyin(uap->iovp, iov, iovlen)))
goto done;
auio.uio_resid = 0;
for (i = 0; i < uap->iovcnt; i++) {
if (iov->iov_len > INT_MAX - auio.uio_resid) {
error = EINVAL;
goto done;
}
auio.uio_resid += iov->iov_len;
iov++;
}
#ifdef KTRACE
/*
* if tracing, save a copy of iovec
*/
if (KTRPOINT(td, KTR_GENIO)) {
MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
bcopy(auio.uio_iov, ktriov, iovlen);
ktruio = auio;
}
#endif
cnt = auio.uio_resid;
if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio.uio_resid;
#ifdef KTRACE
if (ktriov != NULL) {
if (error == 0) {
ktruio.uio_iov = ktriov;
ktruio.uio_resid = cnt;
ktrgenio(uap->fd, UIO_READ, &ktruio, error);
}
FREE(ktriov, M_TEMP);
}
#endif
td->td_retval[0] = cnt;
done:
fdrop(fp, td);
if (needfree)
FREE(needfree, M_IOV);
return (error);
}
/*
* Write system call
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by write().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct write_args {
int fd;			/* descriptor to write to */
const void *buf;	/* user-space source buffer */
size_t nbyte;		/* number of bytes to write */
};
#endif
/*
* MPSAFE
*/
/*
 * write() system call handler.
 *
 * Resolves uap->fd with fget_write(); any lookup failure is reported as
 * EBADF regardless of the error fget_write() returned.  The transfer is
 * done by dofilewrite() with an offset of -1 and no flags, after which
 * the file reference is dropped.
 */
int
write(struct thread *td, struct write_args *uap)
{
	struct file *fp;
	int error;

	if (fget_write(td, uap->fd, &fp) != 0)
		return (EBADF);	/* XXX this can't be right */

	error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
	    (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}
/*
* Pwrite system call
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by pwrite().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct pwrite_args {
int fd;			/* descriptor to write to */
const void *buf;	/* user-space source buffer */
size_t nbyte;		/* number of bytes to write */
int pad;		/* not read by pwrite(); presumably alignment padding */
off_t offset;		/* file offset to write at */
};
#endif
/*
* MPSAFE
*/
int
pwrite(td, uap)
struct thread *td;
struct pwrite_args *uap;
{
struct file *fp;
int error;
if ((error = fget_write(td, uap->fd, &fp)) == 0) {
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else {
error = dofilewrite(td, fp, uap->fd, uap->buf,
uap->nbyte, uap->offset, FOF_OFFSET);
}
fdrop(fp, td);
} else {
error = EBADF; /* this can't be right */
}
return(error);
}
/*
 * Shared back end of write() and pwrite().
 *
 * Wraps the single user buffer in a one-element uio and calls
 * fo_write(), calling bwillwrite() first when the file is a vnode.
 * Requests larger than INT_MAX are rejected with EINVAL.
 *
 * ERESTART, EINTR and EWOULDBLOCK are suppressed when some data was
 * already written.  On EPIPE the process is sent SIGPIPE, except for
 * sockets, where the socket layer issues the signal itself.
 *
 * The number of bytes written is stored in td->td_retval[0] (also when
 * an error is returned).  Returns 0 or an errno value.
 */
static int
dofilewrite(struct thread *td, struct file *fp, int fd, const void *buf,
    size_t nbyte, off_t offset, int flags)
{
	struct iovec aiov;
	struct uio auio;
	long want, error;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int tracing = 0;
#endif

	if (nbyte > INT_MAX)
		return (EINVAL);

	/* The iovec base is not const-qualified, so strip the qualifier. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/* When tracing, keep pristine copies of the iovec and uio. */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		tracing = 1;
	}
#endif
	want = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	error = fo_write(fp, &auio, td->td_ucred, flags, td);
	switch (error) {
	case ERESTART:
	case EINTR:
	case EWOULDBLOCK:
		/* Report a partial write as success. */
		if (auio.uio_resid != want)
			error = 0;
		break;
	case EPIPE:
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
		break;
	default:
		break;
	}
	/* From here on "want" holds the number of bytes actually moved. */
	want -= auio.uio_resid;
#ifdef KTRACE
	if (tracing && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = want;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = want;
	return (error);
}
/*
* Gather write system call
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by writev().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct writev_args {
int fd;			/* descriptor to write to */
struct iovec *iovp;	/* user-space array of iovecs */
u_int iovcnt;		/* number of entries in iovp */
};
#endif
/*
* MPSAFE
*/
int
writev(td, uap)
struct thread *td;
register struct writev_args *uap;
{
struct file *fp;
struct uio auio;
register struct iovec *iov;
struct iovec *needfree;
struct iovec aiov[UIO_SMALLIOV];
long i, cnt, error = 0;
u_int iovlen;
#ifdef KTRACE
struct iovec *ktriov = NULL;
struct uio ktruio;
#endif
if ((error = fget_write(td, uap->fd, &fp)) != 0)
return (EBADF);
needfree = NULL;
/* note: can't use iovlen until iovcnt is validated */
iovlen = uap->iovcnt * sizeof (struct iovec);
if (uap->iovcnt > UIO_SMALLIOV) {
if (uap->iovcnt > UIO_MAXIOV) {
error = EINVAL;
goto done;
}
MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
needfree = iov;
} else
iov = aiov;
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovcnt;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_offset = -1;
if ((error = copyin(uap->iovp, iov, iovlen)))
goto done;
auio.uio_resid = 0;
for (i = 0; i < uap->iovcnt; i++) {
if (iov->iov_len > INT_MAX - auio.uio_resid) {
error = EINVAL;
goto done;
}
auio.uio_resid += iov->iov_len;
iov++;
}
#ifdef KTRACE
/*
* if tracing, save a copy of iovec and uio
*/
if (KTRPOINT(td, KTR_GENIO)) {
MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
bcopy(auio.uio_iov, ktriov, iovlen);
ktruio = auio;
}
#endif
cnt = auio.uio_resid;
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE) {
PROC_LOCK(td->td_proc);
psignal(td->td_proc, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
cnt -= auio.uio_resid;
#ifdef KTRACE
if (ktriov != NULL) {
if (error == 0) {
ktruio.uio_iov = ktriov;
ktruio.uio_resid = cnt;
ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
}
FREE(ktriov, M_TEMP);
}
#endif
td->td_retval[0] = cnt;
done:
fdrop(fp, td);
if (needfree)
FREE(needfree, M_IOV);
return (error);
}
/*
* Ioctl system call
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by ioctl().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct ioctl_args {
int fd;		/* descriptor the request is issued on */
u_long com;	/* request code; encodes direction bits and parameter size */
caddr_t data;	/* user pointer to the parameter block, or an immediate value */
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
/*
 * ioctl() system call handler.
 *
 * Looks up uap->fd, handles the close-on-exec requests (FIONCLEX and
 * FIOCLEX) directly on the descriptor table, and otherwise stages the
 * request's parameter block in a kernel buffer before passing the
 * request to fo_ioctl().  The buffer is on the stack when the encoded
 * size fits in STK_PARAMS bytes and is allocated from M_IOCTLOPS
 * otherwise.  Giant is held from just after the descriptor lookup until
 * return.
 *
 * Returns 0 or an errno value: EBADF if the file is open for neither
 * reading nor writing, ENOTTY if the encoded parameter length exceeds
 * IOCPARM_MAX, or whatever fget(), copyin(), fo_ioctl() or copyout()
 * report.
 */
int
ioctl(td, uap)
struct thread *td;
register struct ioctl_args *uap;
{
struct file *fp;
register struct filedesc *fdp;
register u_long com;
int error = 0;
register u_int size;
caddr_t data, memp;
int tmp;
#define STK_PARAMS 128
/* The "long" member is presumably there to align the on-stack buffer. */
union {
char stkbuf[STK_PARAMS];
long align;
} ubuf;
if ((error = fget(td, uap->fd, &fp)) != 0)
return (error);
mtx_lock(&Giant);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
fdrop(fp, td);
mtx_unlock(&Giant);
return (EBADF);
}
fdp = td->td_proc->p_fd;
/*
 * The close-on-exec requests only touch the per-descriptor flags and
 * are completed here without calling into the file's ioctl method.
 */
switch (com = uap->com) {
case FIONCLEX:
FILEDESC_LOCK(fdp);
fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
FILEDESC_UNLOCK(fdp);
fdrop(fp, td);
mtx_unlock(&Giant);
return (0);
case FIOCLEX:
FILEDESC_LOCK(fdp);
fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
FILEDESC_UNLOCK(fdp);
fdrop(fp, td);
mtx_unlock(&Giant);
return (0);
}
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = IOCPARM_LEN(com);
if (size > IOCPARM_MAX) {
fdrop(fp, td);
mtx_unlock(&Giant);
return (ENOTTY);
}
/* memp stays NULL unless a heap buffer was needed; it is freed on exit. */
memp = NULL;
if (size > sizeof (ubuf.stkbuf)) {
memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
data = memp;
} else {
data = ubuf.stkbuf;
}
if (com&IOC_IN) {
if (size) {
error = copyin(uap->data, data, (u_int)size);
if (error) {
if (memp)
free(memp, M_IOCTLOPS);
fdrop(fp, td);
goto done;
}
} else {
/* Zero-length input: pass the user's argument value itself. */
*(caddr_t *)data = uap->data;
}
} else if ((com&IOC_OUT) && size) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
bzero(data, size);
} else if (com&IOC_VOID) {
/* No parameter block: pass the user's argument value itself. */
*(caddr_t *)data = uap->data;
}
switch (com) {
case FIONBIO:
/* Mirror the setting in f_flag, then pass it on to the file's ioctl method. */
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FNONBLOCK;
else
fp->f_flag &= ~FNONBLOCK;
FILE_UNLOCK(fp);
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
break;
case FIOASYNC:
/* Mirror the setting in f_flag, then pass it on to the file's ioctl method. */
FILE_LOCK(fp);
if ((tmp = *(int *)data))
fp->f_flag |= FASYNC;
else
fp->f_flag &= ~FASYNC;
FILE_UNLOCK(fp);
error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
break;
default:
error = fo_ioctl(fp, com, data, td->td_ucred, td);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (com&IOC_OUT) && size)
error = copyout(data, uap->data, (u_int)size);
break;
}
if (memp)
free(memp, M_IOCTLOPS);
fdrop(fp, td);
/* The copyin failure path jumps here after doing its own cleanup. */
done:
mtx_unlock(&Giant);
return (error);
}
/*
 * Global state shared by select() and poll().
 *
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 * sellock guards the selinfo/thread linkage manipulated by selrecord(),
 * doselwakeup() and clear_selinfo_list(); selwait is the condition
 * variable that kern_select() and poll() sleep on.
 */
struct mtx sellock;
struct cv selwait;
/* Incremented by doselwakeup() whenever it finds SI_COLL set on a selinfo. */
u_int nselcoll; /* Select collisions since boot */
/* Read-only sysctl under the _kern node exposing nselcoll. */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
/*
* Select system call.
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by select().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct select_args {
int nd;			/* number of descriptors to examine */
fd_set *in, *ou, *ex;	/* user-space read/write/exception sets; may be NULL */
struct timeval *tv;	/* user-space timeout; NULL means none */
};
#endif
/*
* MPSAFE
*/
/*
 * select() system call handler.
 *
 * Copies the optional timeout in from user space and forwards
 * everything to kern_select().  A NULL uap->tv is passed through as a
 * NULL timeout pointer.  Returns the copyin() error, if any, or
 * whatever kern_select() returns.
 */
int
select(struct thread *td, struct select_args *uap)
{
	struct timeval tv;
	int error;

	if (uap->tv == NULL)
		return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex,
		    NULL));

	error = copyin(uap->tv, &tv, sizeof(tv));
	if (error != 0)
		return (error);
	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, &tv));
}
/*
 * In-kernel implementation of select().
 *
 * td     - calling thread; td->td_retval[0] receives the ready count
 *          (it is set by selscan()).
 * nd     - number of descriptors to examine; negative values yield
 *          EINVAL and values above the process's fd_nfiles are clamped.
 * fd_in, fd_ou, fd_ex
 *        - user-space descriptor sets, any of which may be NULL.
 * tvp    - kernel-space timeout, or NULL for no timeout.
 *
 * The descriptor sets are copied in, scanned with selscan(), and if
 * nothing is ready the thread sleeps on selwait until a wakeup, a
 * signal or the timeout, rescanning after each wakeup.  On success the
 * result sets are copied back out.  Giant is held for the duration.
 *
 * Returns 0 or an errno value.  ERESTART is converted to EINTR and
 * EWOULDBLOCK to 0 before returning.
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
fd_set *fd_ex, struct timeval *tvp)
{
struct filedesc *fdp;
/*
* The magic 2048 here is chosen to be just enough for FD_SETSIZE
* infds with the new FD_SETSIZE of 1024, and more than enough for
* FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
* of 256.
*/
fd_mask s_selbits[howmany(2048, NFDBITS)];
fd_mask *ibits[3], *obits[3], *selbits, *sbp;
struct timeval atv, rtv, ttv;
int error, timo;
u_int ncoll, nbufbytes, ncpbytes, nfdbits;
if (nd < 0)
return (EINVAL);
fdp = td->td_proc->p_fd;
mtx_lock(&Giant);
FILEDESC_LOCK(fdp);
if (nd > td->td_proc->p_fd->fd_nfiles)
nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
FILEDESC_UNLOCK(fdp);
/*
* Allocate just enough bits for the non-null fd_sets. Use the
* preallocated auto buffer if possible.
*/
nfdbits = roundup(nd, NFDBITS);
ncpbytes = nfdbits / NBBY;
nbufbytes = 0;
/* Each non-NULL set needs an input copy and an output copy. */
if (fd_in != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ou != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ex != NULL)
nbufbytes += 2 * ncpbytes;
if (nbufbytes <= sizeof s_selbits)
selbits = &s_selbits[0];
else
selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
/*
* Assign pointers into the bit buffers and fetch the input bits.
* Put the output buffers together so that they can be bzeroed
* together.
*/
sbp = selbits;
/*
 * getbits(name, x): for a non-NULL user set, point obits[x] at the next
 * slot in the first half of the buffer and ibits[x] at the matching slot
 * in the second half, then copy the user's bits into ibits[x].  A failed
 * copyin() jumps to done_nosellock with error set.
 */
#define getbits(name, x) \
do { \
if (name == NULL) \
ibits[x] = NULL; \
else { \
ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
obits[x] = sbp; \
sbp += ncpbytes / sizeof *sbp; \
error = copyin(name, ibits[x], ncpbytes); \
if (error != 0) \
goto done_nosellock; \
} \
} while (0)
getbits(fd_in, 0);
getbits(fd_ou, 1);
getbits(fd_ex, 2);
#undef getbits
/* Clear the output half of the buffer. */
if (nbufbytes != 0)
bzero(selbits, nbufbytes / 2);
/*
 * Convert a relative timeout into an absolute uptime deadline in atv.
 * An all-zero atv means "no timeout" in the checks below.
 */
if (tvp != NULL) {
atv = *tvp;
if (itimerfix(&atv)) {
error = EINVAL;
goto done_nosellock;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
TAILQ_INIT(&td->td_selq);
mtx_lock(&sellock);
retry:
/*
 * Snapshot the collision counter and mark the thread as selecting
 * before scanning; both are re-checked below to detect events that
 * arrive while sellock is not held.
 */
ncoll = nselcoll;
mtx_lock_spin(&sched_lock);
td->td_flags |= TDF_SELECT;
mtx_unlock_spin(&sched_lock);
mtx_unlock(&sellock);
error = selscan(td, ibits, obits, nd);
mtx_lock(&sellock);
if (error || td->td_retval[0])
goto done;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
goto done;
ttv = atv;
timevalsub(&ttv, &rtv);
/* Cap a single sleep at 24 hours' worth of ticks. */
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
/*
* An event of interest may occur while we do not hold
* sellock, so check TDF_SELECT and the number of
* collisions and rescan the file descriptors if
* necessary.
*/
mtx_lock_spin(&sched_lock);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
mtx_unlock_spin(&sched_lock);
goto retry;
}
mtx_unlock_spin(&sched_lock);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
else
error = cv_wait_sig(&selwait, &sellock);
/* A zero return from the wait means "rescan"; anything else ends the loop. */
if (error == 0)
goto retry;
done:
clear_selinfo_list(td);
mtx_lock_spin(&sched_lock);
td->td_flags &= ~TDF_SELECT;
mtx_unlock_spin(&sched_lock);
mtx_unlock(&sellock);
done_nosellock:
/* select is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
/*
 * putbits(name, x): copy result set x back to the user's set if one was
 * supplied; a copyout() failure replaces error.
 */
#define putbits(name, x) \
if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
error = error2;
if (error == 0) {
int error2;
putbits(fd_in, 0);
putbits(fd_ou, 1);
putbits(fd_ex, 2);
#undef putbits
}
if (selbits != &s_selbits[0])
free(selbits, M_SELECT);
mtx_unlock(&Giant);
return (error);
}
/*
 * Make one pass over the descriptors named in the select input bitmaps.
 *
 * For each of the three sets (index 0, 1, 2 polled with POLLRDNORM,
 * POLLWRNORM, POLLRDBAND respectively) every set bit below nfd is
 * looked up with fget_locked() and polled with fo_poll(); descriptors
 * reporting a non-zero result get their bit set in the matching output
 * bitmap.  A NULL input bitmap is skipped.  The descriptor table lock
 * is held for the whole scan.
 *
 * Returns EBADF as soon as a selected descriptor has no file; otherwise
 * stores the number of ready bits in td->td_retval[0] and returns 0.
 */
static int
selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
{
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp;
	struct file *fp;
	fd_mask pending;
	int set, base, fd, nready;

	fdp = td->td_proc->p_fd;
	nready = 0;

	FILEDESC_LOCK(fdp);
	for (set = 0; set < 3; set++) {
		if (ibits[set] == NULL)
			continue;
		for (base = 0; base < nfd; base += NFDBITS) {
			pending = ibits[set][base / NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = base; pending != 0 && fd < nfd;
			    fd++, pending >>= 1) {
				if ((pending & 1) == 0)
					continue;
				fp = fget_locked(fdp, fd);
				if (fp == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[set], td->td_ucred,
				    td) == 0)
					continue;
				obits[set][fd / NFDBITS] |=
				    ((fd_mask)1 << (fd % NFDBITS));
				nready++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);

	td->td_retval[0] = nready;
	return (0);
}
/*
* Poll system call.
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by poll().  Only declared here when
 * _SYS_SYSPROTO_H_ is undefined (presumably the include guard of
 * <sys/sysproto.h>).
 */
struct poll_args {
struct pollfd *fds;	/* user-space array of pollfd entries */
u_int nfds;		/* number of entries in fds */
int timeout;		/* timeout in milliseconds, or INFTIM for none */
};
#endif
/*
* MPSAFE
*/
/*
 * poll() system call handler.
 *
 * Copies the pollfd array in from user space (into a stack buffer when
 * it holds at most 32 entries, otherwise into an M_TEMP allocation),
 * scans it with pollscan(), and if nothing is ready sleeps on selwait
 * until a wakeup, a signal or the timeout, rescanning after each
 * wakeup.  On success the array, with revents filled in, is copied back
 * out.  Giant is held for the duration.
 *
 * Returns 0 or an errno value; the ready count is left in
 * td->td_retval[0] by pollscan().  EINVAL is returned when nfds exceeds
 * both the RLIMIT_NOFILE soft limit and FD_SETSIZE, or when the timeout
 * is rejected by itimerfix().  ERESTART is converted to EINTR and
 * EWOULDBLOCK to 0 before returning.
 */
int
poll(td, uap)
struct thread *td;
struct poll_args *uap;
{
caddr_t bits;
char smallbits[32 * sizeof(struct pollfd)];
struct timeval atv, rtv, ttv;
int error = 0, timo;
u_int ncoll, nfds;
size_t ni;
nfds = uap->nfds;
mtx_lock(&Giant);
/*
* This is kinda bogus. We have fd limits, but that is not
* really related to the size of the pollfd array. Make sure
* we let the process use at least FD_SETSIZE entries and at
* least enough for the current limits. We want to be reasonably
* safe, but not overly restrictive.
*/
PROC_LOCK(td->td_proc);
if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
(nfds > FD_SETSIZE)) {
PROC_UNLOCK(td->td_proc);
error = EINVAL;
/* Nothing has been allocated yet, so skip the buffer cleanup. */
goto done2;
}
PROC_UNLOCK(td->td_proc);
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallbits))
bits = malloc(ni, M_TEMP, M_WAITOK);
else
bits = smallbits;
error = copyin(uap->fds, bits, ni);
if (error)
goto done_nosellock;
/*
 * Convert the millisecond timeout into an absolute uptime deadline in
 * atv.  An all-zero atv means "no timeout" in the checks below.
 */
if (uap->timeout != INFTIM) {
atv.tv_sec = uap->timeout / 1000;
atv.tv_usec = (uap->timeout % 1000) * 1000;
if (itimerfix(&atv)) {
error = EINVAL;
goto done_nosellock;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
TAILQ_INIT(&td->td_selq);
mtx_lock(&sellock);
retry:
/*
 * Snapshot the collision counter and mark the thread as selecting
 * before scanning; both are re-checked below to detect events that
 * arrive while sellock is not held.
 */
ncoll = nselcoll;
mtx_lock_spin(&sched_lock);
td->td_flags |= TDF_SELECT;
mtx_unlock_spin(&sched_lock);
mtx_unlock(&sellock);
error = pollscan(td, (struct pollfd *)bits, nfds);
mtx_lock(&sellock);
if (error || td->td_retval[0])
goto done;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
goto done;
ttv = atv;
timevalsub(&ttv, &rtv);
/* Cap a single sleep at 24 hours' worth of ticks. */
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
/*
* An event of interest may occur while we do not hold
* sellock, so check TDF_SELECT and the number of collisions
* and rescan the file descriptors if necessary.
*/
mtx_lock_spin(&sched_lock);
if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
mtx_unlock_spin(&sched_lock);
goto retry;
}
mtx_unlock_spin(&sched_lock);
if (timo > 0)
error = cv_timedwait_sig(&selwait, &sellock, timo);
else
error = cv_wait_sig(&selwait, &sellock);
/* A zero return from the wait means "rescan"; anything else ends the loop. */
if (error == 0)
goto retry;
done:
clear_selinfo_list(td);
mtx_lock_spin(&sched_lock);
td->td_flags &= ~TDF_SELECT;
mtx_unlock_spin(&sched_lock);
mtx_unlock(&sellock);
done_nosellock:
/* poll is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
/* Hand the revents results back to the caller on success only. */
if (error == 0) {
error = copyout(bits, uap->fds, ni);
if (error)
goto out;
}
out:
if (ni > sizeof(smallbits))
free(bits, M_TEMP);
done2:
mtx_unlock(&Giant);
return (error);
}
/*
 * Make one pass over a kernel copy of the caller's pollfd array.
 *
 * For each entry:
 *   - fd at or beyond the table size, or no file open at that slot:
 *     revents = POLLNVAL, counted as ready;
 *   - negative fd: revents = 0, not counted;
 *   - otherwise revents = fo_poll() result, counted when non-zero.
 * The descriptor table lock is held for the whole scan.
 *
 * Stores the number of counted entries in td->td_retval[0] and always
 * returns 0.
 */
static int
pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
{
	struct filedesc *fdp;
	struct pollfd *pfd;
	struct file *fp;
	u_int idx;
	int nready;

	fdp = td->td_proc->p_fd;
	nready = 0;

	FILEDESC_LOCK(fdp);
	for (idx = 0; idx < nfd; idx++) {
		pfd = &fds[idx];
		if (pfd->fd >= fdp->fd_nfiles) {
			pfd->revents = POLLNVAL;
			nready++;
			continue;
		}
		if (pfd->fd < 0) {
			pfd->revents = 0;
			continue;
		}
		fp = fdp->fd_ofiles[pfd->fd];
		if (fp == NULL) {
			pfd->revents = POLLNVAL;
			nready++;
			continue;
		}
		/*
		 * Note: backend also returns POLLHUP and
		 * POLLERR if appropriate.
		 */
		pfd->revents = fo_poll(fp, pfd->events, td->td_ucred, td);
		if (pfd->revents != 0)
			nready++;
	}
	FILEDESC_UNLOCK(fdp);

	td->td_retval[0] = nready;
	return (0);
}
/*
* OpenBSD poll system call.
* XXX this isn't quite a true representation.. OpenBSD uses select ops.
*/
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments consumed by openbsd_poll().  The members mirror struct
 * poll_args, which openbsd_poll() relies on when it casts its argument
 * pointer.  Only declared here when _SYS_SYSPROTO_H_ is undefined
 * (presumably the include guard of <sys/sysproto.h>).
 */
struct openbsd_poll_args {
struct pollfd *fds;	/* user-space array of pollfd entries */
u_int nfds;		/* number of entries in fds */
int timeout;		/* timeout in milliseconds, or INFTIM for none */
};
#endif
/*
* MPSAFE
*/
/*
 * OpenBSD-compatible poll entry point.  The argument structure has the
 * same members as struct poll_args, so it is reinterpreted as one and
 * the call is forwarded to poll() unchanged.
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{
	struct poll_args *pargs;

	pargs = (struct poll_args *)uap;
	return (poll(td, pargs));
}
/*
* Remove the references to the thread from all of the objects
* we were polling.
*
* This code assumes that the underlying owner of the selinfo
* structure will hold sellock before it changes it, and that
* it will unlink itself from our list if it goes away.
*/
void
clear_selinfo_list(td)
struct thread *td;
{
struct selinfo *si;
mtx_assert(&sellock, MA_OWNED);
TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
si->si_thread = NULL;
TAILQ_INIT(&td->td_selq);
}
/*
* Record a select request.
*/
/*
 * Record that "selector" is interested in events on "sip".
 *
 * Under sellock, one of three things happens:
 *   - the selinfo has no thread recorded: the selector takes ownership
 *     and the selinfo is appended to the selector's td_selq;
 *   - another thread is already recorded: SI_COLL is set on the selinfo
 *     to flag the collision;
 *   - the selector itself is already recorded: nothing changes, since
 *     the selinfo already points at us and is already on our list.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct thread *owner;

	mtx_lock(&sellock);
	owner = sip->si_thread;
	if (owner == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (owner != selector) {
		sip->si_flags |= SI_COLL;
	}
	mtx_unlock(&sellock);
}
/* Wake up a selecting thread. */
/*
 * Wake up the thread selecting on "sip" without requesting a priority
 * change: doselwakeup() is called with a priority argument of -1.
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
/* Wake up a selecting thread, and set its priority. */
/*
 * Wake up the thread selecting on "sip", passing "pri" through to
 * doselwakeup() as the priority to apply to the woken thread.
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
/*
* Do a wakeup when a selectable event occurs.
*/
/*
 * Common body of selwakeup() and selwakeuppri().
 *
 * sip - selinfo on which an event occurred.
 * pri - priority to give the woken thread; it is only applied when it
 *       lies within [PRI_MIN, PRI_MAX] and is numerically lower than
 *       the thread's current td_priority.  selwakeup() passes -1
 *       (presumably outside that range, i.e. "leave priority alone").
 *
 * Runs entirely under sellock.
 */
static void
doselwakeup(sip, pri)
struct selinfo *sip;
int pri;
{
struct thread *td;
mtx_lock(&sellock);
td = sip->si_thread;
/*
 * A recorded collision means more than one thread selected on this
 * object: count it, clear the flag and wake every waiter on selwait.
 */
if ((sip->si_flags & SI_COLL) != 0) {
nselcoll++;
sip->si_flags &= ~SI_COLL;
cv_broadcastpri(&selwait, pri);
}
/* No thread is recorded on this selinfo, so there is nobody to wake. */
if (td == NULL) {
mtx_unlock(&sellock);
return;
}
/* Unlink the selinfo from the recorded thread. */
TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
sip->si_thread = NULL;
mtx_lock_spin(&sched_lock);
if (td->td_wchan == &selwait) {
/* The thread is sleeping on selwait: make it runnable. */
cv_waitq_remove(td);
TD_CLR_SLEEPING(td);
if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
td->td_priority = pri;
setrunnable(td);
} else
/*
 * The thread is not asleep on selwait.  Clearing TDF_SELECT makes the
 * check in kern_select()/poll() fail so that they rescan.
 */
td->td_flags &= ~TDF_SELECT;
mtx_unlock_spin(&sched_lock);
mtx_unlock(&sellock);
}
static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
/* ARGSUSED*/
static void
selectinit(dummy)
void *dummy;
{
cv_init(&selwait, "select");
mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}