2000-04-16 19:02:08 +00:00
|
|
|
/*-
|
2001-02-24 01:44:03 +00:00
|
|
|
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
|
2004-08-15 06:24:42 +00:00
|
|
|
* Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
|
2009-09-16 03:15:57 +00:00
|
|
|
* Copyright (c) 2009 Apple, Inc.
|
2000-04-16 19:02:08 +00:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2003-06-11 00:56:59 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2006-09-24 02:23:29 +00:00
|
|
|
#include "opt_ktrace.h"
|
2014-01-07 01:17:27 +00:00
|
|
|
#include "opt_kqueue.h"
|
2006-09-24 02:23:29 +00:00
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/kernel.h>
|
2001-05-01 08:13:21 +00:00
|
|
|
#include <sys/lock.h>
|
|
|
|
#include <sys/mutex.h>
|
2012-03-26 09:34:17 +00:00
|
|
|
#include <sys/rwlock.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/proc.h>
|
2004-07-14 07:02:03 +00:00
|
|
|
#include <sys/malloc.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/unistd.h>
|
|
|
|
#include <sys/file.h>
|
2003-01-01 01:56:19 +00:00
|
|
|
#include <sys/filedesc.h>
|
2004-07-14 07:02:03 +00:00
|
|
|
#include <sys/filio.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/fcntl.h>
|
2004-08-15 06:24:42 +00:00
|
|
|
#include <sys/kthread.h>
|
2001-01-09 04:33:49 +00:00
|
|
|
#include <sys/selinfo.h>
|
2013-06-16 09:30:35 +00:00
|
|
|
#include <sys/stdatomic.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/queue.h>
|
|
|
|
#include <sys/event.h>
|
|
|
|
#include <sys/eventvar.h>
|
|
|
|
#include <sys/poll.h>
|
|
|
|
#include <sys/protosw.h>
|
2013-10-21 16:44:53 +00:00
|
|
|
#include <sys/resourcevar.h>
|
2004-07-14 07:02:03 +00:00
|
|
|
#include <sys/sigio.h>
|
|
|
|
#include <sys/signalvar.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
|
|
|
#include <sys/stat.h>
|
2001-09-29 17:48:39 +00:00
|
|
|
#include <sys/sysctl.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/sysproto.h>
|
2005-03-01 17:45:55 +00:00
|
|
|
#include <sys/syscallsubr.h>
|
2004-08-15 06:24:42 +00:00
|
|
|
#include <sys/taskqueue.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
#include <sys/uio.h>
|
2014-09-22 16:20:47 +00:00
|
|
|
#include <sys/user.h>
|
2006-09-24 02:23:29 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
#include <sys/ktrace.h>
|
|
|
|
#endif
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2002-03-20 04:09:59 +00:00
|
|
|
#include <vm/uma.h>
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2005-02-10 12:02:37 +00:00
|
|
|
static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
|
|
|
|
* This lock is used if multiple kq locks are required. This possibly
|
|
|
|
* should be made into a per proc lock.
|
|
|
|
*/
|
|
|
|
static struct mtx kq_global;
|
|
|
|
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
|
|
|
|
/*
 * Take the lock 'lck' unless the caller's tracking flag 'haslck' says it
 * is already held, then record it as held.  'haslck' must be an lvalue.
 */
#define KQ_GLOBAL_LOCK(lck, haslck)	do {				\
	if (!(haslck)) {						\
		mtx_lock((lck));					\
	}								\
	(haslck) = 1;							\
} while (0)
|
|
|
|
/*
 * Drop the lock 'lck' if the caller's tracking flag 'haslck' says it is
 * held, then record it as not held.  'haslck' must be an lvalue.
 */
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {				\
	if ((haslck)) {							\
		mtx_unlock((lck));					\
	}								\
	(haslck) = 0;							\
} while (0)
|
|
|
|
|
2016-05-24 21:13:33 +00:00
|
|
|
TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2005-06-03 23:15:01 +00:00
|
|
|
static int kevent_copyout(void *arg, struct kevent *kevp, int count);
|
|
|
|
static int kevent_copyin(void *arg, struct kevent *kevp, int count);
|
2006-09-24 04:47:47 +00:00
|
|
|
static int kqueue_register(struct kqueue *kq, struct kevent *kev,
|
|
|
|
struct thread *td, int waitok);
|
2007-05-27 19:24:00 +00:00
|
|
|
static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
|
2004-08-15 06:24:42 +00:00
|
|
|
static void kqueue_release(struct kqueue *kq, int locked);
|
2015-08-11 13:47:23 +00:00
|
|
|
static void kqueue_destroy(struct kqueue *kq);
|
|
|
|
static void kqueue_drain(struct kqueue *kq, struct thread *td);
|
2004-08-15 06:24:42 +00:00
|
|
|
static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
|
|
|
|
uintptr_t ident, int waitok);
|
|
|
|
static void kqueue_task(void *arg, int pending);
|
|
|
|
static int kqueue_scan(struct kqueue *kq, int maxevents,
|
2005-06-03 23:15:01 +00:00
|
|
|
struct kevent_copyops *k_ops,
|
|
|
|
const struct timespec *timeout,
|
|
|
|
struct kevent *keva, struct thread *td);
|
2000-04-16 19:02:08 +00:00
|
|
|
static void kqueue_wakeup(struct kqueue *kq);
|
2004-08-15 06:24:42 +00:00
|
|
|
static struct filterops *kqueue_fo_find(int filt);
|
|
|
|
static void kqueue_fo_release(int filt);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2013-09-18 18:48:33 +00:00
|
|
|
static fo_ioctl_t kqueue_ioctl;
|
|
|
|
static fo_poll_t kqueue_poll;
|
|
|
|
static fo_kqfilter_t kqueue_kqfilter;
|
|
|
|
static fo_stat_t kqueue_stat;
|
|
|
|
static fo_close_t kqueue_close;
|
2014-09-22 16:20:47 +00:00
|
|
|
static fo_fill_kinfo_t kqueue_fill_kinfo;
|
2013-09-18 18:48:33 +00:00
|
|
|
|
|
|
|
static struct fileops kqueueops = {
|
2014-09-12 21:29:10 +00:00
|
|
|
.fo_read = invfo_rdwr,
|
|
|
|
.fo_write = invfo_rdwr,
|
|
|
|
.fo_truncate = invfo_truncate,
|
2003-06-18 18:16:40 +00:00
|
|
|
.fo_ioctl = kqueue_ioctl,
|
|
|
|
.fo_poll = kqueue_poll,
|
|
|
|
.fo_kqfilter = kqueue_kqfilter,
|
|
|
|
.fo_stat = kqueue_stat,
|
|
|
|
.fo_close = kqueue_close,
|
2011-08-16 20:07:47 +00:00
|
|
|
.fo_chmod = invfo_chmod,
|
|
|
|
.fo_chown = invfo_chown,
|
2013-08-15 07:54:31 +00:00
|
|
|
.fo_sendfile = invfo_sendfile,
|
2014-09-22 16:20:47 +00:00
|
|
|
.fo_fill_kinfo = kqueue_fill_kinfo,
|
2001-02-15 16:34:11 +00:00
|
|
|
};
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
static int knote_attach(struct knote *kn, struct kqueue *kq);
|
2001-09-12 08:38:13 +00:00
|
|
|
static void knote_drop(struct knote *kn, struct thread *td);
|
2000-04-16 19:02:08 +00:00
|
|
|
static void knote_enqueue(struct knote *kn);
|
|
|
|
static void knote_dequeue(struct knote *kn);
|
|
|
|
static void knote_init(void);
|
2004-08-15 06:24:42 +00:00
|
|
|
static struct knote *knote_alloc(int waitok);
|
2000-04-16 19:02:08 +00:00
|
|
|
static void knote_free(struct knote *kn);
|
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
static void filt_kqdetach(struct knote *kn);
|
|
|
|
static int filt_kqueue(struct knote *kn, long hint);
|
|
|
|
static int filt_procattach(struct knote *kn);
|
|
|
|
static void filt_procdetach(struct knote *kn);
|
|
|
|
static int filt_proc(struct knote *kn, long hint);
|
|
|
|
static int filt_fileattach(struct knote *kn);
|
2001-07-19 18:34:40 +00:00
|
|
|
static void filt_timerexpire(void *knx);
|
|
|
|
static int filt_timerattach(struct knote *kn);
|
|
|
|
static void filt_timerdetach(struct knote *kn);
|
|
|
|
static int filt_timer(struct knote *kn, long hint);
|
2009-09-16 03:30:12 +00:00
|
|
|
static int filt_userattach(struct knote *kn);
|
|
|
|
static void filt_userdetach(struct knote *kn);
|
|
|
|
static int filt_user(struct knote *kn, long hint);
|
2009-09-17 17:48:13 +00:00
|
|
|
static void filt_usertouch(struct knote *kn, struct kevent *kev,
|
2009-09-22 16:16:02 +00:00
|
|
|
u_long type);
|
2001-02-15 16:34:11 +00:00
|
|
|
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops file_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_attach = filt_fileattach,
|
|
|
|
};
|
|
|
|
static struct filterops kqread_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_kqdetach,
|
|
|
|
.f_event = filt_kqueue,
|
|
|
|
};
|
2004-08-15 06:24:42 +00:00
|
|
|
/* XXX - move to kern_proc.c? */
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops proc_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_procattach,
|
|
|
|
.f_detach = filt_procdetach,
|
|
|
|
.f_event = filt_proc,
|
|
|
|
};
|
|
|
|
static struct filterops timer_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_timerattach,
|
|
|
|
.f_detach = filt_timerdetach,
|
|
|
|
.f_event = filt_timer,
|
|
|
|
};
|
2009-09-16 03:30:12 +00:00
|
|
|
static struct filterops user_filtops = {
|
|
|
|
.f_attach = filt_userattach,
|
|
|
|
.f_detach = filt_userdetach,
|
|
|
|
.f_event = filt_user,
|
|
|
|
.f_touch = filt_usertouch,
|
|
|
|
};
|
2001-02-15 16:34:11 +00:00
|
|
|
|
2002-03-20 04:09:59 +00:00
|
|
|
static uma_zone_t knote_zone;
|
2013-06-16 09:30:35 +00:00
|
|
|
static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
|
|
|
|
static unsigned int kq_calloutmax = 4 * 1024;
|
|
|
|
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
|
2001-09-29 17:48:39 +00:00
|
|
|
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
 * Set KN_ACTIVE on a knote and, unless it is already queued or is
 * disabled, hand it to knote_enqueue().
 *
 * With 'islock' non-zero the caller must already own the knote's kq lock
 * (this is asserted); otherwise the kq lock is taken and released around
 * the update.
 *
 * XXX - ensure not KN_INFLUX??
 */
#define KNOTE_ACTIVATE(kn, islock) do {					\
	if (!(islock))							\
		KQ_LOCK((kn)->kn_kq);					\
	else								\
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue((kn));					\
	if (!(islock))							\
		KQ_UNLOCK((kn)->kn_kq);					\
} while (0)
|
2004-08-15 06:24:42 +00:00
|
|
|
/* Acquire the per-kqueue mutex. */
#define KQ_LOCK(kq)		do {					\
	mtx_lock(&(kq)->kq_lock);					\
} while (0)

/*
 * If KQ_FLUXWAIT is set in the kqueue state, clear it and wakeup() any
 * thread sleeping on the kqueue.
 */
#define KQ_FLUX_WAKEUP(kq)	do {					\
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
		wakeup((kq));						\
	}								\
} while (0)

/* Perform KQ_FLUX_WAKEUP() and then release the per-kqueue mutex. */
#define KQ_UNLOCK_FLUX(kq)	do {					\
	KQ_FLUX_WAKEUP(kq);						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)

/* Release the per-kqueue mutex. */
#define KQ_UNLOCK(kq)		do {					\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)

/* Assert that the current thread owns the per-kqueue mutex. */
#define KQ_OWNED(kq)		do {					\
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
} while (0)

/* Assert that the current thread does not own the per-kqueue mutex. */
#define KQ_NOTOWNED(kq)		do {					\
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
} while (0)
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not account for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
|
|
|
|
static struct knlist *
|
|
|
|
kn_list_lock(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct knlist *knl;
|
|
|
|
|
|
|
|
knl = kn->kn_knlist;
|
|
|
|
if (knl != NULL)
|
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
|
|
|
return (knl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kn_list_unlock(struct knlist *knl)
|
|
|
|
{
|
|
|
|
bool do_free;
|
|
|
|
|
|
|
|
if (knl == NULL)
|
|
|
|
return;
|
|
|
|
do_free = knl->kl_autodestroy && knlist_empty(knl);
|
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
|
|
|
if (do_free) {
|
|
|
|
knlist_destroy(knl);
|
|
|
|
free(knl, M_KQUEUE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
/*
 * Assert the lock state of a knlist: locked when 'islocked' is non-zero,
 * unlocked otherwise.
 */
#define KNL_ASSERT_LOCK(knl, islocked) do {				\
	if (!(islocked))						\
		KNL_ASSERT_UNLOCKED(knl);				\
	else								\
		KNL_ASSERT_LOCKED(knl);					\
} while (0)
|
2005-07-01 16:28:32 +00:00
|
|
|
/*
 * Knlist lock-state assertions.  With INVARIANTS they call the list's
 * kl_assert_locked / kl_assert_unlocked callbacks with kl_lockarg;
 * without INVARIANTS they expand to empty statements.
 *
 * Every use of the macro parameter is parenthesised so that an argument
 * such as &sel->si_note expands correctly.
 */
#ifdef INVARIANTS
#define KNL_ASSERT_LOCKED(knl) do {					\
	(knl)->kl_assert_locked((knl)->kl_lockarg);			\
} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do {					\
	(knl)->kl_assert_unlocked((knl)->kl_lockarg);			\
} while (0)
#else /* !INVARIANTS */
#define KNL_ASSERT_LOCKED(knl) do {} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2014-01-07 01:17:27 +00:00
|
|
|
/* Default hash size; a build may predefine KN_HASHSIZE to override it. */
#ifndef KN_HASHSIZE
#define KN_HASHSIZE		64		/* XXX should be tunable */
#endif

/*
 * Fold 'val' with itself shifted right by 8 bits and reduce with 'mask'.
 * Both uses of 'val' are parenthesised so that arguments containing
 * low-precedence operators (e.g. a | b) hash the whole expression.
 */
#define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
|
|
|
|
|
2001-12-29 07:13:47 +00:00
|
|
|
/*
 * Attach routine for filters that have no implementation: the knote is
 * not examined and the attach is always refused.
 *
 * Returns ENXIO unconditionally.
 */
static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}
|
|
|
|
|
2009-09-12 20:03:45 +00:00
|
|
|
struct filterops null_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_nullattach,
|
|
|
|
};
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/* XXX - make SYSINIT to add these, and move into respective modules. */
|
2000-04-16 19:02:08 +00:00
|
|
|
extern struct filterops sig_filtops;
|
2004-07-04 10:52:54 +00:00
|
|
|
extern struct filterops fs_filtops;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
/*
|
2001-02-15 16:34:11 +00:00
|
|
|
* Table for all system-defined filters.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
2004-08-15 06:24:42 +00:00
|
|
|
static struct mtx filterops_lock;
|
|
|
|
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
|
|
|
|
MTX_DEF);
|
|
|
|
static struct {
|
|
|
|
struct filterops *for_fop;
|
2014-11-16 01:18:41 +00:00
|
|
|
int for_nolock;
|
2004-08-15 06:24:42 +00:00
|
|
|
int for_refcnt;
|
|
|
|
} sysfilt_ops[EVFILT_SYSCOUNT] = {
|
2014-11-16 01:18:41 +00:00
|
|
|
{ &file_filtops, 1 }, /* EVFILT_READ */
|
|
|
|
{ &file_filtops, 1 }, /* EVFILT_WRITE */
|
2004-08-15 06:24:42 +00:00
|
|
|
{ &null_filtops }, /* EVFILT_AIO */
|
2014-11-16 01:18:41 +00:00
|
|
|
{ &file_filtops, 1 }, /* EVFILT_VNODE */
|
|
|
|
{ &proc_filtops, 1 }, /* EVFILT_PROC */
|
|
|
|
{ &sig_filtops, 1 }, /* EVFILT_SIGNAL */
|
|
|
|
{ &timer_filtops, 1 }, /* EVFILT_TIMER */
|
|
|
|
{ &file_filtops, 1 }, /* EVFILT_PROCDESC */
|
|
|
|
{ &fs_filtops, 1 }, /* EVFILT_FS */
|
2005-10-12 17:51:31 +00:00
|
|
|
{ &null_filtops }, /* EVFILT_LIO */
|
2014-11-16 01:18:41 +00:00
|
|
|
{ &user_filtops, 1 }, /* EVFILT_USER */
|
2014-01-17 05:15:44 +00:00
|
|
|
{ &null_filtops }, /* EVFILT_SENDFILE */
|
2000-04-16 19:02:08 +00:00
|
|
|
};
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
|
|
|
|
* Simple redirection for all cdevsw style objects to call their fo_kqfilter
|
|
|
|
* method.
|
|
|
|
*/
|
2000-04-16 19:02:08 +00:00
|
|
|
static int
|
2001-02-15 16:34:11 +00:00
|
|
|
filt_fileattach(struct knote *kn)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-08-13 07:38:58 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
return (fo_kqfilter(kn->kn_fp, kn));
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
/*ARGSUSED*/
|
2013-09-18 18:48:33 +00:00
|
|
|
static int
|
2001-02-15 16:34:11 +00:00
|
|
|
kqueue_kqfilter(struct file *fp, struct knote *kn)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2003-01-13 00:33:17 +00:00
|
|
|
struct kqueue *kq = kn->kn_fp->f_data;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2001-02-15 16:34:11 +00:00
|
|
|
if (kn->kn_filter != EVFILT_READ)
|
2004-08-15 06:24:42 +00:00
|
|
|
return (EINVAL);
|
2001-02-15 16:34:11 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status |= KN_KQUEUE;
|
2001-02-15 16:34:11 +00:00
|
|
|
kn->kn_fop = &kqread_filtops;
|
2004-08-15 06:24:42 +00:00
|
|
|
knlist_add(&kq->kq_sel.si_note, kn, 0);
|
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_kqdetach(struct knote *kn)
|
|
|
|
{
|
2003-01-13 00:33:17 +00:00
|
|
|
struct kqueue *kq = kn->kn_fp->f_data;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
knlist_remove(&kq->kq_sel.si_note, kn, 0);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_kqueue(struct knote *kn, long hint)
|
|
|
|
{
|
2003-01-13 00:33:17 +00:00
|
|
|
struct kqueue *kq = kn->kn_fp->f_data;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
kn->kn_data = kq->kq_count;
|
|
|
|
return (kn->kn_data > 0);
|
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/* XXX - move to kern_proc.c? */
|
2000-04-16 19:02:08 +00:00
|
|
|
static int
|
|
|
|
filt_procattach(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct proc *p;
|
2001-04-12 21:32:02 +00:00
|
|
|
int error;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
bool exiting, immediate;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
exiting = immediate = false;
|
2000-04-16 19:02:08 +00:00
|
|
|
p = pfind(kn->kn_id);
|
2003-04-12 01:57:04 +00:00
|
|
|
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
|
|
|
|
p = zpfind(kn->kn_id);
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
exiting = true;
|
2004-08-15 06:24:42 +00:00
|
|
|
} else if (p != NULL && (p->p_flag & P_WEXIT)) {
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
exiting = true;
|
2003-04-12 01:57:04 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2003-11-04 01:41:47 +00:00
|
|
|
if (p == NULL)
|
|
|
|
return (ESRCH);
|
2010-02-14 13:59:01 +00:00
|
|
|
if ((error = p_cansee(curthread, p))) {
|
|
|
|
PROC_UNLOCK(p);
|
2001-04-12 21:32:02 +00:00
|
|
|
return (error);
|
2010-02-14 13:59:01 +00:00
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
kn->kn_ptr.p_proc = p;
|
|
|
|
kn->kn_flags |= EV_CLEAR; /* automatically set */
|
|
|
|
|
|
|
|
/*
|
2016-01-28 20:24:15 +00:00
|
|
|
* Internal flag indicating registration done by kernel for the
|
|
|
|
* purposes of getting a NOTE_CHILD notification.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
2016-01-28 20:24:15 +00:00
|
|
|
if (kn->kn_flags & EV_FLAG2) {
|
|
|
|
kn->kn_flags &= ~EV_FLAG2;
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_data = kn->kn_sdata; /* ppid */
|
|
|
|
kn->kn_fflags = NOTE_CHILD;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
|
|
|
|
immediate = true; /* Force immediate activation of child note. */
|
2016-01-28 20:24:15 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Internal flag indicating registration done by kernel (for other than
|
|
|
|
* NOTE_CHILD).
|
|
|
|
*/
|
|
|
|
if (kn->kn_flags & EV_FLAG1) {
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_flags &= ~EV_FLAG1;
|
|
|
|
}
|
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knlist_add(p->p_klist, kn, 1);
|
2003-04-12 01:57:04 +00:00
|
|
|
|
|
|
|
/*
|
2016-01-28 20:24:15 +00:00
|
|
|
* Immediately activate any child notes or, in the case of a zombie
|
|
|
|
* target process, exit notes. The latter is necessary to handle the
|
|
|
|
* case where the target process, e.g. a child, dies before the kevent
|
|
|
|
* is registered.
|
2003-04-12 01:57:04 +00:00
|
|
|
*/
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate NOTE_CHILD activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not account for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
|
2004-08-15 06:24:42 +00:00
|
|
|
KNOTE_ACTIVATE(kn, 0);
|
2003-04-12 01:57:04 +00:00
|
|
|
|
2001-01-24 00:35:12 +00:00
|
|
|
PROC_UNLOCK(p);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The knote may be attached to a different process, which may exit,
|
|
|
|
* leaving nothing for the knote to be attached to. So when the process
|
|
|
|
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
|
|
|
|
* it will be deleted when read out. However, as part of the knote deletion,
|
|
|
|
* this routine is called, so a check is needed to avoid actually performing
|
|
|
|
* a detach, because the original process does not exist any more.
|
|
|
|
*/
|
2004-08-15 06:24:42 +00:00
|
|
|
/* XXX - move to kern_proc.c? */
|
2000-04-16 19:02:08 +00:00
|
|
|
static void
|
|
|
|
filt_procdetach(struct knote *kn)
|
|
|
|
{
|
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knlist_remove(kn->kn_knlist, kn, 0);
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_ptr.p_proc = NULL;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/* XXX - move to kern_proc.c? */
|
2000-04-16 19:02:08 +00:00
|
|
|
static int
|
|
|
|
filt_proc(struct knote *kn, long hint)
|
|
|
|
{
|
2014-04-07 18:10:49 +00:00
|
|
|
struct proc *p;
|
2000-04-16 19:02:08 +00:00
|
|
|
u_int event;
|
|
|
|
|
2014-04-07 18:10:49 +00:00
|
|
|
p = kn->kn_ptr.p_proc;
|
|
|
|
/* Mask off extra data. */
|
2000-04-16 19:02:08 +00:00
|
|
|
event = (u_int)hint & NOTE_PCTRLMASK;
|
|
|
|
|
2014-04-07 18:10:49 +00:00
|
|
|
/* If the user is interested in this event, record it. */
|
2000-04-16 19:02:08 +00:00
|
|
|
if (kn->kn_sfflags & event)
|
|
|
|
kn->kn_fflags |= event;
|
|
|
|
|
2014-04-07 18:10:49 +00:00
|
|
|
/* Process is gone, so flag the event as finished. */
|
2000-04-16 19:02:08 +00:00
|
|
|
if (event == NOTE_EXIT) {
|
2014-04-07 18:10:49 +00:00
|
|
|
kn->kn_flags |= EV_EOF | EV_ONESHOT;
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_ptr.p_proc = NULL;
|
2013-08-07 19:56:35 +00:00
|
|
|
if (kn->kn_fflags & NOTE_EXIT)
|
2015-07-18 09:02:50 +00:00
|
|
|
kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
|
2013-08-07 19:56:35 +00:00
|
|
|
if (kn->kn_fflags == 0)
|
|
|
|
kn->kn_flags |= EV_DROP;
|
2000-04-16 19:02:08 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
2008-07-07 09:30:11 +00:00
|
|
|
return (kn->kn_fflags != 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called when the process forked. It mostly does the same as the
|
|
|
|
* knote(), activating all knotes registered to be activated when the
|
|
|
|
* process forked. Additionally, for each knote attached to the
|
|
|
|
* parent, check whether user wants to track the new process. If so
|
|
|
|
* attach a new knote to it, and immediately report an event with the
|
|
|
|
* child's pid.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
knote_fork(struct knlist *list, int pid)
|
|
|
|
{
|
|
|
|
struct kqueue *kq;
|
|
|
|
struct knote *kn;
|
|
|
|
struct kevent kev;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (list == NULL)
|
|
|
|
return;
|
|
|
|
list->kl_lock(list->kl_lockarg);
|
|
|
|
|
|
|
|
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
|
|
|
|
kq = kn->kn_kq;
|
|
|
|
KQ_LOCK(kq);
|
2014-04-05 14:09:16 +00:00
|
|
|
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
|
2008-07-07 09:30:11 +00:00
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
continue;
|
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
/*
|
2008-07-07 09:30:11 +00:00
|
|
|
* The same as knote(), activate the event.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
2008-07-07 09:30:11 +00:00
|
|
|
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
|
|
|
|
kn->kn_status |= KN_HASKQLOCK;
|
2013-08-13 18:45:58 +00:00
|
|
|
if (kn->kn_fop->f_event(kn, NOTE_FORK))
|
2008-07-07 09:30:11 +00:00
|
|
|
KNOTE_ACTIVATE(kn, 1);
|
|
|
|
kn->kn_status &= ~KN_HASKQLOCK;
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The NOTE_TRACK case. In addition to the activation
|
2016-01-28 20:24:15 +00:00
|
|
|
* of the event, we need to register new events to
|
2008-07-07 09:30:11 +00:00
|
|
|
* track the child. Drop the locks in preparation for
|
|
|
|
* the call to kqueue_register().
|
|
|
|
*/
|
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
list->kl_unlock(list->kl_lockarg);
|
|
|
|
|
|
|
|
/*
|
2016-01-28 20:24:15 +00:00
|
|
|
* Activate existing knote and register tracking knotes with
|
2008-07-07 09:30:11 +00:00
|
|
|
* new process.
|
2016-01-28 20:24:15 +00:00
|
|
|
*
|
|
|
|
* First register a knote to get just the child notice. This
|
|
|
|
* must be a separate note from a potential NOTE_EXIT
|
|
|
|
* notification since both NOTE_CHILD and NOTE_EXIT are defined
|
|
|
|
* to use the data field (in conflicting ways).
|
|
|
|
*/
|
|
|
|
kev.ident = pid;
|
|
|
|
kev.filter = kn->kn_filter;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
|
|
|
|
EV_FLAG2;
|
2016-01-28 20:24:15 +00:00
|
|
|
kev.fflags = kn->kn_sfflags;
|
|
|
|
kev.data = kn->kn_id; /* parent */
|
|
|
|
kev.udata = kn->kn_kevent.udata;/* preserve udata */
|
|
|
|
error = kqueue_register(kq, &kev, NULL, 0);
|
|
|
|
if (error)
|
|
|
|
kn->kn_fflags |= NOTE_TRACKERR;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Then register another knote to track other potential events
|
|
|
|
* from the new process.
|
2008-07-07 09:30:11 +00:00
|
|
|
*/
|
|
|
|
kev.ident = pid;
|
2000-04-16 19:02:08 +00:00
|
|
|
kev.filter = kn->kn_filter;
|
|
|
|
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
|
|
|
|
kev.fflags = kn->kn_sfflags;
|
2008-07-07 09:30:11 +00:00
|
|
|
kev.data = kn->kn_id; /* parent */
|
|
|
|
kev.udata = kn->kn_kevent.udata;/* preserve udata */
|
|
|
|
error = kqueue_register(kq, &kev, NULL, 0);
|
2000-04-16 19:02:08 +00:00
|
|
|
if (error)
|
|
|
|
kn->kn_fflags |= NOTE_TRACKERR;
|
2013-08-13 18:45:58 +00:00
|
|
|
if (kn->kn_fop->f_event(kn, NOTE_FORK))
|
|
|
|
KNOTE_ACTIVATE(kn, 0);
|
2008-07-07 09:30:11 +00:00
|
|
|
KQ_LOCK(kq);
|
|
|
|
kn->kn_status &= ~KN_INFLUX;
|
|
|
|
KQ_UNLOCK_FLUX(kq);
|
|
|
|
list->kl_lock(list->kl_lockarg);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2008-07-07 09:30:11 +00:00
|
|
|
list->kl_unlock(list->kl_lockarg);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2012-07-13 13:24:33 +00:00
|
|
|
/*
|
|
|
|
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
|
|
|
|
* interval timer support code.
|
|
|
|
*/
|
2014-07-18 14:27:04 +00:00
|
|
|
|
|
|
|
#define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
|
|
|
|
NOTE_NSECONDS)
|
|
|
|
|
2016-03-12 23:02:53 +00:00
|
|
|
static sbintime_t
|
2014-07-18 14:27:04 +00:00
|
|
|
timer2sbintime(intptr_t data, int flags)
|
2004-08-15 06:24:42 +00:00
|
|
|
{
|
2014-07-18 14:27:04 +00:00
|
|
|
|
2016-03-12 23:02:53 +00:00
|
|
|
/*
|
|
|
|
* Macros for converting to the fractional second portion of an
|
|
|
|
* sbintime_t using 64bit multiplication to improve precision.
|
|
|
|
*/
|
|
|
|
#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
|
|
|
|
#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
|
|
|
|
#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
|
2014-07-18 14:27:04 +00:00
|
|
|
switch (flags & NOTE_TIMER_PRECMASK) {
|
|
|
|
case NOTE_SECONDS:
|
2016-03-12 23:02:53 +00:00
|
|
|
#ifdef __LP64__
|
|
|
|
if (data > (SBT_MAX / SBT_1S))
|
|
|
|
return SBT_MAX;
|
|
|
|
#endif
|
|
|
|
return ((sbintime_t)data << 32);
|
2014-07-18 14:27:04 +00:00
|
|
|
case NOTE_MSECONDS: /* FALLTHROUGH */
|
|
|
|
case 0:
|
2016-03-12 23:02:53 +00:00
|
|
|
if (data >= 1000) {
|
|
|
|
int64_t secs = data / 1000;
|
|
|
|
#ifdef __LP64__
|
|
|
|
if (secs > (SBT_MAX / SBT_1S))
|
|
|
|
return SBT_MAX;
|
|
|
|
#endif
|
|
|
|
return (secs << 32 | MS_TO_SBT(data % 1000));
|
|
|
|
}
|
|
|
|
return MS_TO_SBT(data);
|
2014-07-18 14:27:04 +00:00
|
|
|
case NOTE_USECONDS:
|
2016-03-12 23:02:53 +00:00
|
|
|
if (data >= 1000000) {
|
|
|
|
int64_t secs = data / 1000000;
|
|
|
|
#ifdef __LP64__
|
|
|
|
if (secs > (SBT_MAX / SBT_1S))
|
|
|
|
return SBT_MAX;
|
|
|
|
#endif
|
|
|
|
return (secs << 32 | US_TO_SBT(data % 1000000));
|
|
|
|
}
|
|
|
|
return US_TO_SBT(data);
|
2014-07-18 14:27:04 +00:00
|
|
|
case NOTE_NSECONDS:
|
2016-03-12 23:02:53 +00:00
|
|
|
if (data >= 1000000000) {
|
|
|
|
int64_t secs = data / 1000000000;
|
2013-12-19 21:35:33 +00:00
|
|
|
#ifdef __LP64__
|
2016-03-12 23:02:53 +00:00
|
|
|
if (secs > (SBT_MAX / SBT_1S))
|
|
|
|
return SBT_MAX;
|
2013-12-19 21:35:33 +00:00
|
|
|
#endif
|
2016-03-12 23:02:53 +00:00
|
|
|
return (secs << 32 | US_TO_SBT(data % 1000000000));
|
|
|
|
}
|
|
|
|
return NS_TO_SBT(data);
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (-1);
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
|
2001-07-19 18:34:40 +00:00
|
|
|
static void
|
|
|
|
filt_timerexpire(void *knx)
|
|
|
|
{
|
2001-09-29 17:48:39 +00:00
|
|
|
struct callout *calloutp;
|
2013-03-04 16:55:16 +00:00
|
|
|
struct knote *kn;
|
2001-07-19 18:34:40 +00:00
|
|
|
|
2013-03-04 16:55:16 +00:00
|
|
|
kn = knx;
|
2001-07-19 18:34:40 +00:00
|
|
|
kn->kn_data++;
|
2004-08-15 06:24:42 +00:00
|
|
|
KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
|
2001-07-19 18:34:40 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
|
2001-09-29 17:48:39 +00:00
|
|
|
calloutp = (struct callout *)kn->kn_hook;
|
2014-10-04 15:59:15 +00:00
|
|
|
*kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata,
|
|
|
|
kn->kn_sfflags);
|
|
|
|
callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
|
|
|
|
filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
|
2001-07-19 18:34:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-07-18 14:27:04 +00:00
|
|
|
* data contains amount of time to sleep
|
2004-08-12 18:06:21 +00:00
|
|
|
*/
|
2001-07-19 18:34:40 +00:00
|
|
|
static int
|
|
|
|
filt_timerattach(struct knote *kn)
|
|
|
|
{
|
2001-09-29 17:48:39 +00:00
|
|
|
struct callout *calloutp;
|
2013-09-26 13:17:31 +00:00
|
|
|
sbintime_t to;
|
2013-06-16 09:30:35 +00:00
|
|
|
unsigned int ncallouts;
|
2001-07-19 18:34:40 +00:00
|
|
|
|
2013-09-26 13:17:31 +00:00
|
|
|
if ((intptr_t)kn->kn_sdata < 0)
|
|
|
|
return (EINVAL);
|
|
|
|
if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
|
|
|
|
kn->kn_sdata = 1;
|
2014-07-18 14:27:04 +00:00
|
|
|
/* Only precision unit are supported in flags so far */
|
|
|
|
if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK)
|
|
|
|
return (EINVAL);
|
|
|
|
|
|
|
|
to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
|
2013-09-26 13:17:31 +00:00
|
|
|
if (to < 0)
|
|
|
|
return (EINVAL);
|
|
|
|
|
2013-06-16 09:30:35 +00:00
|
|
|
ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
|
|
|
|
do {
|
|
|
|
if (ncallouts >= kq_calloutmax)
|
|
|
|
return (ENOMEM);
|
|
|
|
} while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
|
|
|
|
&ncallouts, ncallouts + 1, memory_order_relaxed,
|
|
|
|
memory_order_relaxed));
|
2001-07-19 18:34:40 +00:00
|
|
|
|
|
|
|
kn->kn_flags |= EV_CLEAR; /* automatically set */
|
2013-08-26 18:53:19 +00:00
|
|
|
kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
|
2014-10-04 15:59:15 +00:00
|
|
|
kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK);
|
2008-10-23 20:26:15 +00:00
|
|
|
calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
|
2015-05-22 17:05:21 +00:00
|
|
|
callout_init(calloutp, 1);
|
2002-06-29 00:29:12 +00:00
|
|
|
kn->kn_hook = calloutp;
|
2014-10-04 15:59:15 +00:00
|
|
|
*kn->kn_ptr.p_nexttime = to + sbinuptime();
|
|
|
|
callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
|
|
|
|
filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
|
2001-07-19 18:34:40 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_timerdetach(struct knote *kn)
|
|
|
|
{
|
2001-09-29 17:48:39 +00:00
|
|
|
struct callout *calloutp;
|
2013-06-16 09:30:35 +00:00
|
|
|
unsigned int old;
|
2001-07-19 18:34:40 +00:00
|
|
|
|
2001-09-29 17:48:39 +00:00
|
|
|
calloutp = (struct callout *)kn->kn_hook;
|
2004-04-07 05:59:57 +00:00
|
|
|
callout_drain(calloutp);
|
2008-10-23 15:53:51 +00:00
|
|
|
free(calloutp, M_KQUEUE);
|
2014-10-04 15:59:15 +00:00
|
|
|
free(kn->kn_ptr.p_nexttime, M_KQUEUE);
|
2013-06-16 09:30:35 +00:00
|
|
|
old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
|
|
|
|
KASSERT(old > 0, ("Number of callouts cannot become negative"));
|
2013-08-26 18:53:19 +00:00
|
|
|
kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
|
2001-07-19 18:34:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
filt_timer(struct knote *kn, long hint)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (kn->kn_data != 0);
|
|
|
|
}
|
|
|
|
|
2009-09-16 03:30:12 +00:00
|
|
|
static int
|
|
|
|
filt_userattach(struct knote *kn)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EVFILT_USER knotes are not attached to anything in the kernel.
|
|
|
|
*/
|
|
|
|
kn->kn_hook = NULL;
|
|
|
|
if (kn->kn_fflags & NOTE_TRIGGER)
|
|
|
|
kn->kn_hookid = 1;
|
|
|
|
else
|
|
|
|
kn->kn_hookid = 0;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_userdetach(__unused struct knote *kn)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EVFILT_USER knotes are not attached to anything in the kernel.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
filt_user(struct knote *kn, __unused long hint)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (kn->kn_hookid);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2009-09-22 16:16:02 +00:00
|
|
|
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
|
2009-09-16 03:30:12 +00:00
|
|
|
{
|
2009-09-22 16:16:02 +00:00
|
|
|
u_int ffctrl;
|
2009-09-16 03:30:12 +00:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case EVENT_REGISTER:
|
|
|
|
if (kev->fflags & NOTE_TRIGGER)
|
|
|
|
kn->kn_hookid = 1;
|
|
|
|
|
|
|
|
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
|
|
|
|
kev->fflags &= NOTE_FFLAGSMASK;
|
|
|
|
switch (ffctrl) {
|
|
|
|
case NOTE_FFNOP:
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NOTE_FFAND:
|
|
|
|
kn->kn_sfflags &= kev->fflags;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NOTE_FFOR:
|
|
|
|
kn->kn_sfflags |= kev->fflags;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case NOTE_FFCOPY:
|
|
|
|
kn->kn_sfflags = kev->fflags;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
/* XXX Return error? */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
kn->kn_sdata = kev->data;
|
|
|
|
if (kev->flags & EV_CLEAR) {
|
|
|
|
kn->kn_hookid = 0;
|
|
|
|
kn->kn_data = 0;
|
|
|
|
kn->kn_fflags = 0;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case EVENT_PROCESS:
|
|
|
|
*kev = kn->kn_kevent;
|
|
|
|
kev->fflags = kn->kn_sfflags;
|
|
|
|
kev->data = kn->kn_sdata;
|
|
|
|
if (kn->kn_flags & EV_CLEAR) {
|
|
|
|
kn->kn_hookid = 0;
|
|
|
|
kn->kn_data = 0;
|
|
|
|
kn->kn_fflags = 0;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
panic("filt_usertouch() - invalid type (%ld)", type);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-06-10 01:51:18 +00:00
|
|
|
/*
 * kqueue system call entry point: forwards to kern_kqueue() with no flags
 * and no file capabilities.  `uap' is not used.
 */
int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
	int error;

	error = kern_kqueue(td, 0, NULL);
	return (error);
}
|
|
|
|
|
2015-08-11 13:47:23 +00:00
|
|
|
static void
|
|
|
|
kqueue_init(struct kqueue *kq)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
|
|
|
|
TAILQ_INIT(&kq->kq_head);
|
|
|
|
knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
|
|
|
|
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
|
|
|
|
}
|
|
|
|
|
2015-05-24 16:36:29 +00:00
|
|
|
int
|
2015-08-05 07:36:50 +00:00
|
|
|
kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2013-09-18 18:48:33 +00:00
|
|
|
struct filedesc *fdp;
|
|
|
|
struct kqueue *kq;
|
|
|
|
struct file *fp;
|
2013-10-21 16:44:53 +00:00
|
|
|
struct ucred *cred;
|
2013-09-18 18:48:33 +00:00
|
|
|
int fd, error;
|
|
|
|
|
2015-09-23 12:45:08 +00:00
|
|
|
fdp = td->td_proc->p_fd;
|
2013-10-21 16:44:53 +00:00
|
|
|
cred = td->td_ucred;
|
2015-09-23 12:45:08 +00:00
|
|
|
if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
|
2013-11-03 23:06:24 +00:00
|
|
|
return (ENOMEM);
|
2013-10-21 16:44:53 +00:00
|
|
|
|
2015-08-05 07:36:50 +00:00
|
|
|
error = falloc_caps(td, &fp, &fd, flags, fcaps);
|
2015-09-23 12:45:08 +00:00
|
|
|
if (error != 0) {
|
|
|
|
chgkqcnt(cred->cr_ruidinfo, -1, 0);
|
|
|
|
return (error);
|
|
|
|
}
|
2013-09-18 18:48:33 +00:00
|
|
|
|
|
|
|
/* An extra reference on `fp' has been held for us by falloc(). */
|
|
|
|
kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
|
2015-08-11 13:47:23 +00:00
|
|
|
kqueue_init(kq);
|
2013-09-18 18:48:33 +00:00
|
|
|
kq->kq_fdp = fdp;
|
2015-09-23 12:45:08 +00:00
|
|
|
kq->kq_cred = crhold(cred);
|
2013-09-18 18:48:33 +00:00
|
|
|
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
|
|
|
|
|
|
|
finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
|
|
|
|
fdrop(fp, td);
|
|
|
|
|
|
|
|
td->td_retval[0] = fd;
|
2015-09-23 12:45:08 +00:00
|
|
|
return (0);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef _SYS_SYSPROTO_H_
/* Argument block of the kevent system call (see sys_kevent()). */
struct kevent_args {
	int	fd;				/* kqueue descriptor */
	const struct kevent *changelist;	/* changes to copy in */
	int	nchanges;			/* entries in changelist */
	struct	kevent *eventlist;		/* where events are copied out */
	int	nevents;			/* capacity of eventlist */
	const struct timespec *timeout;		/* optional; may be NULL */
};
#endif
|
|
|
|
int
|
2011-09-16 13:58:51 +00:00
|
|
|
sys_kevent(struct thread *td, struct kevent_args *uap)
|
2005-03-01 17:45:55 +00:00
|
|
|
{
|
|
|
|
struct timespec ts, *tsp;
|
2005-06-03 23:15:01 +00:00
|
|
|
struct kevent_copyops k_ops = { uap,
|
|
|
|
kevent_copyout,
|
|
|
|
kevent_copyin};
|
2005-03-01 17:45:55 +00:00
|
|
|
int error;
|
2006-09-24 02:23:29 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
struct uio ktruio;
|
|
|
|
struct iovec ktriov;
|
|
|
|
struct uio *ktruioin = NULL;
|
|
|
|
struct uio *ktruioout = NULL;
|
|
|
|
#endif
|
2005-03-01 17:45:55 +00:00
|
|
|
|
|
|
|
if (uap->timeout != NULL) {
|
|
|
|
error = copyin(uap->timeout, &ts, sizeof(ts));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
tsp = &ts;
|
|
|
|
} else
|
|
|
|
tsp = NULL;
|
|
|
|
|
2006-09-24 02:23:29 +00:00
|
|
|
#ifdef KTRACE
|
|
|
|
if (KTRPOINT(td, KTR_GENIO)) {
|
|
|
|
ktriov.iov_base = uap->changelist;
|
|
|
|
ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
|
|
|
|
ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
|
|
|
|
.uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
|
|
|
|
.uio_td = td };
|
|
|
|
ktruioin = cloneuio(&ktruio);
|
|
|
|
ktriov.iov_base = uap->eventlist;
|
|
|
|
ktriov.iov_len = uap->nevents * sizeof(struct kevent);
|
|
|
|
ktruioout = cloneuio(&ktruio);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
|
|
|
|
&k_ops, tsp);
|
|
|
|
|
|
|
|
#ifdef KTRACE
|
|
|
|
if (ktruioin != NULL) {
|
|
|
|
ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
|
|
|
|
ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
|
|
|
|
ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
|
|
|
|
ktrgenio(uap->fd, UIO_READ, ktruioout, error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return (error);
|
2005-03-01 17:45:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2005-06-03 23:15:01 +00:00
|
|
|
* Copy 'count' items into the destination list pointed to by uap->eventlist.
|
2005-03-01 17:45:55 +00:00
|
|
|
*/
|
|
|
|
static int
|
2005-06-03 23:15:01 +00:00
|
|
|
kevent_copyout(void *arg, struct kevent *kevp, int count)
|
2005-03-01 17:45:55 +00:00
|
|
|
{
|
2005-06-03 23:15:01 +00:00
|
|
|
struct kevent_args *uap;
|
2005-03-01 17:45:55 +00:00
|
|
|
int error;
|
|
|
|
|
2005-06-03 23:15:01 +00:00
|
|
|
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
|
|
|
|
uap = (struct kevent_args *)arg;
|
|
|
|
|
|
|
|
error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
|
|
|
|
if (error == 0)
|
|
|
|
uap->eventlist += count;
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy 'count' items from the list pointed to by uap->changelist.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
kevent_copyin(void *arg, struct kevent *kevp, int count)
|
|
|
|
{
|
|
|
|
struct kevent_args *uap;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
|
|
|
|
uap = (struct kevent_args *)arg;
|
|
|
|
|
|
|
|
error = copyin(uap->changelist, kevp, count * sizeof *kevp);
|
|
|
|
if (error == 0)
|
|
|
|
uap->changelist += count;
|
2005-03-01 17:45:55 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
2005-06-03 23:15:01 +00:00
|
|
|
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
|
|
|
|
struct kevent_copyops *k_ops, const struct timespec *timeout)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2013-09-18 18:48:33 +00:00
|
|
|
cap_rights_t rights;
|
2015-05-24 16:36:29 +00:00
|
|
|
struct file *fp;
|
|
|
|
int error;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2013-11-15 19:55:35 +00:00
|
|
|
cap_rights_init(&rights);
|
|
|
|
if (nchanges > 0)
|
|
|
|
cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
|
|
|
|
if (nevents > 0)
|
|
|
|
cap_rights_set(&rights, CAP_KQUEUE_EVENT);
|
|
|
|
error = fget(td, fd, &rights, &fp);
|
2013-09-18 18:48:33 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2013-11-15 19:55:35 +00:00
|
|
|
|
2015-05-24 16:36:29 +00:00
|
|
|
error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
|
|
|
|
fdrop(fp, td);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2015-08-11 13:47:23 +00:00
|
|
|
static int
|
|
|
|
kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
|
2015-05-24 16:36:29 +00:00
|
|
|
struct kevent_copyops *k_ops, const struct timespec *timeout)
|
|
|
|
{
|
|
|
|
struct kevent keva[KQ_NEVENTS];
|
|
|
|
struct kevent *kevp, *changes;
|
|
|
|
int i, n, nerrors, error;
|
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
nerrors = 0;
|
2005-03-01 17:45:55 +00:00
|
|
|
while (nchanges > 0) {
|
2005-06-03 23:15:01 +00:00
|
|
|
n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
|
|
|
|
error = k_ops->k_copyin(k_ops->arg, keva, n);
|
|
|
|
if (error)
|
2015-08-11 13:47:23 +00:00
|
|
|
return (error);
|
2005-06-03 23:15:01 +00:00
|
|
|
changes = keva;
|
2000-04-16 19:02:08 +00:00
|
|
|
for (i = 0; i < n; i++) {
|
2005-03-01 17:45:55 +00:00
|
|
|
kevp = &changes[i];
|
2005-10-12 17:51:31 +00:00
|
|
|
if (!kevp->filter)
|
|
|
|
continue;
|
2000-07-18 19:31:52 +00:00
|
|
|
kevp->flags &= ~EV_SYSFLAGS;
|
2004-08-15 06:24:42 +00:00
|
|
|
error = kqueue_register(kq, kevp, td, 1);
|
2009-09-16 03:49:54 +00:00
|
|
|
if (error || (kevp->flags & EV_RECEIPT)) {
|
2015-08-11 13:47:23 +00:00
|
|
|
if (nevents == 0)
|
|
|
|
return (error);
|
|
|
|
kevp->flags = EV_ERROR;
|
|
|
|
kevp->data = error;
|
|
|
|
(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
|
|
|
|
nevents--;
|
|
|
|
nerrors++;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
2005-03-01 17:45:55 +00:00
|
|
|
nchanges -= n;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
if (nerrors) {
|
2004-08-15 06:24:42 +00:00
|
|
|
td->td_retval[0] = nerrors;
|
2015-08-11 13:47:23 +00:00
|
|
|
return (0);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2015-08-11 13:47:23 +00:00
|
|
|
return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * kevent on an already looked-up file.  Acquires the kqueue behind `fp',
 * runs kqueue_kevent() on it and releases the kqueue again.  Returns the
 * kqueue_acquire() error or the result of kqueue_kevent().
 */
int
kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
    struct kevent_copyops *k_ops, const struct timespec *timeout)
{
	struct kqueue *kq;
	int error;

	error = kqueue_acquire(fp, &kq);
	if (error == 0) {
		error = kqueue_kevent(kq, td, nchanges, nevents, k_ops,
		    timeout);
		kqueue_release(kq, 0);
	}
	return (error);
}
|
|
|
|
|
2015-08-12 17:46:26 +00:00
|
|
|
/*
|
|
|
|
* Performs a kevent() call on a temporarily created kqueue. This can be
|
|
|
|
* used to perform one-shot polling, similar to poll() and select().
|
|
|
|
*/
|
2015-08-11 13:47:23 +00:00
|
|
|
int
|
|
|
|
kern_kevent_anonymous(struct thread *td, int nevents,
|
|
|
|
struct kevent_copyops *k_ops)
|
|
|
|
{
|
|
|
|
struct kqueue kq = {};
|
|
|
|
int error;
|
|
|
|
|
|
|
|
kqueue_init(&kq);
|
|
|
|
kq.kq_refcnt = 1;
|
|
|
|
error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
|
|
|
|
kqueue_drain(&kq, td);
|
|
|
|
kqueue_destroy(&kq);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2001-12-29 07:13:47 +00:00
|
|
|
int
|
|
|
|
kqueue_add_filteropts(int filt, struct filterops *filtops)
|
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
int error;
|
|
|
|
|
2009-12-31 20:56:28 +00:00
|
|
|
error = 0;
|
2004-08-15 06:24:42 +00:00
|
|
|
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
|
|
|
|
printf(
|
|
|
|
"trying to add a filterop that is out of range: %d is beyond %d\n",
|
|
|
|
~filt, EVFILT_SYSCOUNT);
|
|
|
|
return EINVAL;
|
|
|
|
}
|
|
|
|
mtx_lock(&filterops_lock);
|
|
|
|
if (sysfilt_ops[~filt].for_fop != &null_filtops &&
|
|
|
|
sysfilt_ops[~filt].for_fop != NULL)
|
|
|
|
error = EEXIST;
|
|
|
|
else {
|
|
|
|
sysfilt_ops[~filt].for_fop = filtops;
|
|
|
|
sysfilt_ops[~filt].for_refcnt = 0;
|
|
|
|
}
|
|
|
|
mtx_unlock(&filterops_lock);
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2009-12-31 20:56:28 +00:00
|
|
|
return (error);
|
2001-12-29 07:13:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
kqueue_del_filteropts(int filt)
|
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
int error;
|
2001-12-29 07:13:47 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
error = 0;
|
|
|
|
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
|
|
|
|
return EINVAL;
|
|
|
|
|
|
|
|
mtx_lock(&filterops_lock);
|
|
|
|
if (sysfilt_ops[~filt].for_fop == &null_filtops ||
|
|
|
|
sysfilt_ops[~filt].for_fop == NULL)
|
|
|
|
error = EINVAL;
|
|
|
|
else if (sysfilt_ops[~filt].for_refcnt != 0)
|
|
|
|
error = EBUSY;
|
|
|
|
else {
|
|
|
|
sysfilt_ops[~filt].for_fop = &null_filtops;
|
|
|
|
sysfilt_ops[~filt].for_refcnt = 0;
|
|
|
|
}
|
|
|
|
mtx_unlock(&filterops_lock);
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct filterops *
|
|
|
|
kqueue_fo_find(int filt)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
|
|
|
|
return NULL;
|
|
|
|
|
2014-11-16 01:18:41 +00:00
|
|
|
if (sysfilt_ops[~filt].for_nolock)
|
|
|
|
return sysfilt_ops[~filt].for_fop;
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
mtx_lock(&filterops_lock);
|
|
|
|
sysfilt_ops[~filt].for_refcnt++;
|
|
|
|
if (sysfilt_ops[~filt].for_fop == NULL)
|
|
|
|
sysfilt_ops[~filt].for_fop = &null_filtops;
|
|
|
|
mtx_unlock(&filterops_lock);
|
|
|
|
|
|
|
|
return sysfilt_ops[~filt].for_fop;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kqueue_fo_release(int filt)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
|
|
|
|
return;
|
|
|
|
|
2014-11-16 01:18:41 +00:00
|
|
|
if (sysfilt_ops[~filt].for_nolock)
|
|
|
|
return;
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
mtx_lock(&filterops_lock);
|
|
|
|
KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
|
|
|
|
("filter object refcount not valid on release"));
|
|
|
|
sysfilt_ops[~filt].for_refcnt--;
|
|
|
|
mtx_unlock(&filterops_lock);
|
2001-12-29 07:13:47 +00:00
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
|
2007-05-27 19:24:00 +00:00
|
|
|
* A ref to kq (obtained via kqueue_acquire) must be held. waitok will
|
2004-08-15 06:24:42 +00:00
|
|
|
* influence if memory allocation should wait. Make sure it is 0 if you
|
|
|
|
* hold any mutexes.
|
|
|
|
*/
|
2006-09-24 04:47:47 +00:00
|
|
|
static int
|
2004-08-15 06:24:42 +00:00
|
|
|
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
|
|
|
struct filterops *fops;
|
2004-08-15 06:24:42 +00:00
|
|
|
struct file *fp;
|
|
|
|
struct knote *kn, *tkn;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
struct knlist *knl;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2004-08-15 06:24:42 +00:00
|
|
|
int error, filt, event;
|
2013-09-22 19:54:47 +00:00
|
|
|
int haskqglobal, filedesc_unlock;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2016-02-19 01:35:01 +00:00
|
|
|
if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
|
|
|
|
return (EINVAL);
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
fp = NULL;
|
|
|
|
kn = NULL;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knl = NULL;
|
2004-08-15 06:24:42 +00:00
|
|
|
error = 0;
|
|
|
|
haskqglobal = 0;
|
2013-09-22 19:54:47 +00:00
|
|
|
filedesc_unlock = 0;
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
filt = kev->filter;
|
|
|
|
fops = kqueue_fo_find(filt);
|
|
|
|
if (fops == NULL)
|
|
|
|
return EINVAL;
|
|
|
|
|
2015-09-01 13:21:32 +00:00
|
|
|
if (kev->flags & EV_ADD) {
|
|
|
|
/*
|
|
|
|
* Prevent waiting with locks. Non-sleepable
|
|
|
|
* allocation failures are handled in the loop, only
|
|
|
|
* if the spare knote appears to be actually required.
|
|
|
|
*/
|
|
|
|
tkn = knote_alloc(waitok);
|
|
|
|
} else {
|
2014-11-16 01:18:41 +00:00
|
|
|
tkn = NULL;
|
2015-09-01 13:21:32 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
findkn:
|
2000-04-16 19:02:08 +00:00
|
|
|
if (fops->f_isfd) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KASSERT(td != NULL, ("td is NULL"));
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
error = fget(td, kev->ident,
|
2013-11-15 19:55:35 +00:00
|
|
|
cap_rights_init(&rights, CAP_EVENT), &fp);
|
2006-06-12 21:46:23 +00:00
|
|
|
if (error)
|
2004-08-15 06:24:42 +00:00
|
|
|
goto done;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
|
|
|
|
kev->ident, 0) != 0) {
|
2006-06-12 21:46:23 +00:00
|
|
|
/* try again */
|
2004-08-15 06:24:42 +00:00
|
|
|
fdrop(fp, td);
|
|
|
|
fp = NULL;
|
|
|
|
error = kqueue_expand(kq, fops, kev->ident, waitok);
|
|
|
|
if (error)
|
|
|
|
goto done;
|
|
|
|
goto findkn;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fp->f_type == DTYPE_KQUEUE) {
|
|
|
|
/*
|
2016-01-28 20:24:15 +00:00
|
|
|
* If we add some intelligence about what we are doing,
|
2004-08-15 06:24:42 +00:00
|
|
|
* we should be able to support events on ourselves.
|
|
|
|
* We need to know when we are doing this to prevent
|
|
|
|
* getting both the knlist lock and the kq lock since
|
|
|
|
* they are the same thing.
|
|
|
|
*/
|
|
|
|
if (fp->f_data == kq) {
|
|
|
|
error = EINVAL;
|
2006-06-02 13:21:21 +00:00
|
|
|
goto done;
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
|
2013-09-22 19:54:47 +00:00
|
|
|
/*
|
|
|
|
* Pre-lock the filedesc before the global
|
|
|
|
* lock mutex, see the comment in
|
|
|
|
* kqueue_close().
|
|
|
|
*/
|
|
|
|
FILEDESC_XLOCK(td->td_proc->p_fd);
|
|
|
|
filedesc_unlock = 1;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
|
|
|
|
}
|
|
|
|
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
if (kev->ident < kq->kq_knlistsize) {
|
|
|
|
SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
|
|
|
|
if (kev->filter == kn->kn_filter)
|
2000-04-16 19:02:08 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
2004-08-15 06:24:42 +00:00
|
|
|
if ((kev->flags & EV_ADD) == EV_ADD)
|
|
|
|
kqueue_expand(kq, fops, kev->ident, waitok);
|
|
|
|
|
|
|
|
KQ_LOCK(kq);
|
2016-01-28 20:24:15 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If possible, find an existing knote to use for this kevent.
|
|
|
|
*/
|
|
|
|
if (kev->filter == EVFILT_PROC &&
|
|
|
|
(kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
|
|
|
|
/* This is an internal creation of a process tracking
|
|
|
|
* note. Don't attempt to coalesce this with an
|
|
|
|
* existing note.
|
|
|
|
*/
|
|
|
|
;
|
|
|
|
} else if (kq->kq_knhashmask != 0) {
|
2000-04-16 19:02:08 +00:00
|
|
|
struct klist *list;
|
2004-08-13 07:38:58 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
list = &kq->kq_knhash[
|
|
|
|
KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
|
2000-04-16 19:02:08 +00:00
|
|
|
SLIST_FOREACH(kn, list, kn_link)
|
|
|
|
if (kev->ident == kn->kn_id &&
|
|
|
|
kev->filter == kn->kn_filter)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2016-01-28 20:24:15 +00:00
|
|
|
/* knote is in the process of changing, wait for it to stabilize. */
|
2004-08-15 06:24:42 +00:00
|
|
|
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
|
2009-10-10 14:56:34 +00:00
|
|
|
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
|
2013-09-22 19:54:47 +00:00
|
|
|
if (filedesc_unlock) {
|
|
|
|
FILEDESC_XUNLOCK(td->td_proc->p_fd);
|
|
|
|
filedesc_unlock = 0;
|
|
|
|
}
|
2009-10-10 14:56:34 +00:00
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
|
|
|
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
|
2004-08-15 06:24:42 +00:00
|
|
|
if (fp != NULL) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
fp = NULL;
|
|
|
|
}
|
|
|
|
goto findkn;
|
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* kn now contains the matching knote, or NULL if no match
|
|
|
|
*/
|
2009-09-16 03:15:57 +00:00
|
|
|
if (kn == NULL) {
|
|
|
|
if (kev->flags & EV_ADD) {
|
2004-08-15 06:24:42 +00:00
|
|
|
kn = tkn;
|
|
|
|
tkn = NULL;
|
2000-11-18 21:01:04 +00:00
|
|
|
if (kn == NULL) {
|
2006-06-02 13:23:39 +00:00
|
|
|
KQ_UNLOCK(kq);
|
2000-11-18 21:01:04 +00:00
|
|
|
error = ENOMEM;
|
|
|
|
goto done;
|
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_fp = fp;
|
|
|
|
kn->kn_kq = kq;
|
|
|
|
kn->kn_fop = fops;
|
2000-11-18 21:01:04 +00:00
|
|
|
/*
|
2004-08-15 06:24:42 +00:00
|
|
|
* apply reference counts to knote structure, and
|
2000-11-18 21:01:04 +00:00
|
|
|
* do not release it at the end of this routine.
|
|
|
|
*/
|
2004-08-15 06:24:42 +00:00
|
|
|
fops = NULL;
|
2000-11-18 21:01:04 +00:00
|
|
|
fp = NULL;
|
|
|
|
|
2000-06-22 18:39:31 +00:00
|
|
|
kn->kn_sfflags = kev->fflags;
|
|
|
|
kn->kn_sdata = kev->data;
|
|
|
|
kev->fflags = 0;
|
|
|
|
kev->data = 0;
|
|
|
|
kn->kn_kevent = *kev;
|
2006-04-01 20:15:39 +00:00
|
|
|
kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
|
2014-11-16 01:18:41 +00:00
|
|
|
EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status = KN_INFLUX|KN_DETACHED;
|
2000-06-22 18:39:31 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
error = knote_attach(kn, kq);
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
if (error != 0) {
|
|
|
|
tkn = kn;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((error = kn->kn_fop->f_attach(kn)) != 0) {
|
2001-09-12 08:38:13 +00:00
|
|
|
knote_drop(kn, td);
|
2000-04-16 19:02:08 +00:00
|
|
|
goto done;
|
|
|
|
}
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knl = kn_list_lock(kn);
|
2009-09-16 03:15:57 +00:00
|
|
|
goto done_ev_add;
|
2000-06-22 18:39:31 +00:00
|
|
|
} else {
|
2009-09-16 03:15:57 +00:00
|
|
|
/* No matching knote and the EV_ADD flag is not set. */
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK(kq);
|
2009-09-16 03:15:57 +00:00
|
|
|
error = ENOENT;
|
|
|
|
goto done;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2009-09-16 03:15:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (kev->flags & EV_DELETE) {
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-09-06 19:02:42 +00:00
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
2001-09-12 08:38:13 +00:00
|
|
|
knote_drop(kn, td);
|
2000-04-16 19:02:08 +00:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
2014-11-16 01:18:41 +00:00
|
|
|
if (kev->flags & EV_FORCEONESHOT) {
|
|
|
|
kn->kn_flags |= EV_ONESHOT;
|
|
|
|
KNOTE_ACTIVATE(kn, 1);
|
|
|
|
}
|
|
|
|
|
2009-09-16 03:15:57 +00:00
|
|
|
/*
|
|
|
|
* The user may change some filter values after the initial EV_ADD,
|
|
|
|
* but doing so will not reset any filter which has already been
|
|
|
|
* triggered.
|
|
|
|
*/
|
2014-04-05 14:09:16 +00:00
|
|
|
kn->kn_status |= KN_INFLUX | KN_SCAN;
|
2009-09-16 03:15:57 +00:00
|
|
|
KQ_UNLOCK(kq);
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knl = kn_list_lock(kn);
|
2009-09-16 03:15:57 +00:00
|
|
|
kn->kn_kevent.udata = kev->udata;
|
|
|
|
if (!fops->f_isfd && fops->f_touch != NULL) {
|
|
|
|
fops->f_touch(kn, kev, EVENT_REGISTER);
|
|
|
|
} else {
|
|
|
|
kn->kn_sfflags = kev->fflags;
|
|
|
|
kn->kn_sdata = kev->data;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can get here with kn->kn_knlist == NULL. This can happen when
|
|
|
|
* the initial attach event decides that the event is "completed"
|
|
|
|
* already. i.e. filt_procattach is called on a zombie process. It
|
|
|
|
* will call filt_proc which will remove it from the list, and NULL
|
|
|
|
* kn_knlist.
|
|
|
|
*/
|
|
|
|
done_ev_add:
|
2016-02-19 01:49:33 +00:00
|
|
|
if ((kev->flags & EV_ENABLE) != 0)
|
|
|
|
kn->kn_status &= ~KN_DISABLED;
|
|
|
|
else if ((kev->flags & EV_DISABLE) != 0)
|
2014-11-16 01:18:41 +00:00
|
|
|
kn->kn_status |= KN_DISABLED;
|
|
|
|
|
|
|
|
if ((kn->kn_status & KN_DISABLED) == 0)
|
|
|
|
event = kn->kn_fop->f_event(kn, 0);
|
|
|
|
else
|
|
|
|
event = 0;
|
2016-02-19 01:49:33 +00:00
|
|
|
|
2009-09-16 03:15:57 +00:00
|
|
|
KQ_LOCK(kq);
|
|
|
|
if (event)
|
2016-02-19 01:49:33 +00:00
|
|
|
kn->kn_status |= KN_ACTIVE;
|
|
|
|
if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
|
|
|
|
KN_ACTIVE)
|
|
|
|
knote_enqueue(kn);
|
2014-04-05 14:09:16 +00:00
|
|
|
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kn_list_unlock(knl);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK_FLUX(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
done:
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
|
2013-09-22 19:54:47 +00:00
|
|
|
if (filedesc_unlock)
|
|
|
|
FILEDESC_XUNLOCK(td->td_proc->p_fd);
|
2000-11-18 21:01:04 +00:00
|
|
|
if (fp != NULL)
|
2001-09-12 08:38:13 +00:00
|
|
|
fdrop(fp, td);
|
2015-09-01 13:21:32 +00:00
|
|
|
knote_free(tkn);
|
2004-08-15 06:24:42 +00:00
|
|
|
if (fops != NULL)
|
|
|
|
kqueue_fo_release(filt);
|
2000-04-16 19:02:08 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2007-05-27 19:24:00 +00:00
|
|
|
kqueue_acquire(struct file *fp, struct kqueue **kqp)
|
2004-08-15 06:24:42 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct kqueue *kq;
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
|
2007-12-30 01:42:15 +00:00
|
|
|
kq = fp->f_data;
|
|
|
|
if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
|
|
|
|
return (EBADF);
|
|
|
|
*kqp = kq;
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK(kq);
|
2007-12-30 01:42:15 +00:00
|
|
|
return (EBADF);
|
|
|
|
}
|
|
|
|
kq->kq_refcnt++;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kqueue_release(struct kqueue *kq, int locked)
|
|
|
|
{
|
|
|
|
if (locked)
|
|
|
|
KQ_OWNED(kq);
|
|
|
|
else
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
kq->kq_refcnt--;
|
|
|
|
if (kq->kq_refcnt == 1)
|
|
|
|
wakeup(&kq->kq_refcnt);
|
|
|
|
if (!locked)
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kqueue_schedtask(struct kqueue *kq)
|
|
|
|
{
|
|
|
|
|
|
|
|
KQ_OWNED(kq);
|
|
|
|
KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
|
|
|
|
("scheduling kqueue task while draining"));
|
|
|
|
|
|
|
|
if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
|
2016-05-24 21:13:33 +00:00
|
|
|
taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
|
2004-08-15 06:24:42 +00:00
|
|
|
kq->kq_state |= KQ_TASKSCHED;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Expand the kq to make sure we have storage for fops/ident pair.
|
|
|
|
*
|
|
|
|
* Return 0 on success (or no work necessary), return errno on failure.
|
|
|
|
*
|
|
|
|
* Not calling hashinit w/ waitok (proper malloc flag) should be safe.
|
|
|
|
* If kqueue_register is called from a non-fd context, there usually/should
|
|
|
|
* be no locks held.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
|
|
|
|
int waitok)
|
|
|
|
{
|
2010-03-30 18:31:55 +00:00
|
|
|
struct klist *list, *tmp_knhash, *to_free;
|
2004-08-15 06:24:42 +00:00
|
|
|
u_long tmp_knhashmask;
|
|
|
|
int size;
|
|
|
|
int fd;
|
|
|
|
int mflag = waitok ? M_WAITOK : M_NOWAIT;
|
|
|
|
|
|
|
|
KQ_NOTOWNED(kq);
|
|
|
|
|
2010-03-30 18:31:55 +00:00
|
|
|
to_free = NULL;
|
2004-08-15 06:24:42 +00:00
|
|
|
if (fops->f_isfd) {
|
|
|
|
fd = ident;
|
|
|
|
if (kq->kq_knlistsize <= fd) {
|
|
|
|
size = kq->kq_knlistsize;
|
|
|
|
while (size <= fd)
|
|
|
|
size += KQEXTENT;
|
2009-09-28 10:22:46 +00:00
|
|
|
list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
|
2004-08-15 06:24:42 +00:00
|
|
|
if (list == NULL)
|
|
|
|
return ENOMEM;
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
if (kq->kq_knlistsize > fd) {
|
2010-03-30 18:31:55 +00:00
|
|
|
to_free = list;
|
2004-08-15 06:24:42 +00:00
|
|
|
list = NULL;
|
|
|
|
} else {
|
|
|
|
if (kq->kq_knlist != NULL) {
|
|
|
|
bcopy(kq->kq_knlist, list,
|
2009-09-28 10:22:46 +00:00
|
|
|
kq->kq_knlistsize * sizeof(*list));
|
2010-03-30 18:31:55 +00:00
|
|
|
to_free = kq->kq_knlist;
|
2004-08-15 06:24:42 +00:00
|
|
|
kq->kq_knlist = NULL;
|
|
|
|
}
|
|
|
|
bzero((caddr_t)list +
|
2009-09-28 10:22:46 +00:00
|
|
|
kq->kq_knlistsize * sizeof(*list),
|
|
|
|
(size - kq->kq_knlistsize) * sizeof(*list));
|
2004-08-15 06:24:42 +00:00
|
|
|
kq->kq_knlistsize = size;
|
|
|
|
kq->kq_knlist = list;
|
|
|
|
}
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (kq->kq_knhashmask == 0) {
|
|
|
|
tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
|
|
|
|
&tmp_knhashmask);
|
|
|
|
if (tmp_knhash == NULL)
|
|
|
|
return ENOMEM;
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
if (kq->kq_knhashmask == 0) {
|
|
|
|
kq->kq_knhash = tmp_knhash;
|
|
|
|
kq->kq_knhashmask = tmp_knhashmask;
|
|
|
|
} else {
|
2010-03-30 18:31:55 +00:00
|
|
|
to_free = tmp_knhash;
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
}
|
|
|
|
}
|
2010-03-30 18:31:55 +00:00
|
|
|
free(to_free, M_KQUEUE);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
KQ_NOTOWNED(kq);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kqueue_task(void *arg, int pending)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2002-01-13 11:58:06 +00:00
|
|
|
struct kqueue *kq;
|
2004-08-15 06:24:42 +00:00
|
|
|
int haskqglobal;
|
|
|
|
|
|
|
|
haskqglobal = 0;
|
|
|
|
kq = arg;
|
|
|
|
|
|
|
|
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
|
|
|
|
KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
|
|
|
|
|
|
|
|
kq->kq_state &= ~KQ_TASKSCHED;
|
|
|
|
if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
|
|
|
|
wakeup(&kq->kq_state);
|
|
|
|
}
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan, update kn_data (if not ONESHOT), and copyout triggered events.
|
|
|
|
* We treat KN_MARKER knotes as if they are INFLUX.
|
|
|
|
*/
|
|
|
|
static int
|
2005-06-03 23:15:01 +00:00
|
|
|
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
|
|
|
|
const struct timespec *tsp, struct kevent *keva, struct thread *td)
|
2004-08-15 06:24:42 +00:00
|
|
|
{
|
2000-04-16 19:02:08 +00:00
|
|
|
struct kevent *kevp;
|
2004-08-16 03:08:38 +00:00
|
|
|
struct knote *kn, *marker;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
struct knlist *knl;
|
2013-03-04 16:55:16 +00:00
|
|
|
sbintime_t asbt, rsbt;
|
|
|
|
int count, error, haskqglobal, influx, nkev, touch;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
count = maxevents;
|
2004-08-15 06:24:42 +00:00
|
|
|
nkev = 0;
|
|
|
|
error = 0;
|
|
|
|
haskqglobal = 0;
|
|
|
|
|
|
|
|
if (maxevents == 0)
|
|
|
|
goto done_nl;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2013-03-04 16:55:16 +00:00
|
|
|
rsbt = 0;
|
2000-08-07 16:45:42 +00:00
|
|
|
if (tsp != NULL) {
|
2013-03-04 16:55:16 +00:00
|
|
|
if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
|
2013-03-07 16:50:07 +00:00
|
|
|
tsp->tv_nsec >= 1000000000) {
|
2000-04-16 19:02:08 +00:00
|
|
|
error = EINVAL;
|
2004-08-15 06:24:42 +00:00
|
|
|
goto done_nl;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2013-03-04 16:55:16 +00:00
|
|
|
if (timespecisset(tsp)) {
|
2013-03-09 09:07:13 +00:00
|
|
|
if (tsp->tv_sec <= INT32_MAX) {
|
2013-03-06 19:37:38 +00:00
|
|
|
rsbt = tstosbt(*tsp);
|
|
|
|
if (TIMESEL(&asbt, rsbt))
|
|
|
|
asbt += tc_tick_sbt;
|
2014-04-12 23:29:29 +00:00
|
|
|
if (asbt <= SBT_MAX - rsbt)
|
2013-03-09 09:07:13 +00:00
|
|
|
asbt += rsbt;
|
|
|
|
else
|
2013-03-06 19:37:38 +00:00
|
|
|
asbt = 0;
|
|
|
|
rsbt >>= tc_precexp;
|
|
|
|
} else
|
|
|
|
asbt = 0;
|
2013-03-04 16:55:16 +00:00
|
|
|
} else
|
|
|
|
asbt = -1;
|
|
|
|
} else
|
|
|
|
asbt = 0;
|
2004-08-16 03:08:38 +00:00
|
|
|
marker = knote_alloc(1);
|
|
|
|
marker->kn_status = KN_MARKER;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
retry:
|
2004-08-15 06:24:42 +00:00
|
|
|
kevp = keva;
|
2000-04-16 19:02:08 +00:00
|
|
|
if (kq->kq_count == 0) {
|
2013-03-04 16:55:16 +00:00
|
|
|
if (asbt == -1) {
|
2000-08-07 16:45:42 +00:00
|
|
|
error = EWOULDBLOCK;
|
|
|
|
} else {
|
|
|
|
kq->kq_state |= KQ_SLEEP;
|
2013-03-04 16:55:16 +00:00
|
|
|
error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
|
|
|
|
"kqread", asbt, rsbt, C_ABSOLUTE);
|
2000-08-07 16:45:42 +00:00
|
|
|
}
|
2000-08-01 04:27:50 +00:00
|
|
|
if (error == 0)
|
2000-04-16 19:02:08 +00:00
|
|
|
goto retry;
|
2000-08-01 04:27:50 +00:00
|
|
|
/* don't restart after signals... */
|
|
|
|
if (error == ERESTART)
|
|
|
|
error = EINTR;
|
|
|
|
else if (error == EWOULDBLOCK)
|
2000-04-16 19:02:08 +00:00
|
|
|
error = 0;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
2004-08-16 03:08:38 +00:00
|
|
|
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
|
2008-05-10 11:37:05 +00:00
|
|
|
influx = 0;
|
2000-04-16 19:02:08 +00:00
|
|
|
while (count) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_OWNED(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
kn = TAILQ_FIRST(&kq->kq_head);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2004-08-16 03:08:38 +00:00
|
|
|
if ((kn->kn_status == KN_MARKER && kn != marker) ||
|
2004-08-15 06:24:42 +00:00
|
|
|
(kn->kn_status & KN_INFLUX) == KN_INFLUX) {
|
2008-05-10 11:37:05 +00:00
|
|
|
if (influx) {
|
|
|
|
influx = 0;
|
|
|
|
KQ_FLUX_WAKEUP(kq);
|
|
|
|
}
|
2008-07-07 09:15:29 +00:00
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
2004-08-15 06:24:42 +00:00
|
|
|
error = msleep(kq, &kq->kq_lock, PSOCK,
|
|
|
|
"kqflxwt", 0);
|
|
|
|
continue;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
|
|
|
|
if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_status &= ~KN_QUEUED;
|
|
|
|
kq->kq_count--;
|
|
|
|
continue;
|
|
|
|
}
|
2004-08-16 03:08:38 +00:00
|
|
|
if (kn == marker) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_FLUX_WAKEUP(kq);
|
|
|
|
if (count == maxevents)
|
|
|
|
goto retry;
|
|
|
|
goto done;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
KASSERT((kn->kn_status & KN_INFLUX) == 0,
|
|
|
|
("KN_INFLUX set when not suppose to be"));
|
|
|
|
|
2013-08-07 19:56:35 +00:00
|
|
|
if ((kn->kn_flags & EV_DROP) == EV_DROP) {
|
|
|
|
kn->kn_status &= ~KN_QUEUED;
|
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
kq->kq_count--;
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
/*
|
|
|
|
* We don't need to lock the list since we've marked
|
|
|
|
* it _INFLUX.
|
|
|
|
*/
|
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
|
|
|
knote_drop(kn, td);
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
continue;
|
|
|
|
} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_status &= ~KN_QUEUED;
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status |= KN_INFLUX;
|
2000-04-16 19:02:08 +00:00
|
|
|
kq->kq_count--;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
/*
|
|
|
|
* We don't need to lock the list since we've marked
|
|
|
|
* it _INFLUX.
|
|
|
|
*/
|
|
|
|
*kevp = kn->kn_kevent;
|
2004-09-06 19:02:42 +00:00
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
2001-09-12 08:38:13 +00:00
|
|
|
knote_drop(kn, td);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kq);
|
|
|
|
kn = NULL;
|
2000-04-16 19:02:08 +00:00
|
|
|
} else {
|
2014-04-05 14:09:16 +00:00
|
|
|
kn->kn_status |= KN_INFLUX | KN_SCAN;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
|
|
|
|
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knl = kn_list_lock(kn);
|
2004-08-15 06:24:42 +00:00
|
|
|
if (kn->kn_fop->f_event(kn, 0) == 0) {
|
|
|
|
KQ_LOCK(kq);
|
2006-04-14 14:27:28 +00:00
|
|
|
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status &=
|
2014-04-05 14:09:16 +00:00
|
|
|
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
|
|
|
|
KN_SCAN);
|
2004-08-15 06:24:42 +00:00
|
|
|
kq->kq_count--;
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kn_list_unlock(knl);
|
2008-05-10 11:37:05 +00:00
|
|
|
influx = 1;
|
2004-08-15 06:24:42 +00:00
|
|
|
continue;
|
|
|
|
}
|
2009-09-16 03:15:57 +00:00
|
|
|
touch = (!kn->kn_fop->f_isfd &&
|
|
|
|
kn->kn_fop->f_touch != NULL);
|
|
|
|
if (touch)
|
|
|
|
kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
|
|
|
|
else
|
|
|
|
*kevp = kn->kn_kevent;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kq);
|
2006-04-14 14:27:28 +00:00
|
|
|
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
|
2013-08-26 18:53:19 +00:00
|
|
|
if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
|
2009-09-16 03:15:57 +00:00
|
|
|
/*
|
|
|
|
* Manually clear knotes who weren't
|
|
|
|
* 'touch'ed.
|
|
|
|
*/
|
2009-09-16 03:37:39 +00:00
|
|
|
if (touch == 0 && kn->kn_flags & EV_CLEAR) {
|
2009-09-16 03:15:57 +00:00
|
|
|
kn->kn_data = 0;
|
|
|
|
kn->kn_fflags = 0;
|
|
|
|
}
|
2009-09-16 03:37:39 +00:00
|
|
|
if (kn->kn_flags & EV_DISPATCH)
|
|
|
|
kn->kn_status |= KN_DISABLED;
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
|
|
|
|
kq->kq_count--;
|
|
|
|
} else
|
|
|
|
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
|
2005-09-15 21:10:12 +00:00
|
|
|
|
2014-04-05 14:09:16 +00:00
|
|
|
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kn_list_unlock(knl);
|
2008-05-10 11:37:05 +00:00
|
|
|
influx = 1;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
/* we are returning a copy to the user */
|
|
|
|
kevp++;
|
|
|
|
nkev++;
|
2000-04-16 19:02:08 +00:00
|
|
|
count--;
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
if (nkev == KQ_NEVENTS) {
|
2008-05-10 11:37:05 +00:00
|
|
|
influx = 0;
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK_FLUX(kq);
|
2005-06-03 23:15:01 +00:00
|
|
|
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
|
2000-04-16 19:02:08 +00:00
|
|
|
nkev = 0;
|
2004-08-15 06:24:42 +00:00
|
|
|
kevp = keva;
|
|
|
|
KQ_LOCK(kq);
|
2000-05-04 20:19:17 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
2004-08-16 03:08:38 +00:00
|
|
|
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
|
2000-04-16 19:02:08 +00:00
|
|
|
done:
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_OWNED(kq);
|
|
|
|
KQ_UNLOCK_FLUX(kq);
|
2004-08-16 03:08:38 +00:00
|
|
|
knote_free(marker);
|
2004-08-15 06:24:42 +00:00
|
|
|
done_nl:
|
|
|
|
KQ_NOTOWNED(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
if (nkev != 0)
|
2005-06-03 23:15:01 +00:00
|
|
|
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
|
2004-08-15 06:24:42 +00:00
|
|
|
td->td_retval[0] = maxevents - count;
|
2000-04-16 19:02:08 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2013-09-18 18:48:33 +00:00
|
|
|
static int
|
2004-07-14 07:02:03 +00:00
|
|
|
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
|
2002-08-17 02:36:16 +00:00
|
|
|
struct ucred *active_cred, struct thread *td)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-07-15 03:49:52 +00:00
|
|
|
/*
|
|
|
|
* Enabling sigio causes two major problems:
|
|
|
|
* 1) infinite recursion:
|
|
|
|
* Synopsys: kevent is being used to track signals and have FIOASYNC
|
|
|
|
* set. On receipt of a signal this will cause a kqueue to recurse
|
|
|
|
* into itself over and over. Sending the sigio causes the kqueue
|
|
|
|
* to become ready, which in turn posts sigio again, forever.
|
|
|
|
* Solution: this can be solved by setting a flag in the kqueue that
|
|
|
|
* we have a SIGIO in progress.
|
|
|
|
* 2) locking problems:
|
|
|
|
* Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
|
|
|
|
* us above the proc and pgrp locks.
|
|
|
|
* Solution: Post a signal using an async mechanism, being sure to
|
|
|
|
* record a generation count in the delivery so that we do not deliver
|
|
|
|
* a signal to the wrong process.
|
|
|
|
*
|
|
|
|
* Note, these two mechanisms are somewhat mutually exclusive!
|
|
|
|
*/
|
|
|
|
#if 0
|
2004-07-14 07:02:03 +00:00
|
|
|
struct kqueue *kq;
|
|
|
|
|
|
|
|
kq = fp->f_data;
|
|
|
|
switch (cmd) {
|
|
|
|
case FIOASYNC:
|
|
|
|
if (*(int *)data) {
|
|
|
|
kq->kq_state |= KQ_ASYNC;
|
|
|
|
} else {
|
|
|
|
kq->kq_state &= ~KQ_ASYNC;
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
case FIOSETOWN:
|
|
|
|
return (fsetown(*(int *)data, &kq->kq_sigio));
|
|
|
|
|
|
|
|
case FIOGETOWN:
|
|
|
|
*(int *)data = fgetown(&kq->kq_sigio);
|
|
|
|
return (0);
|
|
|
|
}
|
2004-07-15 03:49:52 +00:00
|
|
|
#endif
|
2004-07-14 07:02:03 +00:00
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
return (ENOTTY);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2013-09-18 18:48:33 +00:00
|
|
|
static int
|
Make similar changes to fo_stat() and fo_poll() as made earlier to
fo_read() and fo_write(): explicitly use the cred argument to fo_poll()
as "active_cred" using the passed file descriptor's f_cred reference
to provide access to the file credential. Add an active_cred
argument to fo_stat() so that implementers have access to the active
credential as well as the file credential. Generally modify callers
of fo_stat() to pass in td->td_ucred rather than fp->f_cred, which
was redundantly provided via the fp argument. This set of modifications
also permits threads to perform these operations on behalf of another
thread without modifying their credential.
Trickle this change down into fo_stat/poll() implementations:
- badfo_poll(), badfo_stat(): modify/add arguments.
- kqueue_poll(), kqueue_stat(): modify arguments.
- pipe_poll(), pipe_stat(): modify/add arguments, pass active_cred to
MAC checks rather than td->td_ucred.
- soo_poll(), soo_stat(): modify/add arguments, pass fp->f_cred rather
than cred to pru_sopoll() to maintain current semantics.
- sopoll(): moidfy arguments.
- vn_poll(), vn_statfile(): modify/add arguments, pass new arguments
to vn_stat(). Pass active_cred to MAC and fp->f_cred to VOP_POLL()
to maintian current semantics.
- vn_close(): rename cred to file_cred to reflect reality while I'm here.
- vn_stat(): Add active_cred and file_cred arguments to vn_stat()
and consumers so that this distinction is maintained at the VFS
as well as 'struct file' layer. Pass active_cred instead of
td->td_ucred to MAC and to VOP_GETATTR() to maintain current semantics.
- fifofs: modify the creation of a "filetemp" so that the file
credential is properly initialized and can be used in the socket
code if desired. Pass ap->a_td->td_ucred as the active
credential to soo_poll(). If we teach the vnop interface about
the distinction between file and active credentials, we would use
the active credential here.
Note that current inconsistent passing of active_cred vs. file_cred to
VOP's is maintained. It's not clear why GETATTR would be authorized
using active_cred while POLL would be authorized using file_cred at
the file system level.
Obtained from: TrustedBSD Project
Sponsored by: DARPA, NAI Labs
2002-08-16 12:52:03 +00:00
|
|
|
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
|
2002-08-16 14:12:40 +00:00
|
|
|
struct thread *td)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2002-01-13 11:58:06 +00:00
|
|
|
struct kqueue *kq;
|
2000-04-16 19:02:08 +00:00
|
|
|
int revents = 0;
|
2004-08-15 06:24:42 +00:00
|
|
|
int error;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2007-05-27 19:24:00 +00:00
|
|
|
if ((error = kqueue_acquire(fp, &kq)))
|
2004-08-15 06:24:42 +00:00
|
|
|
return POLLERR;
|
|
|
|
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
if (events & (POLLIN | POLLRDNORM)) {
|
|
|
|
if (kq->kq_count) {
|
|
|
|
revents |= events & (POLLIN | POLLRDNORM);
|
2000-04-16 19:02:08 +00:00
|
|
|
} else {
|
2004-08-15 06:24:42 +00:00
|
|
|
selrecord(td, &kq->kq_sel);
|
2007-12-16 06:21:20 +00:00
|
|
|
if (SEL_WAITING(&kq->kq_sel))
|
|
|
|
kq->kq_state |= KQ_SEL;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
kqueue_release(kq, 1);
|
|
|
|
KQ_UNLOCK(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
return (revents);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2013-09-18 18:48:33 +00:00
|
|
|
static int
|
Make similar changes to fo_stat() and fo_poll() as made earlier to
fo_read() and fo_write(): explicitly use the cred argument to fo_poll()
as "active_cred" using the passed file descriptor's f_cred reference
to provide access to the file credential. Add an active_cred
argument to fo_stat() so that implementers have access to the active
credential as well as the file credential. Generally modify callers
of fo_stat() to pass in td->td_ucred rather than fp->f_cred, which
was redundantly provided via the fp argument. This set of modifications
also permits threads to perform these operations on behalf of another
thread without modifying their credential.
Trickle this change down into fo_stat/poll() implementations:
- badfo_poll(), badfo_stat(): modify/add arguments.
- kqueue_poll(), kqueue_stat(): modify arguments.
- pipe_poll(), pipe_stat(): modify/add arguments, pass active_cred to
MAC checks rather than td->td_ucred.
- soo_poll(), soo_stat(): modify/add arguments, pass fp->f_cred rather
than cred to pru_sopoll() to maintain current semantics.
- sopoll(): moidfy arguments.
- vn_poll(), vn_statfile(): modify/add arguments, pass new arguments
to vn_stat(). Pass active_cred to MAC and fp->f_cred to VOP_POLL()
to maintian current semantics.
- vn_close(): rename cred to file_cred to reflect reality while I'm here.
- vn_stat(): Add active_cred and file_cred arguments to vn_stat()
and consumers so that this distinction is maintained at the VFS
as well as 'struct file' layer. Pass active_cred instead of
td->td_ucred to MAC and to VOP_GETATTR() to maintain current semantics.
- fifofs: modify the creation of a "filetemp" so that the file
credential is properly initialized and can be used in the socket
code if desired. Pass ap->a_td->td_ucred as the active
credential to soo_poll(). If we teach the vnop interface about
the distinction between file and active credentials, we would use
the active credential here.
Note that current inconsistent passing of active_cred vs. file_cred to
VOP's is maintained. It's not clear why GETATTR would be authorized
using active_cred while POLL would be authorized using file_cred at
the file system level.
Obtained from: TrustedBSD Project
Sponsored by: DARPA, NAI Labs
2002-08-16 12:52:03 +00:00
|
|
|
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
|
2002-08-16 14:12:40 +00:00
|
|
|
struct thread *td)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
|
|
|
|
2005-05-24 23:42:50 +00:00
|
|
|
bzero((void *)st, sizeof *st);
|
|
|
|
/*
|
|
|
|
* We no longer return kq_count because the unlocked value is useless.
|
|
|
|
* If you spent all this time getting the count, why not spend your
|
|
|
|
* syscall better by calling kevent?
|
|
|
|
*
|
|
|
|
* XXX - This is needed for libc_r.
|
|
|
|
*/
|
|
|
|
st->st_mode = S_IFIFO;
|
|
|
|
return (0);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2015-08-11 13:47:23 +00:00
|
|
|
static void
|
|
|
|
kqueue_drain(struct kqueue *kq, struct thread *td)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
struct knote *kn;
|
2000-04-16 19:02:08 +00:00
|
|
|
int i;
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
|
|
|
|
KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
|
|
|
|
("kqueue already closing"));
|
|
|
|
kq->kq_state |= KQ_CLOSING;
|
|
|
|
if (kq->kq_refcnt > 1)
|
|
|
|
msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
|
|
|
|
|
|
|
|
KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
|
|
|
|
|
|
|
|
KASSERT(knlist_empty(&kq->kq_sel.si_note),
|
|
|
|
("kqueue's knlist not empty"));
|
|
|
|
|
|
|
|
for (i = 0; i < kq->kq_knlistsize; i++) {
|
|
|
|
while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
|
2008-05-10 11:35:32 +00:00
|
|
|
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
|
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
|
|
|
msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
|
|
|
|
continue;
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-09-06 19:02:42 +00:00
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
2004-08-15 06:24:42 +00:00
|
|
|
knote_drop(kn, td);
|
|
|
|
KQ_LOCK(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
if (kq->kq_knhashmask != 0) {
|
|
|
|
for (i = 0; i <= kq->kq_knhashmask; i++) {
|
|
|
|
while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
|
2008-05-10 11:35:32 +00:00
|
|
|
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
|
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
|
|
|
msleep(kq, &kq->kq_lock, PSOCK,
|
|
|
|
"kqclo2", 0);
|
|
|
|
continue;
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-09-06 19:02:42 +00:00
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
2004-08-15 06:24:42 +00:00
|
|
|
knote_drop(kn, td);
|
|
|
|
KQ_LOCK(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
|
|
|
|
kq->kq_state |= KQ_TASKDRAIN;
|
|
|
|
msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
|
2004-02-20 04:00:48 +00:00
|
|
|
selwakeuppri(&kq->kq_sel, PSOCK);
|
2007-12-16 06:21:20 +00:00
|
|
|
if (!SEL_WAITING(&kq->kq_sel))
|
|
|
|
kq->kq_state &= ~KQ_SEL;
|
2004-02-20 04:00:48 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
KQ_UNLOCK(kq);
|
2015-08-11 13:47:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
kqueue_destroy(struct kqueue *kq)
|
|
|
|
{
|
|
|
|
|
2015-08-12 17:46:26 +00:00
|
|
|
KASSERT(kq->kq_fdp == NULL,
|
|
|
|
("kqueue still attached to a file descriptor"));
|
2015-08-11 13:47:23 +00:00
|
|
|
seldrain(&kq->kq_sel);
|
|
|
|
knlist_destroy(&kq->kq_sel.si_note);
|
|
|
|
mtx_destroy(&kq->kq_lock);
|
|
|
|
|
|
|
|
if (kq->kq_knhash != NULL)
|
|
|
|
free(kq->kq_knhash, M_KQUEUE);
|
|
|
|
if (kq->kq_knlist != NULL)
|
|
|
|
free(kq->kq_knlist, M_KQUEUE);
|
|
|
|
|
|
|
|
funsetown(&kq->kq_sigio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
kqueue_close(struct file *fp, struct thread *td)
|
|
|
|
{
|
|
|
|
struct kqueue *kq = fp->f_data;
|
|
|
|
struct filedesc *fdp;
|
|
|
|
int error;
|
|
|
|
int filedesc_unlock;
|
|
|
|
|
|
|
|
if ((error = kqueue_acquire(fp, &kq)))
|
|
|
|
return error;
|
|
|
|
kqueue_drain(kq, td);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2013-09-22 19:54:47 +00:00
|
|
|
/*
|
|
|
|
* We could be called due to the knote_drop() doing fdrop(),
|
|
|
|
* called from kqueue_register(). In this case the global
|
|
|
|
* lock is owned, and filedesc sx is locked before, to not
|
|
|
|
* take the sleepable lock after non-sleepable.
|
|
|
|
*/
|
2015-08-11 13:47:23 +00:00
|
|
|
fdp = kq->kq_fdp;
|
2015-08-12 17:46:26 +00:00
|
|
|
kq->kq_fdp = NULL;
|
2013-09-22 19:54:47 +00:00
|
|
|
if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
|
|
|
|
FILEDESC_XLOCK(fdp);
|
|
|
|
filedesc_unlock = 1;
|
|
|
|
} else
|
|
|
|
filedesc_unlock = 0;
|
2013-09-13 19:50:50 +00:00
|
|
|
TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
|
2013-09-22 19:54:47 +00:00
|
|
|
if (filedesc_unlock)
|
|
|
|
FILEDESC_XUNLOCK(fdp);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2015-08-11 13:47:23 +00:00
|
|
|
kqueue_destroy(kq);
|
2013-10-21 16:44:53 +00:00
|
|
|
chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
|
|
|
|
crfree(kq->kq_cred);
|
2001-09-29 17:48:39 +00:00
|
|
|
free(kq, M_KQUEUE);
|
2003-01-13 00:33:17 +00:00
|
|
|
fp->f_data = NULL;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2014-09-22 16:20:47 +00:00
|
|
|
static int
|
|
|
|
kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
|
|
|
|
{
|
|
|
|
|
|
|
|
kif->kf_type = KF_TYPE_KQUEUE;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2000-04-16 19:02:08 +00:00
|
|
|
static void
|
|
|
|
kqueue_wakeup(struct kqueue *kq)
|
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_OWNED(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
|
2000-04-16 19:02:08 +00:00
|
|
|
kq->kq_state &= ~KQ_SLEEP;
|
|
|
|
wakeup(kq);
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
|
2003-11-09 09:17:26 +00:00
|
|
|
selwakeuppri(&kq->kq_sel, PSOCK);
|
2007-12-16 06:21:20 +00:00
|
|
|
if (!SEL_WAITING(&kq->kq_sel))
|
|
|
|
kq->kq_state &= ~KQ_SEL;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
if (!knlist_empty(&kq->kq_sel.si_note))
|
|
|
|
kqueue_schedtask(kq);
|
|
|
|
if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
|
2004-07-14 07:02:03 +00:00
|
|
|
pgsigio(&kq->kq_sigio, SIGIO, 0);
|
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-08-15 06:24:42 +00:00
|
|
|
* Walk down a list of knotes, activating them if their event has triggered.
|
|
|
|
*
|
|
|
|
* There is a possibility to optimize in the case of one kq watching another.
|
|
|
|
* Instead of scheduling a task to wake it up, you could pass enough state
|
|
|
|
* down the chain to make up the parent kqueue. Make this code functional
|
|
|
|
* first.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
|
|
|
void
|
2009-06-28 21:49:43 +00:00
|
|
|
knote(struct knlist *list, long hint, int lockflags)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
struct kqueue *kq;
|
2015-09-01 14:05:29 +00:00
|
|
|
struct knote *kn, *tkn;
|
2009-06-28 21:49:43 +00:00
|
|
|
int error;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
if (list == NULL)
|
|
|
|
return;
|
|
|
|
|
2009-06-28 21:49:43 +00:00
|
|
|
KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
|
2005-07-01 16:28:32 +00:00
|
|
|
|
2009-06-28 21:49:43 +00:00
|
|
|
if ((lockflags & KNF_LISTLOCKED) == 0)
|
2005-07-01 16:28:32 +00:00
|
|
|
list->kl_lock(list->kl_lockarg);
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
|
2015-09-01 14:05:29 +00:00
|
|
|
* If we unlock the list lock (and set KN_INFLUX), we can
|
|
|
|
* eliminate the kqueue scheduling, but this will introduce
|
|
|
|
* four lock/unlock's for each knote to test. Also, marker
|
|
|
|
* would be needed to keep iteration position, since filters
|
|
|
|
* or other threads could remove events.
|
2004-08-15 06:24:42 +00:00
|
|
|
*/
|
2015-09-01 14:05:29 +00:00
|
|
|
SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
|
2004-08-15 06:24:42 +00:00
|
|
|
kq = kn->kn_kq;
|
2014-04-05 14:09:16 +00:00
|
|
|
KQ_LOCK(kq);
|
|
|
|
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
|
|
|
|
/*
|
|
|
|
* Do not process the influx notes, except for
|
|
|
|
* the influx coming from the kq unlock in the
|
|
|
|
* kqueue_scan(). In the later case, we do
|
|
|
|
* not interfere with the scan, since the code
|
|
|
|
* fragment in kqueue_scan() locks the knlist,
|
|
|
|
* and cannot proceed until we finished.
|
|
|
|
*/
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
} else if ((lockflags & KNF_NOKQLOCK) != 0) {
|
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
error = kn->kn_fop->f_event(kn, hint);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kq);
|
2014-04-05 14:09:16 +00:00
|
|
|
kn->kn_status &= ~KN_INFLUX;
|
|
|
|
if (error)
|
|
|
|
KNOTE_ACTIVATE(kn, 1);
|
|
|
|
KQ_UNLOCK_FLUX(kq);
|
|
|
|
} else {
|
|
|
|
kn->kn_status |= KN_HASKQLOCK;
|
|
|
|
if (kn->kn_fop->f_event(kn, hint))
|
|
|
|
KNOTE_ACTIVATE(kn, 1);
|
|
|
|
kn->kn_status &= ~KN_HASKQLOCK;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
}
|
2009-06-28 21:49:43 +00:00
|
|
|
if ((lockflags & KNF_LISTLOCKED) == 0)
|
2005-07-01 16:28:32 +00:00
|
|
|
list->kl_unlock(list->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* add a knote to a knlist
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
|
|
|
|
{
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_LOCK(knl, islocked);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_NOTOWNED(kn->kn_kq);
|
|
|
|
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
|
|
|
|
(KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
|
|
|
|
if (!islocked)
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
|
|
|
|
if (!islocked)
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kn->kn_kq);
|
|
|
|
kn->kn_knlist = knl;
|
|
|
|
kn->kn_status &= ~KN_DETACHED;
|
|
|
|
KQ_UNLOCK(kn->kn_kq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
|
|
|
|
int kqislocked)
|
2004-08-15 06:24:42 +00:00
|
|
|
{
|
|
|
|
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_LOCK(knl, knlislocked);
|
2004-08-15 06:24:42 +00:00
|
|
|
mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
|
|
|
|
if (!kqislocked)
|
|
|
|
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
|
|
|
|
("knlist_remove called w/o knote being KN_INFLUX or already removed"));
|
|
|
|
if (!knlislocked)
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
|
|
|
|
kn->kn_knlist = NULL;
|
|
|
|
if (!knlislocked)
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
kn_list_unlock(knl);
|
2004-08-15 06:24:42 +00:00
|
|
|
if (!kqislocked)
|
|
|
|
KQ_LOCK(kn->kn_kq);
|
|
|
|
kn->kn_status |= KN_DETACHED;
|
|
|
|
if (!kqislocked)
|
|
|
|
KQ_UNLOCK(kn->kn_kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Remove a knote from the specified knlist.
 *
 * Public wrapper around knlist_remove_kq() for callers that do not hold
 * the kq lock; 'islocked' tells whether the knlist lock is already held.
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
	const int kqislocked = 0;

	knlist_remove_kq(knl, kn, islocked, kqislocked);
}
|
|
|
|
|
|
|
|
int
|
|
|
|
knlist_empty(struct knlist *knl)
|
|
|
|
{
|
2012-03-26 09:34:17 +00:00
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_LOCKED(knl);
|
2004-08-15 06:24:42 +00:00
|
|
|
return SLIST_EMPTY(&knl->kl_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct mtx knlist_lock;
|
|
|
|
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
|
|
|
|
MTX_DEF);
|
2005-07-01 16:28:32 +00:00
|
|
|
static void knlist_mtx_lock(void *arg);
|
|
|
|
static void knlist_mtx_unlock(void *arg);
|
|
|
|
|
|
|
|
/*
 * Mutex flavour of the kl_lock callback (the default chosen by
 * knlist_init()): 'arg' is the struct mtx to acquire.
 */
static void
knlist_mtx_lock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_lock(m);
}
|
|
|
|
|
|
|
|
/*
 * Mutex flavour of the kl_unlock callback (the default chosen by
 * knlist_init()): 'arg' is the struct mtx to release.
 */
static void
knlist_mtx_unlock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_unlock(m);
}
|
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
static void
|
|
|
|
knlist_mtx_assert_locked(void *arg)
|
|
|
|
{
|
2012-03-26 09:34:17 +00:00
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
mtx_assert((struct mtx *)arg, MA_OWNED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knlist_mtx_assert_unlocked(void *arg)
|
2005-07-01 16:28:32 +00:00
|
|
|
{
|
2012-03-26 09:34:17 +00:00
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
mtx_assert((struct mtx *)arg, MA_NOTOWNED);
|
2005-07-01 16:28:32 +00:00
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2012-03-26 09:34:17 +00:00
|
|
|
/*
 * Reader/writer-lock flavour of the kl_lock callback, installed by
 * knlist_init_rw_reader(): take a read lock on the struct rwlock in 'arg'.
 */
static void
knlist_rw_rlock(void *arg)
{
	struct rwlock *rw;

	rw = arg;
	rw_rlock(rw);
}
|
|
|
|
|
|
|
|
/*
 * Reader/writer-lock flavour of the kl_unlock callback, installed by
 * knlist_init_rw_reader(): drop a read lock on the struct rwlock in 'arg'.
 */
static void
knlist_rw_runlock(void *arg)
{
	struct rwlock *rw;

	rw = arg;
	rw_runlock(rw);
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knlist_rw_assert_locked(void *arg)
|
|
|
|
{
|
|
|
|
|
|
|
|
rw_assert((struct rwlock *)arg, RA_LOCKED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knlist_rw_assert_unlocked(void *arg)
|
|
|
|
{
|
|
|
|
|
|
|
|
rw_assert((struct rwlock *)arg, RA_UNLOCKED);
|
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
void
|
2005-07-01 16:28:32 +00:00
|
|
|
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
|
2009-06-10 20:59:32 +00:00
|
|
|
void (*kl_unlock)(void *),
|
|
|
|
void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
|
2004-08-15 06:24:42 +00:00
|
|
|
{
|
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
if (lock == NULL)
|
|
|
|
knl->kl_lockarg = &knlist_lock;
|
|
|
|
else
|
|
|
|
knl->kl_lockarg = lock;
|
|
|
|
|
|
|
|
if (kl_lock == NULL)
|
|
|
|
knl->kl_lock = knlist_mtx_lock;
|
|
|
|
else
|
|
|
|
knl->kl_lock = kl_lock;
|
2006-04-07 17:21:27 +00:00
|
|
|
if (kl_unlock == NULL)
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_unlock = knlist_mtx_unlock;
|
|
|
|
else
|
|
|
|
knl->kl_unlock = kl_unlock;
|
2009-06-10 20:59:32 +00:00
|
|
|
if (kl_assert_locked == NULL)
|
|
|
|
knl->kl_assert_locked = knlist_mtx_assert_locked;
|
2004-08-15 06:24:42 +00:00
|
|
|
else
|
2009-06-10 20:59:32 +00:00
|
|
|
knl->kl_assert_locked = kl_assert_locked;
|
|
|
|
if (kl_assert_unlocked == NULL)
|
|
|
|
knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
|
|
|
|
else
|
|
|
|
knl->kl_assert_unlocked = kl_assert_unlocked;
|
2004-08-15 06:24:42 +00:00
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
knl->kl_autodestroy = false;
|
2004-08-15 06:24:42 +00:00
|
|
|
SLIST_INIT(&knl->kl_list);
|
|
|
|
}
|
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
void
|
|
|
|
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
|
|
|
|
{
|
|
|
|
|
|
|
|
knlist_init(knl, lock, NULL, NULL, NULL, NULL);
|
|
|
|
}
|
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
struct knlist *
|
|
|
|
knlist_alloc(struct mtx *lock)
|
|
|
|
{
|
|
|
|
struct knlist *knl;
|
|
|
|
|
|
|
|
knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
|
|
|
|
knlist_init_mtx(knl, lock);
|
|
|
|
return (knl);
|
|
|
|
}
|
|
|
|
|
2012-03-26 09:34:17 +00:00
|
|
|
void
|
|
|
|
knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
|
|
|
|
{
|
|
|
|
|
|
|
|
knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
|
|
|
|
knlist_rw_assert_locked, knlist_rw_assert_unlocked);
|
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
void
|
|
|
|
knlist_destroy(struct knlist *knl)
|
|
|
|
{
|
|
|
|
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
/*
|
|
|
|
* if we run across this error, we need to find the offending
|
2013-08-26 18:53:19 +00:00
|
|
|
* driver and have it call knlist_clear or knlist_delete.
|
2004-08-15 06:24:42 +00:00
|
|
|
*/
|
|
|
|
if (!SLIST_EMPTY(&knl->kl_list))
|
|
|
|
printf("WARNING: destroying knlist w/ knotes on it!\n");
|
|
|
|
#endif
|
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
|
2004-08-15 06:24:42 +00:00
|
|
|
SLIST_INIT(&knl->kl_list);
|
|
|
|
}
|
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
void
|
|
|
|
knlist_detach(struct knlist *knl)
|
|
|
|
{
|
|
|
|
|
|
|
|
KNL_ASSERT_LOCKED(knl);
|
|
|
|
knl->kl_autodestroy = true;
|
|
|
|
if (knlist_empty(knl)) {
|
|
|
|
knlist_destroy(knl);
|
|
|
|
free(knl, M_KQUEUE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
/*
|
|
|
|
* Even if we are locked, we may need to drop the lock to allow any influx
|
|
|
|
* knotes time to "settle".
|
|
|
|
*/
|
|
|
|
void
|
2005-03-18 01:11:39 +00:00
|
|
|
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2006-06-02 13:18:59 +00:00
|
|
|
struct knote *kn, *kn2;
|
2004-08-15 06:24:42 +00:00
|
|
|
struct kqueue *kq;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
When filt_proc() removes event from the knlist due to the process
exiting (NOTE_EXIT->knlist_remove_inevent()), two things happen:
- knote kn_knlist pointer is reset
- INFLUX knote is removed from the process knlist.
And, there are two consequences:
- KN_LIST_UNLOCK() on such knote is nop
- there is nothing which would block exit1() from processing past the
knlist_destroy() (and knlist_destroy() resets knlist lock pointers).
Both consequences result either in leaked process lock, or
dereferencing NULL function pointers for locking.
Handle this by stopping embedding the process knlist into struct proc.
Instead, the knlist is allocated together with struct proc, but marked
as autodestroy on the zombie reap, by knlist_detach() function. The
knlist is freed when last kevent is removed from the list, in
particular, at the zombie reap time if the list is empty. As result,
the knlist_remove_inevent() is no longer needed and removed.
Other changes:
In filt_procattach(), clear NOTE_EXEC and NOTE_FORK desired events
from kn_sfflags for knote registered by kernel to only get NOTE_CHILD
notifications. The flags leak resulted in excessive
NOTE_EXEC/NOTE_FORK reports.
Fix immediate note activation in filt_procattach(). Condition should
be either the immediate CHILD_NOTE activation, or immediate NOTE_EXIT
report for the exiting process.
In knote_fork(), do not perform racy check for KN_INFLUX before kq
lock is taken. Besides being racy, it did not accounted for notes
just added by scan (KN_SCAN).
Some minor and incomplete style fixes.
Analyzed and tested by: Eric Badger <eric@badgerio.us>
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Approved by: re (gjb)
Differential revision: https://reviews.freebsd.org/D6859
2016-06-27 21:52:17 +00:00
|
|
|
KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
|
2004-08-15 06:24:42 +00:00
|
|
|
if (islocked)
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_LOCKED(knl);
|
2004-08-15 06:24:42 +00:00
|
|
|
else {
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_UNLOCKED(knl);
|
2007-05-27 19:24:00 +00:00
|
|
|
again: /* need to reacquire lock since we have dropped it */
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_lock(knl->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
|
2006-06-02 13:18:59 +00:00
|
|
|
SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
|
2004-08-15 06:24:42 +00:00
|
|
|
kq = kn->kn_kq;
|
|
|
|
KQ_LOCK(kq);
|
2005-03-18 01:11:39 +00:00
|
|
|
if ((kn->kn_status & KN_INFLUX)) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
knlist_remove_kq(knl, kn, 1, 1);
|
2005-03-18 01:11:39 +00:00
|
|
|
if (killkn) {
|
|
|
|
kn->kn_status |= KN_INFLUX | KN_DETACHED;
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
knote_drop(kn, td);
|
|
|
|
} else {
|
|
|
|
/* Make sure cleared knotes disappear soon */
|
|
|
|
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
|
|
|
|
KQ_UNLOCK(kq);
|
|
|
|
}
|
2004-08-15 06:24:42 +00:00
|
|
|
kq = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!SLIST_EMPTY(&knl->kl_list)) {
|
|
|
|
/* there are still KN_INFLUX remaining */
|
|
|
|
kn = SLIST_FIRST(&knl->kl_list);
|
|
|
|
kq = kn->kn_kq;
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
KASSERT(kn->kn_status & KN_INFLUX,
|
|
|
|
("knote removed w/o list lock"));
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
2004-08-15 06:24:42 +00:00
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
|
|
|
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
|
|
|
|
kq = NULL;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (islocked)
|
2005-07-01 16:28:32 +00:00
|
|
|
KNL_ASSERT_LOCKED(knl);
|
2004-08-15 06:24:42 +00:00
|
|
|
else {
|
2005-07-01 16:28:32 +00:00
|
|
|
knl->kl_unlock(knl->kl_lockarg);
|
|
|
|
KNL_ASSERT_UNLOCKED(knl);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
* Remove all knotes referencing a specified fd must be called with FILEDESC
|
|
|
|
* lock. This prevents a race where a new fd comes along and occupies the
|
|
|
|
* entry and we attach a knote to the fd.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
|
|
|
void
|
2001-09-12 08:38:13 +00:00
|
|
|
knote_fdclose(struct thread *td, int fd)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2001-09-12 08:38:13 +00:00
|
|
|
struct filedesc *fdp = td->td_proc->p_fd;
|
2004-08-15 06:24:42 +00:00
|
|
|
struct kqueue *kq;
|
|
|
|
struct knote *kn;
|
|
|
|
int influx;
|
2000-04-16 19:02:08 +00:00
|
|
|
|
Replace custom file descriptor array sleep lock constructed using a mutex
and flags with an sxlock. This leads to a significant and measurable
performance improvement as a result of access to shared locking for
frequent lookup operations, reduced general overhead, and reduced overhead
in the event of contention. All of these are imported for threaded
applications where simultaneous access to a shared file descriptor array
occurs frequently. Kris has reported 2x-4x transaction rate improvements
on 8-core MySQL benchmarks; smaller improvements can be expected for many
workloads as a result of reduced overhead.
- Generally eliminate the distinction between "fast" and regular
acquisisition of the filedesc lock; the plan is that they will now all
be fast. Change all locking instances to either shared or exclusive
locks.
- Correct a bug (pointed out by kib) in fdfree() where previously msleep()
was called without the mutex held; sx_sleep() is now always called with
the sxlock held exclusively.
- Universally hold the struct file lock over changes to struct file,
rather than the filedesc lock or no lock. Always update the f_ops
field last. A further memory barrier is required here in the future
(discussed with jhb).
- Improve locking and reference management in linux_at(), which fails to
properly acquire vnode references before using vnode pointers. Annotate
improper use of vn_fullpath(), which will be replaced at a future date.
In fcntl(), we conservatively acquire an exclusive lock, even though in
some cases a shared lock may be sufficient, which should be revisited.
The dropping of the filedesc lock in fdgrowtable() is no longer required
as the sxlock can be held over the sleep operation; we should consider
removing that (pointed out by attilio).
Tested by: kris
Discussed with: jhb, kris, attilio, jeff
2007-04-04 09:11:34 +00:00
|
|
|
FILEDESC_XLOCK_ASSERT(fdp);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We shouldn't have to worry about new kevents appearing on fd
|
|
|
|
* since filedesc is locked.
|
|
|
|
*/
|
2013-09-13 19:50:50 +00:00
|
|
|
TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_LOCK(kq);
|
|
|
|
|
|
|
|
again:
|
|
|
|
influx = 0;
|
|
|
|
while (kq->kq_knlistsize > fd &&
|
|
|
|
(kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
|
|
|
|
if (kn->kn_status & KN_INFLUX) {
|
|
|
|
/* someone else might be waiting on our knote */
|
|
|
|
if (influx)
|
|
|
|
wakeup(kq);
|
|
|
|
kq->kq_state |= KQ_FLUXWAIT;
|
|
|
|
msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
kn->kn_status |= KN_INFLUX;
|
|
|
|
KQ_UNLOCK(kq);
|
2004-09-06 19:02:42 +00:00
|
|
|
if (!(kn->kn_status & KN_DETACHED))
|
|
|
|
kn->kn_fop->f_detach(kn);
|
2004-08-15 06:24:42 +00:00
|
|
|
knote_drop(kn, td);
|
|
|
|
influx = 1;
|
|
|
|
KQ_LOCK(kq);
|
|
|
|
}
|
|
|
|
KQ_UNLOCK_FLUX(kq);
|
|
|
|
}
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
static int
|
|
|
|
knote_attach(struct knote *kn, struct kqueue *kq)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
struct klist *list;
|
2002-01-13 11:58:06 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
|
|
|
|
KQ_OWNED(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
if (kn->kn_fop->f_isfd) {
|
|
|
|
if (kn->kn_id >= kq->kq_knlistsize)
|
|
|
|
return ENOMEM;
|
|
|
|
list = &kq->kq_knlist[kn->kn_id];
|
|
|
|
} else {
|
|
|
|
if (kq->kq_knhash == NULL)
|
|
|
|
return ENOMEM;
|
|
|
|
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
SLIST_INSERT_HEAD(list, kn, kn_link);
|
2004-08-15 06:24:42 +00:00
|
|
|
|
|
|
|
return 0;
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2005-10-12 17:51:31 +00:00
|
|
|
* knote must already have been detached using the f_detach method.
|
2004-08-15 06:24:42 +00:00
|
|
|
* no lock need to be held, it is assumed that the KN_INFLUX flag is set
|
|
|
|
* to prevent other removal.
|
2000-04-16 19:02:08 +00:00
|
|
|
*/
|
|
|
|
static void
|
2001-09-12 08:38:13 +00:00
|
|
|
knote_drop(struct knote *kn, struct thread *td)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
struct kqueue *kq;
|
2000-04-16 19:02:08 +00:00
|
|
|
struct klist *list;
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
kq = kn->kn_kq;
|
|
|
|
|
|
|
|
KQ_NOTOWNED(kq);
|
|
|
|
KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
|
|
|
|
("knote_drop called without KN_INFLUX set in kn_status"));
|
|
|
|
|
|
|
|
KQ_LOCK(kq);
|
2000-04-16 19:02:08 +00:00
|
|
|
if (kn->kn_fop->f_isfd)
|
2004-08-15 06:24:42 +00:00
|
|
|
list = &kq->kq_knlist[kn->kn_id];
|
2000-04-16 19:02:08 +00:00
|
|
|
else
|
2004-08-15 06:24:42 +00:00
|
|
|
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
|
2000-04-16 19:02:08 +00:00
|
|
|
|
2005-10-12 17:51:31 +00:00
|
|
|
if (!SLIST_EMPTY(list))
|
|
|
|
SLIST_REMOVE(list, kn, knote, kn_link);
|
2000-04-16 19:02:08 +00:00
|
|
|
if (kn->kn_status & KN_QUEUED)
|
|
|
|
knote_dequeue(kn);
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_UNLOCK_FLUX(kq);
|
|
|
|
|
|
|
|
if (kn->kn_fop->f_isfd) {
|
|
|
|
fdrop(kn->kn_fp, td);
|
|
|
|
kn->kn_fp = NULL;
|
|
|
|
}
|
|
|
|
kqueue_fo_release(kn->kn_kevent.filter);
|
|
|
|
kn->kn_fop = NULL;
|
2000-04-16 19:02:08 +00:00
|
|
|
knote_free(kn);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knote_enqueue(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct kqueue *kq = kn->kn_kq;
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_OWNED(kn->kn_kq);
|
2000-05-04 20:19:17 +00:00
|
|
|
KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
|
|
|
|
|
2004-08-12 18:06:21 +00:00
|
|
|
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_status |= KN_QUEUED;
|
|
|
|
kq->kq_count++;
|
|
|
|
kqueue_wakeup(kq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knote_dequeue(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct kqueue *kq = kn->kn_kq;
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
KQ_OWNED(kn->kn_kq);
|
2000-05-04 20:19:17 +00:00
|
|
|
KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
|
|
|
|
|
2004-08-12 18:06:21 +00:00
|
|
|
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
|
2000-04-16 19:02:08 +00:00
|
|
|
kn->kn_status &= ~KN_QUEUED;
|
|
|
|
kq->kq_count--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knote_init(void)
|
|
|
|
{
|
2004-08-15 06:24:42 +00:00
|
|
|
|
2002-03-20 04:09:59 +00:00
|
|
|
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
|
|
|
|
NULL, NULL, UMA_ALIGN_PTR, 0);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2008-03-16 10:58:09 +00:00
|
|
|
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
|
2000-04-16 19:02:08 +00:00
|
|
|
|
|
|
|
static struct knote *
|
2004-08-15 06:24:42 +00:00
|
|
|
knote_alloc(int waitok)
|
2000-04-16 19:02:08 +00:00
|
|
|
{
|
2015-09-01 13:21:32 +00:00
|
|
|
|
|
|
|
return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
|
|
|
|
M_ZERO));
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
knote_free(struct knote *kn)
|
|
|
|
{
|
2015-09-01 13:21:32 +00:00
|
|
|
|
|
|
|
uma_zfree(knote_zone, kn);
|
2000-04-16 19:02:08 +00:00
|
|
|
}
|
2006-09-24 04:47:47 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register the kev w/ the kq specified by fd.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
|
|
|
|
{
|
|
|
|
struct kqueue *kq;
|
|
|
|
struct file *fp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Because one bit is used as the array index, those functions can assert that no
two rights belonging to different array elements are provided together in a
single argument. For example, this is illegal and will be detected, because
CAP_LOOKUP belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belong to the same array element this way is
correct, but is not advised. It should only be used for alias definitions.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2006-09-24 04:47:47 +00:00
|
|
|
int error;
|
|
|
|
|
2013-11-15 19:55:35 +00:00
|
|
|
error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t type represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides room for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain the total
number of elements in the array minus 2. This means that if those two bits are
equal to 0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain the array index. Only one bit is
set, and its position within this five-bit range defines the array index. This
means there can be at most five array elements in the future.
To define a new right, the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, e.g.:
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine a few rights, but the rights have to belong
to the same array element, e.g.:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is a new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights are passed to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions as a comma-separated
list, e.g.:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Because one bit is used as the array index, those functions can assert that no
two rights belonging to different array elements are provided together in a
single argument. For example, this is illegal and will be detected, because
CAP_LOOKUP belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belong to the same array element this way is
correct, but is not advised. It should only be used for alias definitions.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
if (error != 0)
|
2006-09-24 04:47:47 +00:00
|
|
|
return (error);
|
2007-05-27 19:24:00 +00:00
|
|
|
if ((error = kqueue_acquire(fp, &kq)) != 0)
|
|
|
|
goto noacquire;
|
2006-09-24 04:47:47 +00:00
|
|
|
|
|
|
|
error = kqueue_register(kq, kev, td, waitok);
|
|
|
|
|
|
|
|
kqueue_release(kq, 0);
|
|
|
|
|
2007-05-27 19:24:00 +00:00
|
|
|
noacquire:
|
2006-09-25 01:29:48 +00:00
|
|
|
fdrop(fp, td);
|
2006-09-24 04:47:47 +00:00
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|