Make knote KN_INFLUX state counted.

This is the final fix for the issue closed by r310302 for knote().

If the KN_INFLUX | KN_SCAN flags are set on the knote passed to
knote() or knote_fork(), i.e. the knote is being scanned, we might
erroneously clear KN_INFLUX when finishing the notification.  For
normal knote() this was fixed in r310302 simply by remembering that we
do not own KN_INFLUX, since there we hold the knlist lock and the scan
thread cannot clear KN_INFLUX until we drop that lock.  For
knote_fork() the situation is more complicated: we must drop the
knlist lock, which is the process lock, since we need to register new
knotes.

Change KN_INFLUX into a counter and allow shared ownership of the
in-flux state between the scan and knote_fork() or knote().  Both
in-flux setters need to ensure that the knote is not dropped in
parallel.  An assertion that kn_influx == 1, added to knote_drop(),
verifies that the in-flux state is not shared when the knote is
destroyed.
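
As a rough illustration of why a counter is needed here, the sketch
below models shared in-flux ownership in user space; struct note and
the enter_flux()/leave_flux() helpers are illustrative stand-ins, not
the kernel's kn_enter_flux()/kn_leave_flux().

/*
 * User-space sketch only: a single "in flux" bit cannot express two
 * concurrent owners (a scan and knote_fork()), but a counter can.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct note {
	int	influx;		/* counted, like kn_influx */
};

static void
enter_flux(struct note *n)
{
	n->influx++;
}

/* Return true when the last owner has left, mirroring the new helper. */
static bool
leave_flux(struct note *n)
{
	assert(n->influx > 0);
	return (--n->influx == 0);
}

int
main(void)
{
	struct note n = { 0 };

	enter_flux(&n);		/* the scan pins the note */
	enter_flux(&n);		/* knote_fork() pins it as well */

	/*
	 * With a single flag, whichever side finished first would clear
	 * it and the note could then be dropped under the other side;
	 * with a counter, only the final leave_flux() releases the note.
	 */
	printf("after fork: %s\n", leave_flux(&n) ? "free" : "in flux");
	printf("after scan: %s\n", leave_flux(&n) ? "free" : "in flux");
	return (0);
}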

Since the KBI of struct knote is changed by the addition of the int
kn_influx field, reorder kn_hook and kn_hookid to fill the padding on
LP64 arches [1].  This keeps sizeof(struct knote) at the same 128
bytes on amd64 as before the addition of kn_influx.
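
The saving from the reorder is alignment arithmetic: on LP64, grouping
the four ints (kn_hookid, kn_status, kn_influx, kn_sfflags) avoids
padding both before the 8-byte kn_sdata field and at the end of the
structure.  Below is a minimal user-space sketch of the effect; the
layouts are simplified stand-ins, not the real struct knote.

#include <stdio.h>

struct old_layout {		/* kn_influx appended naively */
	void	*fop;		/* 8 bytes on LP64 */
	int	 status;
	int	 influx;
	int	 sfflags;	/* three ints -> 4 bytes of padding ... */
	long	 sdata;		/* ... before this 8-byte field */
	void	*hook;
	int	 hookid;	/* trailing int -> 4 more bytes of padding */
};

struct new_layout {		/* kn_hook/kn_hookid moved up */
	void	*fop;
	void	*hook;
	int	 hookid;
	int	 status;
	int	 influx;
	int	 sfflags;	/* four ints pack into 16 bytes, no padding */
	long	 sdata;
};

int
main(void)
{
	/* On a typical LP64 ABI this prints 48 vs. 40. */
	printf("old %zu, new %zu\n",
	    sizeof(struct old_layout), sizeof(struct new_layout));
	return (0);
}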

Reviewed by:	markj
Suggested by:	markj [1]
Tested by:	pho (previous version)
Sponsored by:	The FreeBSD Foundation
Differential revision:	https://reviews.freebsd.org/D8898
Commit fd30dd7c26 by Konstantin Belousov, 2016-12-26 19:33:40 +00:00 (parent 8b590e9506)
2 changed files with 92 additions and 59 deletions


@ -193,7 +193,7 @@ static unsigned int kq_calloutmax = 4 * 1024;
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
/* XXX - ensure not KN_INFLUX?? */
/* XXX - ensure not influx ? */
#define KNOTE_ACTIVATE(kn, islock) do { \
if ((islock)) \
mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
@ -254,6 +254,32 @@ kn_list_unlock(struct knlist *knl)
}
}
static bool
kn_in_flux(struct knote *kn)
{
return (kn->kn_influx > 0);
}
static void
kn_enter_flux(struct knote *kn)
{
KQ_OWNED(kn->kn_kq);
MPASS(kn->kn_influx < INT_MAX);
kn->kn_influx++;
}
static bool
kn_leave_flux(struct knote *kn)
{
KQ_OWNED(kn->kn_kq);
MPASS(kn->kn_influx > 0);
kn->kn_influx--;
return (kn->kn_influx == 0);
}
#define KNL_ASSERT_LOCK(knl, islocked) do { \
if (islocked) \
KNL_ASSERT_LOCKED(knl); \
@ -498,7 +524,7 @@ knote_fork(struct knlist *list, int pid)
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
KQ_UNLOCK(kq);
continue;
}
@ -521,7 +547,7 @@ knote_fork(struct knlist *list, int pid)
* track the child. Drop the locks in preparation for
* the call to kqueue_register().
*/
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
list->kl_unlock(list->kl_lockarg);
@ -561,7 +587,7 @@ knote_fork(struct knlist *list, int pid)
if (kn->kn_fop->f_event(kn, NOTE_FORK))
KNOTE_ACTIVATE(kn, 0);
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
kn_leave_flux(kn);
KQ_UNLOCK_FLUX(kq);
list->kl_lock(list->kl_lockarg);
}
@ -1262,7 +1288,7 @@ findkn:
}
/* knote is in the process of changing, wait for it to stabilize. */
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
if (kn != NULL && kn_in_flux(kn)) {
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (filedesc_unlock) {
FILEDESC_XUNLOCK(td->td_proc->p_fd);
@ -1306,7 +1332,8 @@ findkn:
kn->kn_kevent = *kev;
kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
kn->kn_status = KN_INFLUX|KN_DETACHED;
kn->kn_status = KN_DETACHED;
kn_enter_flux(kn);
error = knote_attach(kn, kq);
KQ_UNLOCK(kq);
@ -1330,7 +1357,7 @@ findkn:
}
if (kev->flags & EV_DELETE) {
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@ -1348,7 +1375,8 @@ findkn:
* but doing so will not reset any filter which has already been
* triggered.
*/
kn->kn_status |= KN_INFLUX | KN_SCAN;
kn->kn_status |= KN_SCAN;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
knl = kn_list_lock(kn);
kn->kn_kevent.udata = kev->udata;
@ -1383,7 +1411,8 @@ done_ev_add:
if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
KN_ACTIVE)
knote_enqueue(kn);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
kn->kn_status &= ~KN_SCAN;
kn_leave_flux(kn);
kn_list_unlock(knl);
KQ_UNLOCK_FLUX(kq);
@ -1546,7 +1575,7 @@ kqueue_task(void *arg, int pending)
/*
* Scan, update kn_data (if not ONESHOT), and copyout triggered events.
* We treat KN_MARKER knotes as if they are INFLUX.
* We treat KN_MARKER knotes as if they are in flux.
*/
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
@ -1620,7 +1649,7 @@ retry:
kn = TAILQ_FIRST(&kq->kq_head);
if ((kn->kn_status == KN_MARKER && kn != marker) ||
(kn->kn_status & KN_INFLUX) == KN_INFLUX) {
kn_in_flux(kn)) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
@ -1643,17 +1672,17 @@ retry:
goto retry;
goto done;
}
KASSERT((kn->kn_status & KN_INFLUX) == 0,
("KN_INFLUX set when not suppose to be"));
KASSERT(!kn_in_flux(kn),
("knote %p is unexpectedly in flux", kn));
if ((kn->kn_flags & EV_DROP) == EV_DROP) {
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
kq->kq_count--;
KQ_UNLOCK(kq);
/*
* We don't need to lock the list since we've marked
* it _INFLUX.
* We don't need to lock the list since we've
* marked it as in flux.
*/
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@ -1662,12 +1691,12 @@ retry:
continue;
} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
kq->kq_count--;
KQ_UNLOCK(kq);
/*
* We don't need to lock the list since we've marked
* it _INFLUX.
* We don't need to lock the list since we've
* marked the knote as being in flux.
*/
*kevp = kn->kn_kevent;
if (!(kn->kn_status & KN_DETACHED))
@ -1676,7 +1705,8 @@ retry:
KQ_LOCK(kq);
kn = NULL;
} else {
kn->kn_status |= KN_INFLUX | KN_SCAN;
kn->kn_status |= KN_SCAN;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
@ -1684,9 +1714,9 @@ retry:
if (kn->kn_fop->f_event(kn, 0) == 0) {
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kn->kn_status &=
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
KN_SCAN);
kn_leave_flux(kn);
kq->kq_count--;
kn_list_unlock(knl);
influx = 1;
@ -1716,7 +1746,8 @@ retry:
} else
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
kn->kn_status &= ~KN_SCAN;
kn_leave_flux(kn);
kn_list_unlock(knl);
influx = 1;
}
@ -1864,12 +1895,12 @@ kqueue_drain(struct kqueue *kq, struct thread *td)
for (i = 0; i < kq->kq_knlistsize; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
if (kn_in_flux(kn)) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@ -1880,13 +1911,13 @@ kqueue_drain(struct kqueue *kq, struct thread *td)
if (kq->kq_knhashmask != 0) {
for (i = 0; i <= kq->kq_knhashmask; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
if (kn_in_flux(kn)) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK,
"kqclo2", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@ -2010,7 +2041,6 @@ knote(struct knlist *list, long hint, int lockflags)
struct kqueue *kq;
struct knote *kn, *tkn;
int error;
bool own_influx;
if (list == NULL)
return;
@ -2021,7 +2051,7 @@ knote(struct knlist *list, long hint, int lockflags)
list->kl_lock(list->kl_lockarg);
/*
* If we unlock the list lock (and set KN_INFLUX), we can
* If we unlock the list lock (and enter influx), we can
* eliminate the kqueue scheduling, but this will introduce
* four lock/unlock's for each knote to test. Also, marker
* would be needed to keep iteration position, since filters
@ -2030,7 +2060,7 @@ knote(struct knlist *list, long hint, int lockflags)
SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
/*
* Do not process the influx notes, except for
* the influx coming from the kq unlock in the
@ -2041,14 +2071,11 @@ knote(struct knlist *list, long hint, int lockflags)
*/
KQ_UNLOCK(kq);
} else if ((lockflags & KNF_NOKQLOCK) != 0) {
own_influx = (kn->kn_status & KN_INFLUX) == 0;
if (own_influx)
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
error = kn->kn_fop->f_event(kn, hint);
KQ_LOCK(kq);
if (own_influx)
kn->kn_status &= ~KN_INFLUX;
kn_leave_flux(kn);
if (error)
KNOTE_ACTIVATE(kn, 1);
KQ_UNLOCK_FLUX(kq);
@ -2070,10 +2097,12 @@ knote(struct knlist *list, long hint, int lockflags)
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
KNL_ASSERT_LOCK(knl, islocked);
KQ_NOTOWNED(kn->kn_kq);
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
(KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
KASSERT((kn->kn_status & KN_DETACHED) != 0,
("knote %p was not detached", kn));
if (!islocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
@ -2089,12 +2118,13 @@ static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
int kqislocked)
{
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
KNL_ASSERT_LOCK(knl, knlislocked);
mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
if (!kqislocked)
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
("knlist_remove called w/o knote being KN_INFLUX or already removed"));
KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
KASSERT((kn->kn_status & KN_DETACHED) == 0,
("knote %p was already detached", kn));
if (!knlislocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
@ -2287,30 +2317,30 @@ again: /* need to reacquire lock since we have dropped it */
SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX)) {
if (kn_in_flux(kn)) {
KQ_UNLOCK(kq);
continue;
}
knlist_remove_kq(knl, kn, 1, 1);
if (killkn) {
kn->kn_status |= KN_INFLUX | KN_DETACHED;
kn->kn_status |= KN_DETACHED;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
knote_drop(kn, td);
} else {
/* Make sure cleared knotes disappear soon */
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_flags |= EV_EOF | EV_ONESHOT;
KQ_UNLOCK(kq);
}
kq = NULL;
}
if (!SLIST_EMPTY(&knl->kl_list)) {
/* there are still KN_INFLUX remaining */
/* there are still in flux knotes remaining */
kn = SLIST_FIRST(&knl->kl_list);
kq = kn->kn_kq;
KQ_LOCK(kq);
KASSERT(kn->kn_status & KN_INFLUX,
("knote removed w/o list lock"));
KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
knl->kl_unlock(knl->kl_lockarg);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
@ -2352,7 +2382,7 @@ again:
influx = 0;
while (kq->kq_knlistsize > fd &&
(kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
if (kn->kn_status & KN_INFLUX) {
if (kn_in_flux(kn)) {
/* someone else might be waiting on our knote */
if (influx)
wakeup(kq);
@ -2360,7 +2390,7 @@ again:
msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
goto again;
}
kn->kn_status |= KN_INFLUX;
kn_enter_flux(kn);
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@ -2377,7 +2407,7 @@ knote_attach(struct knote *kn, struct kqueue *kq)
{
struct klist *list;
KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
KQ_OWNED(kq);
if (kn->kn_fop->f_isfd) {
@ -2395,7 +2425,7 @@ knote_attach(struct knote *kn, struct kqueue *kq)
/*
* knote must already have been detached using the f_detach method.
* no lock need to be held, it is assumed that the KN_INFLUX flag is set
* no lock need to be held, it is assumed that the influx state is set
* to prevent other removal.
*/
static void
@ -2407,10 +2437,10 @@ knote_drop(struct knote *kn, struct thread *td)
kq = kn->kn_kq;
KQ_NOTOWNED(kq);
KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
("knote_drop called without KN_INFLUX set in kn_status"));
KQ_LOCK(kq);
KASSERT(kn->kn_influx == 1,
("knote_drop called on %p with influx %d", kn, kn->kn_influx));
if (kn->kn_fop->f_isfd)
list = &kq->kq_knlist[kn->kn_id];
else


@ -202,8 +202,11 @@ struct filterops {
};
/*
* Setting the KN_INFLUX flag enables you to unlock the kq that this knote
* is on, and modify kn_status as if you had the KQ lock.
* An in-flux knote cannot be dropped from its kq while the kq is
* unlocked. If the KN_SCAN flag is not set, a thread can only set
* kn_influx when it is exclusive owner of the knote state, and can
* modify kn_status as if it had the KQ lock. KN_SCAN must not be set
* on a knote which is already in flux.
*
* kn_sfflags, kn_sdata, and kn_kevent are protected by the knlist lock.
*/
@ -214,16 +217,18 @@ struct knote {
TAILQ_ENTRY(knote) kn_tqe;
struct kqueue *kn_kq; /* which queue we are on */
struct kevent kn_kevent;
void *kn_hook;
int kn_hookid;
int kn_status; /* protected by kq lock */
#define KN_ACTIVE 0x01 /* event has been triggered */
#define KN_QUEUED 0x02 /* event is on queue */
#define KN_DISABLED 0x04 /* event is disabled */
#define KN_DETACHED 0x08 /* knote is detached */
#define KN_INFLUX 0x10 /* knote is in flux */
#define KN_MARKER 0x20 /* ignore this knote */
#define KN_KQUEUE 0x40 /* this knote belongs to a kq */
#define KN_HASKQLOCK 0x80 /* for _inevent */
#define KN_SCAN 0x100 /* flux set in kqueue_scan() */
int kn_influx;
int kn_sfflags; /* saved filter flags */
intptr_t kn_sdata; /* saved data field */
union {
@ -234,8 +239,6 @@ struct knote {
void *p_v; /* generic other pointer */
} kn_ptr;
struct filterops *kn_fop;
void *kn_hook;
int kn_hookid;
#define kn_id kn_kevent.ident
#define kn_filter kn_kevent.filter