Oscar Zhao 2019-04-19 15:33:07 -04:00
parent cb22931bab
commit cb4c673500
4 changed files with 650 additions and 124 deletions


@@ -115,17 +115,13 @@ extern struct cpu_group *cpu_top;
 } while(0)
 /* no need to handle overflow as long as the existing org/cur doesn't overflow */
-#define CALC_OVERTIME_AVG(org, cur, out) do { \
-    (out)->tv_sec = (org)->tv_sec * AVG_WEIGHT_FACTOR_OLD + (cur)->tv_nsec * AVG_WEIGHT_FACTOR_NEW; \
-    (out)->tv_nsec = (org)->tv_nsec * AVG_WEIGHT_FACTOR_OLD + (cur)->tv_nsec * AVG_WEIGHT_FACTOR_NEW; \
-    TIMESPEC_DIV((out), (AVG_WEIGHT_FACTOR_OLD + AVG_WEIGHT_FACTOR_NEW), (out)); \
-} while(0)
+#define CALC_OVERTIME_AVG(prev, cur) (((prev) * AVG_WEIGHT_FACTOR_OLD + (cur) * AVG_WEIGHT_FACTOR_NEW) / (AVG_WEIGHT_FACTOR_OLD + AVG_WEIGHT_FACTOR_NEW))
 #define KEVQ_NEXT_AVAIL_LOCKED(out, head, element, member) do { \
     (out) = (element); \
     while(1) { \
         (out) = (out) == NULL ? LIST_FIRST((head)) : LIST_NEXT((out), member); \
-        if ((out) != NULL) { \
+        if ((out) != NULL && KEVQ_AVAIL(out)) { \
             KEVQ_LOCK((out)); \
             if (KEVQ_AVAIL((out))) { \
                 break; \
@@ -147,7 +143,8 @@ static void kevq_init(struct kevq *kevq);
static void kevq_release(struct kevq* kevq, int locked);
static int kevq_acquire_kq(struct kqueue *kq, struct thread *td, struct kevq **kevqp);
static void kevq_destroy(struct kevq *kevq);
-static int kevq_acquire(struct kevq *kevq);
+static int kevq_acquire(struct kevq *kevq, int locked);
+static void kevq_worksteal(struct kevq *kevq);
void kevq_drain(struct kevq *kevq);
static void knote_xinit(struct knote *kn);
@@ -217,11 +214,12 @@ static void knote_sched(struct knote *kn);
static void kqdom_init(struct kqdom *kqd);
-static void kqdom_update_stats(struct kqdom *leaf, struct timespec *avg);
+static void kqdom_update_lat(struct kqdom *leaf, unsigned long avg);
+static void kqdom_update_active(struct kqdom *leaf, int change);
static void kqdom_insert(struct kqdom *kqd, struct kevq *kevq);
static void kqdom_remove(struct kqdom *kqd, struct kevq *kevq);
static void kqdom_destroy(struct kqdom *root);
-static void kqdom_update_stats(struct kqdom *leaf, struct timespec *avg);
+static struct kevq * kqdom_random_kevq_locked(struct kqdom *kqd, struct kevq *last_kevq);
static void kqdom_build_internal(struct kqdom *kqd_cur, struct cpu_group *cg_cur, int *kqd_id);
static struct kqdom * kqdom_build(void);
static struct kqdom * kqdom_find(struct kqdom *root, int cpuid);
@@ -246,6 +244,26 @@ static int filt_user(struct knote *kn, long hint);
static void filt_usertouch(struct knote *kn, struct kevent *kev,
    u_long type);
static int kq_sched_bon_count = 2;
SYSCTL_INT(_kern, OID_AUTO, kq_sched_bon_count, CTLFLAG_RWTUN, &kq_sched_bon_count, 0, "the number of kevqs to select the best one from");
/* TODO: make this a percentage? */
static int kq_sched_ws_count = 1;
SYSCTL_INT(_kern, OID_AUTO, kq_sched_ws_count, CTLFLAG_RWTUN, &kq_sched_ws_count, 0, "the number of kevqs to steal each time");
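Both knobs are declared RWTUN, so they can be adjusted while the system is running. A minimal userspace sketch of tuning the work-stealing batch size follows; it is illustrative only, assumes the patched kernel exposes the sysctls above under kern.*, and omits everything except the basic call:

/* Illustrative sketch: adjust kern.kq_sched_ws_count at runtime.
 * Requires the patched kernel and sufficient privileges. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>

static void
set_ws_count(int count)
{
    if (sysctlbyname("kern.kq_sched_ws_count", NULL, NULL,
        &count, sizeof(count)) == -1)
        err(1, "sysctlbyname(kern.kq_sched_ws_count)");
}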
// hacky fast random generator
static unsigned int g_seed = 0x1234;
// Used to seed the generator.
static void kqueue_fsrand(int seed) {
g_seed = seed;
}
// Compute a pseudorandom integer.
// Output value in range [0, 32767]
static int kqueue_frand(void) {
g_seed = (214013 * g_seed + 2531011);
return (g_seed>>16) & 0x7FFF;
}
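This is the classic 16-bit LCG trick; the scheduler only needs a cheap source of indices and deliberately tolerates the unsynchronized g_seed update, so no locking or quality guarantees are attempted. A hedged illustration of the intended use, mirroring the modulo picks in kqdom_random_kevq_locked() further down (the slight modulo bias is irrelevant here):

/* Illustration only: pick a pseudo-random slot in [0, n), n > 0,
 * the way the scheduler chooses a child kqdom or kevq. */
static int
pick_random_slot(int n)
{
    return kqueue_frand() % n;
}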
static struct filterops file_filtops = {
    .f_isfd = 1,
    .f_attach = filt_fileattach,
@@ -547,7 +565,7 @@ filt_kqueue(struct knote *kn, long hint)
    CTR1(KTR_KQ, "filt_kqueue called for kn %p", kn);

-   if ( (kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
+   if ( (kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
        return 0;
    }
@@ -1407,8 +1425,9 @@ kqueue_kevent(struct kqueue *kq, struct kevq *kevq, struct thread *td, int nchan
    struct kevent *kevp, *changes;
    struct timespec cur_ts;
    int i, n, nerrors, error;
+   unsigned long avg;

-   if ((kq->kq_state & KQ_FLAG_MULTI) == 0 && (kevq->kevq_state & KEVQ_RDY) == 0) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == 0 && (kevq->kevq_state & KEVQ_RDY) == 0) {
        /* Mark the global kevq as ready for single threaded mode to close the window between
           kqueue_register and kqueue_scan.*/
        KEVQ_LOCK(kevq);
@@ -1421,22 +1440,24 @@ kqueue_kevent(struct kqueue *kq, struct kevq *kevq, struct thread *td, int nchan
    if (kevq->kevq_last_nkev != 0)
    {
        /* make sure we actually processed events last time */
        getnanouptime(&cur_ts);
        timespecsub(&cur_ts, &kevq->kevq_last_kev, &cur_ts);
        /* divide by the number of events processed */
-       TIMESPEC_DIV(&cur_ts, kevq->kevq_last_nkev, &cur_ts);
-       if (timespecisset(&kevq->kevq_avg_lat)) {
-           CALC_OVERTIME_AVG(&kevq->kevq_avg_lat, &kevq->kevq_avg_lat, &cur_ts);
+       avg = (cur_ts.tv_sec * 1000000 + cur_ts.tv_nsec / 100) / kevq->kevq_last_nkev;
+       if (kevq->kevq_avg_lat != 0) {
+           kevq->kevq_avg_lat = CALC_OVERTIME_AVG(kevq->kevq_avg_lat, avg);
        } else {
-           /* first time */
-           timespecadd(&cur_ts, &kevq->kevq_avg_lat, &kevq->kevq_avg_lat);
+           kevq->kevq_avg_lat = avg;
        }
-       CTR4(KTR_KQ, "kevent: td %d spent %ld s %ld ns per event on %d events", td->td_tid, cur_ts.tv_sec, cur_ts.tv_nsec, kevq->kevq_last_nkev);
+       CTR3(KTR_KQ, "kevent: td %d spent %ld us per event on %d events", td->td_tid, avg, kevq->kevq_last_nkev);
        /* clear parameters */
        timespecclear(&kevq->kevq_last_kev);
        kevq->kevq_last_nkev = 0;
+       kqdom_update_lat(kevq->kevq_kqd, avg);
    }
    KEVQ_UNLOCK(kevq);
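Since CALC_OVERTIME_AVG is now a plain integer expression, the latency update above can be illustrated on its own. The sketch below is illustrative only: the weights (4 for the old average, 1 for the new sample) are assumed for the example, since the real AVG_WEIGHT_FACTOR_OLD/NEW constants are defined elsewhere in kern_event.c.

/* Standalone sketch of how kevq_avg_lat evolves; assumed weights 4:1. */
#include <stdio.h>

#define AVG_WEIGHT_FACTOR_OLD 4    /* assumption for illustration */
#define AVG_WEIGHT_FACTOR_NEW 1    /* assumption for illustration */
#define CALC_OVERTIME_AVG(prev, cur) (((prev) * AVG_WEIGHT_FACTOR_OLD + \
    (cur) * AVG_WEIGHT_FACTOR_NEW) / (AVG_WEIGHT_FACTOR_OLD + AVG_WEIGHT_FACTOR_NEW))

int
main(void)
{
    unsigned long avg = 0;
    unsigned long samples[] = { 100, 100, 500, 100 };  /* per-event latencies */

    for (int i = 0; i < 4; i++) {
        /* the first sample seeds the average, later samples are blended in */
        avg = (avg == 0) ? samples[i] : CALC_OVERTIME_AVG(avg, samples[i]);
        printf("sample %lu -> avg %lu\n", samples[i], avg);
    }
    /* avg moves 100 -> 100 -> 180 -> 164: a spike nudges the estimate but
     * does not replace it, matching the kevq_avg_lat update above. */
    return 0;
}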
@@ -1908,7 +1929,7 @@ kevq_thred_drain(struct kevq_thred *kevq_th, struct thread* td) {
    KEVQ_TH_LOCK(kevq_th);
    while((kevq = LIST_FIRST(&kevq_th->kevq_list)) != NULL) {
-       if (kevq_acquire(kevq) == 0) {
+       if (kevq_acquire(kevq, 0) == 0) {
            CTR2(KTR_KQ, "kevq_thred_drain: draining kevq %p on kevq_th %p", kevq, kevq_th);
            KEVQ_TH_UNLOCK(kevq_th);
            kevq_drain(kevq);
@@ -1924,7 +1945,6 @@ static void
kevq_init(struct kevq *kevq) {
    mtx_init(&kevq->lock, "kevq", NULL, MTX_DEF | MTX_DUPOK);
    TAILQ_INIT(&kevq->kn_head);
-   timespecclear(&kevq->kevq_avg_lat);
    timespecclear(&kevq->kevq_last_kev);
}
@@ -1944,19 +1964,25 @@ kevq_release(struct kevq* kevq, int locked)
}

static int
-kevq_acquire(struct kevq *kevq)
+kevq_acquire(struct kevq *kevq, int locked)
{
-   KEVQ_NOTOWNED(kevq);
    int error;
+   if (locked) {
+       KEVQ_OWNED(kevq);
+   } else {
+       KEVQ_LOCK(kevq);
+   }
    error = 0;
-   KEVQ_LOCK(kevq);
    CTR2(KTR_KQ, "referencing kevq %p (refcnt = %d)", kevq, kevq->kevq_refcnt);
    if ((kevq->kevq_state & KEVQ_CLOSING) == KEVQ_CLOSING) {
        error = EINVAL;
    } else {
        kevq->kevq_refcnt++;
    }
-   KEVQ_UNLOCK(kevq);
+   if (!locked) {
+       KEVQ_UNLOCK(kevq);
+   }
    return error;
}
@@ -1982,7 +2008,7 @@ kevq_acquire_kq(struct kqueue *kq, struct thread *td, struct kevq **kevqp)
        return EINVAL;
    }

-   if ((kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
        // allocate KEVQ_TH
        if (td->td_kevq_thred == NULL) {
            kevq_th = malloc(sizeof(struct kevq_thred), M_KQUEUE, M_WAITOK | M_ZERO);
@@ -2043,13 +2069,20 @@ kevq_acquire_kq(struct kqueue *kq, struct thread *td, struct kevq **kevqp)
            LIST_INSERT_HEAD(&kq->kq_kevqlist, kevq, kq_e);
+           KEVQ_TH_UNLOCK(kevq_th);
+           KQ_UNLOCK(kq);
            kqdom_insert(kqd, kevq);
+           KQD_UNLOCK(kqd);
        } else {
            to_free = alloc_kevq;
+           KEVQ_TH_UNLOCK(kevq_th);
+           KQD_UNLOCK(kqd);
+           KQ_UNLOCK(kq);
        }
-       KEVQ_TH_UNLOCK(kevq_th);
-       KQD_UNLOCK(kqd);
-       KQ_UNLOCK(kq);

        if (to_free != NULL) {
            free(to_free, M_KQUEUE);
@@ -2081,7 +2114,7 @@ kevq_acquire_kq(struct kqueue *kq, struct thread *td, struct kevq **kevqp)
        }
    }

-   error = kevq_acquire(kevq);
+   error = kevq_acquire(kevq, 0);

    if (!error) {
        *kevqp = kevq;
@@ -2105,13 +2138,13 @@ kqueue_acquire(struct file *fp, struct kqueue **kqp)
    if (((kq->kq_state) & KQ_CLOSING) != 0) {
        return (EBADF);
    }
-   if ((kq->kq_state & KQ_FLAG_INIT) == 0) {
-       kq->kq_state |= KQ_FLAG_INIT;
+   if ((kq->kq_flags & KQ_FLAG_INIT) == 0) {
+       kq->kq_flags |= KQ_FLAG_INIT;
    }
    kq->kq_refcnt++;
    KQ_UNLOCK(kq);

-   if (((kq->kq_state & KQ_FLAG_MULTI) != 0) && (kq->kq_kqd == NULL)) {
+   if (((kq->kq_flags & KQ_FLAG_MULTI) != 0) && (kq->kq_kqd == NULL)) {
        kqd = kqdom_build();
        KQ_LOCK(kq);
        if (kq->kq_kqd == NULL) {
@@ -2230,82 +2263,142 @@ static void
kqdom_init(struct kqdom *kqd)
{
    mtx_init(&kqd->kqd_lock, "kqdom_lock", NULL, MTX_DEF | MTX_DUPOK);
-   LIST_INIT(&kqd->kqd_kevqlist);
-   TAILQ_INIT(&kqd->children);
}
/* inserts a list*/
static void
kqdom_insert(struct kqdom *kqd, struct kevq *kevq)
{
-   int val;
-   struct kqdom* parent;
+   int oldcap;
+   struct kevq **expand;

    KQD_OWNED(kqd);
    KASSERT(kqd->num_children == 0, ("inserting into a non-leaf kqdom"));
-   LIST_INSERT_HEAD(&kqd->kqd_kevqlist, kevq, kqd_e);
-   /* TODO: don't hold the lock while doing the update */
-   parent = kqd;
-   while(parent != NULL) {
-       val = atomic_fetchadd_int(&parent->num_kevq, 1);
-       KASSERT(val >= 0, ("invalid num_kevq for kqdom <= 0"));
-       parent = parent->parent;
-   }
+   CTR4(KTR_KQ, "kqdom_insert: kevq: %p kqd %d: cnt: %d cap: %d", kevq, kqd->id, kqd->kqd_kevqcnt, kqd->kqd_kevqcap);
+
+retry:
+   if (kqd->kqd_kevqcnt + 1 > kqd->kqd_kevqcap) {
+       CTR2(KTR_KQ, "kqdom_insert: expanding... kqd %d for kevq %p\n", kqd->id, kevq);
+       oldcap = kqd->kqd_kevqcap;
+       KQD_UNLOCK(kqd);
+       expand = malloc(sizeof(struct kqdom *) * (oldcap + KQDOM_EXTENT_FACTOR), M_KQUEUE, M_WAITOK | M_ZERO);
+       KQD_LOCK(kqd);
+       /* recheck if we need expansion, make sure old capacity didn't change */
+       if (kqd->kqd_kevqcap == oldcap) {
+           /* copy the content from the old list to this */
+           for(int i = 0; i < kqd->kqd_kevqcnt; i++) {
+               expand[i] = kqd->kqd_kevqlist[i];
+           }
+           free(kqd->kqd_kevqlist, M_KQUEUE);
+           kqd->kqd_kevqlist = expand;
+           kqd->kqd_kevqcap = oldcap + KQDOM_EXTENT_FACTOR;
+       } else {
+           /* some threads made changes while we allocated memory, retry */
+           free(expand, M_KQUEUE);
+           goto retry;
+       }
+   }
+
+   KQD_OWNED(kqd);
+   KASSERT(kqd->kqd_kevqcnt + 1 <= kqd->kqd_kevqcap, ("kqdom didn't expand properly"));
+   /* insert to list */
+   kqd->kqd_kevqlist[kqd->kqd_kevqcnt] = kevq;
+   kqd->kqd_kevqcnt++;
}
/* removes a list */
static void
kqdom_remove(struct kqdom *kqd, struct kevq *kevq)
{
-   int val;
-   struct kqdom* parent;
+   int found;

    KQD_OWNED(kqd);
    KASSERT(kqd->num_children == 0, ("removing from a non-leaf kqdom"));
-   LIST_REMOVE(kevq, kqd_e);
-   /* TODO: don't hold the lock while doing the update */
-   parent = kqd;
-   while(parent != NULL) {
-       val = atomic_fetchadd_int(&parent->num_kevq, -1);
-       KASSERT(val >= 0, ("invalid num_kevq for kqdom <= 0"));
-       parent = parent->parent;
-   }
-   if (kqd->kqd_ckevq == kevq) {
-       kqd->kqd_ckevq = LIST_NEXT(kevq, kqd_e);
-   }
+   CTR4(KTR_KQ, "kqdom_remove: kevq: %p kqd %d: cnt: %d cap: %d", kevq, kqd->id, kqd->kqd_kevqcnt, kqd->kqd_kevqcap);
+   found = 0;
+
+   /* slow, but no need to optimize for delete */
+   for(int i = 0; i < kqd->kqd_kevqcnt; i++) {
+       if(kqd->kqd_kevqlist[i] == kevq) {
+           found = 1;
+       }
+       if(found && (i+1 < kqd->kqd_kevqcnt)) {
+           kqd->kqd_kevqlist[i] = kqd->kqd_kevqlist[i+1];
+       }
+   }
+
+   KASSERT(found, ("cannot find kevq from kqdom"));
+
+   kqd->kqd_kevqcnt--;
+   kqd->kqd_kevqlist[kqd->kqd_kevqcnt] = NULL;
+   if (kqd->kqd_kevqcnt != 0)
+       kqd->kqd_ckevq = kqd->kqd_ckevq % kqd->kqd_kevqcnt;
+   else
+       kqd->kqd_ckevq = 0;
}
static void
kqdom_destroy(struct kqdom *root)
{
-   struct kqdom *kqdom, *tkqd;
-   TAILQ_FOREACH_SAFE(kqdom, &root->children, child_e, tkqd) {
-       kqdom_destroy(kqdom);
+   for(int i = 0; i < root->num_children; i++) {
+       kqdom_destroy(root->children[i]);
    }

    CTR2(KTR_KQ, "kqdom_destroy: destroyed kqdom %p with %d child kqdoms", root, root->num_children);
-   KASSERT(LIST_FIRST(&root->kqd_kevqlist) == NULL, ("freeing a kqdom with kevqs"));
+   if (root->kqd_kevqlist != NULL) {
+       KASSERT(root->kqd_kevqcnt == 0, ("freeing a kqdom with kevqs"));
+       free(root->kqd_kevqlist, M_KQUEUE);
+   }
+
+   if (root->children != NULL) {
+       free(root->children, M_KQUEUE);
+   }
+
+   KASSERT(root->num_active == 0, ("freeing a kqdom with active kevqs"));

    free(root, M_KQUEUE);
}
static void
-kqdom_update_stats(struct kqdom *leaf, struct timespec *avg)
+kqdom_update_active(struct kqdom *leaf, int change)
{
-   struct timespec last_avg;
-   last_avg.tv_sec = avg->tv_sec;
-   last_avg.tv_nsec = avg->tv_nsec;
+   int oldval, newval;
+
+   KASSERT(change != 0, ("updating active 0"));
+
+   while (leaf != NULL) {
+       oldval = atomic_fetchadd_int(&leaf->num_active, change);
+       newval = oldval + change;
+       KASSERT(oldval >= 0 && newval >= 0, ("invalid oldval or newval after update"));
+
+       if (oldval == 0) {
+           change = 1;
+           CTR3(KTR_KQ, "kqdom_update_active: change %d: num of active %d for kqdom %d", change, newval, leaf->id);
+       } else if (newval == 0) {
+           /* if new val is 0, we */
+           change = -1;
+           CTR3(KTR_KQ, "kqdom_update_active: change %d: num of active %d for kqdom %d", change, newval, leaf->id);
+       } else {
+           break;
+       }
+
+       leaf = leaf->parent;
+   }
+}
+
+static void
+kqdom_update_lat(struct kqdom *leaf, unsigned long avg)
+{
    while(leaf != NULL) {
-       KQD_LOCK(leaf);
-
-       CALC_OVERTIME_AVG(&leaf->kqd_avg_lat, &last_avg, &leaf->kqd_avg_lat);
-       CTR3(KTR_KQ, "kqdom_update_stats: updated avg lat %ld sec %ld for kqdom %d",
-           leaf->kqd_avg_lat.tv_sec, leaf->kqd_avg_lat.tv_nsec, leaf->id);
-       last_avg.tv_sec = leaf->kqd_avg_lat.tv_sec;
-       last_avg.tv_nsec = leaf->kqd_avg_lat.tv_nsec;
-       KQD_UNLOCK(leaf);
+       if (leaf->avg_lat != 0) {
+           // bit rot race here?
+           leaf->avg_lat = CALC_OVERTIME_AVG(leaf->avg_lat, avg);
+       } else {
+           leaf->avg_lat = avg;
+       }
+
+       CTR2(KTR_KQ, "kqdom_update_lat: updated avg lat %ld us for kqdom %d", leaf->avg_lat, leaf->id);
+
        leaf = leaf->parent;
    }
@@ -2325,6 +2418,7 @@ kqdom_build_internal(struct kqdom *kqd_cur, struct cpu_group *cg_cur, int *kqd_id)
    (*kqd_id)++;
    kqd_cur->num_children = cg_numchild;
    CPU_COPY(&cg_cur->cg_mask, &kqd_cur->cpu_mask);
+   kqd_cur->children = malloc(sizeof(struct kqdom *) * cg_numchild, M_KQUEUE, M_WAITOK | M_ZERO);

    for (int i = 0; i < cg_numchild; i++) {
        child = malloc(sizeof(struct kqdom), M_KQUEUE, M_WAITOK | M_ZERO);
@@ -2332,7 +2426,7 @@ kqdom_build_internal(struct kqdom *kqd_cur, struct cpu_group *cg_cur, int *kqd_id)
        child->parent = kqd_cur;

-       TAILQ_INSERT_TAIL(&kqd_cur->children, child, child_e);
+       kqd_cur->children[i] = child;
        kqdom_build_internal(child, &cg_cur->cg_child[i], kqd_id);
    }
}
@@ -2351,16 +2445,14 @@ kqdom_build()
static struct kqdom *
kqdom_find(struct kqdom *root, int cpuid)
{
-   struct kqdom *child, *tchild;
    if (root->num_children == 0) {
        KASSERT(CPU_ISSET(cpuid, &root->cpu_mask), ("kqdom_find: cpuid and cpumask mismatch"));
        return root;
    }

-   TAILQ_FOREACH_SAFE(child, &root->children, child_e, tchild) {
-       if (CPU_ISSET(cpuid, &child->cpu_mask)) {
-           return kqdom_find(child, cpuid);
+   for(int i = 0; i < root->num_children; i++) {
+       if(CPU_ISSET(cpuid, &root->children[i]->cpu_mask)) {
+           return kqdom_find(root->children[i], cpuid);
        }
    }
@@ -2463,6 +2555,89 @@ kqueue_task(void *arg, int pending)
    KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
}
static void
kevq_worksteal(struct kevq *kevq)
{
struct kevq *other_kevq;
struct kqueue *kq;
struct knote *ws_kn, *next_kn;
struct knlist *knl;
int ws_count, valid;
TAILQ_HEAD(, knote) kn_wsq;
KEVQ_OWNED(kevq);
TAILQ_INIT(&kn_wsq);
ws_count = 0;
kq = kevq->kq;
KEVQ_UNLOCK(kevq);
/* todo maybe from cur kqdomain instead of from root */
other_kevq = kqdom_random_kevq_locked(kq->kq_kqd, kevq);
if (other_kevq != NULL && other_kevq != kevq && other_kevq->kn_count > 0) {
CTR3(KTR_KQ, "kevq_worksteal: kevq %p selected kevq %p with %d knotes", kevq, other_kevq, other_kevq->kn_count);
ws_kn = TAILQ_FIRST(&other_kevq->kn_head);
while(ws_count < kq_sched_ws_count && ws_kn != NULL) {
KEVQ_OWNED(other_kevq);
next_kn = TAILQ_NEXT(ws_kn, kn_tqe);
/* don't care about markers */
if ((ws_kn->kn_status & KN_MARKER) != 0) {
goto end_loop;
}
KN_FLUX_LOCK(ws_kn);
/* ignore influx, inactive and disabled */
if (kn_in_flux(ws_kn) || (ws_kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_REQUEUE)) != (KN_ACTIVE)) {
KN_FLUX_UNLOCK(ws_kn);
goto end_loop;
}
knote_enter_flux(ws_kn);
KN_FLUX_UNLOCK(ws_kn);
/* Remove from the old kevq first, due to lock order */
knote_dequeue(ws_kn);
KEVQ_UNLOCK(other_kevq);
/* validate event */
knl = kn_list_lock(ws_kn);
valid = ws_kn->kn_fop->f_event(ws_kn, 0);
kn_list_unlock(knl);
if (valid) {
TAILQ_INSERT_TAIL(&kn_wsq, ws_kn, kn_wse);
ws_count++;
}
KEVQ_LOCK(other_kevq);
if (!valid) {
/* if not valid, return it to the previous queue */
knote_enqueue(ws_kn, other_kevq);
KN_LEAVE_FLUX_WAKEUP(ws_kn);
}
end_loop:
ws_kn = next_kn;
}
}
if (other_kevq != NULL) {
KEVQ_UNLOCK(other_kevq);
}
KEVQ_LOCK(kevq);
while(!TAILQ_EMPTY(&kn_wsq)) {
ws_kn = TAILQ_FIRST(&kn_wsq);
TAILQ_REMOVE(&kn_wsq, ws_kn, kn_wse);
knote_enqueue(ws_kn, kevq);
KN_LEAVE_FLUX_WAKEUP(ws_kn);
CTR4(KTR_KQ, "kevq_worksteal: kevq %p stole kn %p, ident: %d from kevq %p", kevq, ws_kn, ws_kn->kn_id, other_kevq);
}
}
/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are in flux.
@@ -2471,12 +2646,14 @@ static int
kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
    const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
+   struct kqueue *kq;
    struct kevent *kevp;
    struct knote *kn, *marker;
    struct knlist *knl;
    sbintime_t asbt, rsbt;
    int count, error, haskqglobal, influx, nkev, touch;

+   kq = kevq->kq;
    count = maxevents;
    nkev = 0;
    error = 0;
@@ -2517,11 +2694,20 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
    if ((kevq->kevq_state & KEVQ_RDY) == 0) {
        /* Mark the kevq as ready to receive events */
        kevq->kevq_state |= KEVQ_RDY;
+       kqdom_update_active(kevq->kevq_kqd, 1);
    }

retry:
    kevp = keva;
    CTR3(KTR_KQ, "kqueue_scan: td %d on kevq %p has %d events", td->td_tid, kevq, kevq->kn_count);

+   if ((kq->kq_flags & KQ_FLAG_MULTI) != 0 && (kq->kq_sched_flags & KQ_SCHED_WORK_STEALING) != 0 && kevq->kn_count == 0)
+   {
+       /* try work stealing */
+       kevq_worksteal(kevq);
+   }
+   KEVQ_OWNED(kevq);
+
    if (kevq->kn_count == 0) {
        if (asbt == -1) {
            error = EWOULDBLOCK;
@@ -2573,7 +2759,7 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
        TAILQ_REMOVE(&kevq->kn_head, kn, kn_tqe);
        CTR3(KTR_KQ, "kqueue_scan: td %d on kevq %p dequeued knote %p", td->td_tid, kevq, kn);
        if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
-           kn->kn_status &= ~KN_QUEUED;
+           kn->kn_status &= ~(KN_QUEUED | KN_REQUEUE);
            kevq->kn_count--;
            KN_FLUX_UNLOCK(kn);
            continue;
@@ -2592,7 +2778,7 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
            ("knote %p is unexpectedly in flux", kn));
        if ((kn->kn_flags & EV_DROP) == EV_DROP) {
-           kn->kn_status &= ~KN_QUEUED;
+           kn->kn_status &= ~(KN_QUEUED | KN_REQUEUE);
            knote_enter_flux(kn);
            kevq->kn_count--;
            KN_FLUX_UNLOCK(kn);
@@ -2605,7 +2791,7 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
            KEVQ_LOCK(kevq);
            continue;
        } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
-           kn->kn_status &= ~KN_QUEUED;
+           kn->kn_status &= ~(KN_QUEUED | KN_REQUEUE);
            knote_enter_flux(kn);
            kevq->kn_count--;
            KN_FLUX_UNLOCK(kn);
@@ -2633,12 +2819,12 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
            KEVQ_LOCK(kevq);
            KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
            kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
-               KN_SCAN);
+               KN_SCAN | KN_REQUEUE);
            knote_leave_flux_ul(kn);
            kevq->kn_count--;
            kn_list_unlock(knl);
            influx = 1;
-           CTR3(KTR_KQ, "kqueue_scan: kn %p not valid anymore for kevq %p, td %d", kn, kevq, td->td_tid);
+           CTR4(KTR_KQ, "kqueue_scan: kn %p, ident: %d not valid anymore for kevq %p, td %d", kn, kn->kn_id, kevq, td->td_tid);
            continue;
        }
        touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);
@@ -2659,10 +2845,15 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
            }
            if (kn->kn_flags & EV_DISPATCH)
                kn->kn_status |= KN_DISABLED;
-           kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+           kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_REQUEUE);
            kevq->kn_count--;
        } else {
-           CTR2(KTR_KQ, "kqueue_scan: requeued kn %p to kevq %p", kn, kevq);
+           /* This flag prevents a subtle work-stealing race: one thread gets an identifier
+              and returns; before it can process the event, another thread steals the knote
+              and processes the same fd, leaving the first thread with no data available.
+              Work stealing will not steal knotes with this flag set. */
+           kn->kn_status |= KN_REQUEUE;
+           CTR3(KTR_KQ, "kqueue_scan: requeued kn %p, ident: %d to kevq %p", kn, kn->kn_id, kevq);
            TAILQ_INSERT_TAIL(&kevq->kn_head, kn, kn_tqe);
        }
@@ -2771,11 +2962,11 @@ kqueue_ioctl(struct file *fp, u_long cmd, void *data,
    switch (cmd) {
    case FKQMULTI:
        KQ_LOCK(kq);
-       if ((kq->kq_state & KQ_FLAG_INIT) == KQ_FLAG_INIT) {
+       if ((kq->kq_flags & KQ_FLAG_INIT) == KQ_FLAG_INIT) {
            error = (EINVAL);
        } else {
            CTR2(KTR_KQ, "kqueue_ioctl: multi flag set for kq %p, scheduler flags: %d", kq, *(int*)data);
-           kq->kq_state |= (KQ_FLAG_INIT | KQ_FLAG_MULTI);
+           kq->kq_flags |= (KQ_FLAG_INIT | KQ_FLAG_MULTI);
            kq->kq_sched_flags = *(int*)data;
        }
        KQ_UNLOCK(kq);
@@ -2801,7 +2992,7 @@ kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
        return POLLERR;
    KQ_LOCK(kq);
-   if ((kq->kq_state & KQ_FLAG_MULTI) != KQ_FLAG_MULTI ) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) != KQ_FLAG_MULTI ) {
        revents = 0;
    } else {
        if (events & (POLLIN | POLLRDNORM)) {
@@ -2906,7 +3097,7 @@ kevq_drain(struct kevq *kevq)
        knote_dequeue(kn);
-       if ((kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI && (kq->kq_state & KQ_CLOSING) != KQ_CLOSING && (kn->kn_status & KN_MARKER) == 0) {
+       if ((kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI && (kq->kq_state & KQ_CLOSING) != KQ_CLOSING && (kn->kn_status & KN_MARKER) == 0) {
            KEVQ_UNLOCK(kevq);
            /* TODO: When we knote activate, if the ev has EV_CLEAR set, maybe we shouldn't activate the event
             * if there hasn't been activities on the fd
@@ -2937,7 +3128,7 @@ kevq_drain(struct kevq *kevq)
    // and will be dequeued later (kn->kn_kevq will be set to another valid kevq)
    //
-   if ((kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
        // drop from KQ Domain
        KQ_LOCK(kq);
        KQD_LOCK(kqd);
@@ -2949,6 +3140,9 @@ kevq_drain(struct kevq *kevq)
        LIST_REMOVE(kevq, kevq_th_e);

        // detach from kqdom
+       if((kevq->kevq_state & KEVQ_RDY) != 0) {
+           kqdom_update_active(kqd, -1);
+       }
        kqdom_remove(kqd, kevq);

        // detach from kqueue
@@ -3038,10 +3232,10 @@ kqueue_drain(struct kqueue *kq, struct kevq *kevq, struct thread *td)
    }

    // destroy kqdoms and kevqs
-   if ((kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
        while((kevq = LIST_FIRST(&kq->kq_kevqlist)) != NULL) {
            KQ_UNLOCK(kq);
-           if (kevq_acquire(kevq) == 0)
+           if (kevq_acquire(kevq, 0) == 0)
                kevq_drain(kevq);
            KQ_LOCK(kq);
        }
@@ -3099,7 +3293,7 @@ kqueue_close(struct file *fp, struct thread *td)
    int error;
    int filedesc_unlock;

-   if ((kq->kq_state & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == KQ_FLAG_MULTI) {
        // only acquire the kqueue lock here
        if ((error = kqueue_acquire(fp, &kq)))
            return error;
@@ -3691,6 +3885,89 @@ knote_drop_detached(struct knote *kn, struct thread *td)
    knote_free(kn);
}
static struct kevq *
kqdom_random_kevq_locked(struct kqdom* kqd, struct kevq* last_kevq)
{
struct kqdom *each_child, *last_child;
struct kevq *kevq, *each_kevq;
int num_active, init_idx;
u_long random;
/* fall-back with the last child in case there is a race */
last_child = NULL;
kevq = NULL;
while (kqd->num_children > 0) {
/* read once */
num_active = kqd->num_active;
if (num_active == 0) {
/* if we got to a child and now it doesn't have any active children, then return NULL
this happens either on the first loop or due to a race of kevq deletion */
return NULL;
}
random = kqueue_frand() % num_active;
KASSERT(random < kqd->num_children, ("more active children than total children"));
for(int i = 0; i < kqd->num_children; i++) {
each_child = kqd->children[i];
if (each_child->num_active > 0) {
/* if the child suits our need */
last_child = each_child;
if (random == 0) {
kqd = each_child;
break;
}
random--;
}
if (i == kqd->num_children) {
kqd = last_child;
}
}
}
if (kqd != NULL) {
CTR3(KTR_KQ, "kqdom_random_kevq_locked: selected kqd %d, # children %p, last_kevq %p", kqd->id, kqd->kqd_kevqcnt, last_kevq);
KQD_LOCK(kqd);
if (kqd->kqd_kevqcnt != 0) {
random = kqueue_frand() % kqd->kqd_kevqcnt;
init_idx = random;
each_kevq = kqd->kqd_kevqlist[random];
while(1) {
/* fast fail */
if (KEVQ_AVAIL(each_kevq) && each_kevq != last_kevq) {
KEVQ_LOCK(each_kevq);
if (KEVQ_AVAIL(each_kevq)) {
kevq = each_kevq;
break;
}
KEVQ_UNLOCK(each_kevq);
}
random = (random + 1) % kqd->kqd_kevqcnt;
if (random == init_idx) {
break;
}
each_kevq = kqd->kqd_kevqlist[random];
}
}
KQD_UNLOCK(kqd);
}
if (kevq != NULL) {
KEVQ_OWNED(kevq);
}
CTR2(KTR_KQ, "kqdom_random_kevq_locked: selected kevq %p, last_kevq %p", kevq, last_kevq);
return kevq;
}
/* select the next kevq based on knote and scheduler flags and locks the returned kevq */
static struct kevq *
@@ -3698,14 +3975,15 @@ knote_next_kevq(struct knote *kn)
{
    struct kqdom *kqd;
    struct kqueue *kq;
-   struct kevq *next_kevq;
+   struct kevq *next_kevq, *sel_kevq;
+   int cur_kevq;

    next_kevq = NULL;
    kq = kn->kn_kq;

    CTR1(KTR_KQ, "knote_next_kevq: processing kn %p", kn);

-   if ((kq->kq_state & KQ_FLAG_MULTI) == 0) {
+   if ((kq->kq_flags & KQ_FLAG_MULTI) == 0) {
        // single threaded mode, just return the current kevq
        KQ_LOCK(kn->kn_kq);
        if ((kq->kq_state & KQ_CLOSING) == 0)
@@ -3731,27 +4009,96 @@ knote_next_kevq(struct knote *kn)
        return next_kevq;
    }
-   if ((kq->kq_sched_flags & KQ_SCHED_QUEUE) != 0) {
-       if (kn->kn_kqd == NULL) {
-           /* the first time knote is queued, record the kqdom */
-           kn->kn_kqd = kqdom_find(kq->kq_kqd, PCPU_GET(cpuid));
-
-           KASSERT(kn->kn_kqd != NULL, ("knote scheduled on an unidentified CPU2"));
-           CTR2(KTR_KQ, "knote_next_kevq: [QUEUE] knote %p attached to kqdom id %d", kn, kn->kn_kqd->id);
-       }
-       kqd = kn->kn_kqd;
+   if ((kq->kq_sched_flags & KQ_SCHED_BEST_OF_N) != 0) {
+       kqd = kq->kq_kqd;
+       for(int i = 0; i < kq_sched_bon_count; i++) {
+           sel_kevq = kqdom_random_kevq_locked(kqd, next_kevq);
+           if (sel_kevq != NULL) {
+               KEVQ_OWNED(sel_kevq);
+
+               CTR2(KTR_KQ, "knote_next_kevq: [BON] selected random kevq %p for kn %p", sel_kevq, kn);
+
+               if (next_kevq == NULL && kevq_acquire(sel_kevq, 1) == 0) {
+                   next_kevq = sel_kevq;
+                   KEVQ_UNLOCK(sel_kevq);
+               } else {
+                   // compare their avg wait time
+                   // TODO: refactor the unlock pattern here
+                   if (sel_kevq->kevq_avg_lat * sel_kevq->kn_count < next_kevq->kevq_avg_lat * next_kevq->kn_count) {
+                       if (kevq_acquire(sel_kevq, 1) == 0) {
+                           KEVQ_UNLOCK(sel_kevq);
+                           kevq_release(next_kevq, 0);
+                           next_kevq = sel_kevq;
+                       } else {
+                           KEVQ_UNLOCK(sel_kevq);
+                       }
+                   } else {
+                       KEVQ_UNLOCK(sel_kevq);
+                   }
+               }
+
+               CTR2(KTR_KQ, "knote_next_kevq: [BON] current best kevq %p, avg wait time: %d", next_kevq, next_kevq->kevq_avg_lat * next_kevq->kn_count);
+           }
+       }
+
+       if (next_kevq != NULL) {
+           KEVQ_LOCK(next_kevq);
+           kevq_release(next_kevq, 1);
+           // recheck availability
+           if (!KEVQ_AVAIL(next_kevq)) {
+               KEVQ_UNLOCK(next_kevq);
+               next_kevq = NULL;
+           }
+       }
+
+       CTR2(KTR_KQ, "knote_next_kevq: [BON] next kevq %p for kn %p", next_kevq, kn);
+   }
+
+   if ((next_kevq == NULL) && (kq->kq_sched_flags & KQ_SCHED_QUEUE) != 0) {
+       if((kq->kq_sched_flags & KQ_SCHED_QUEUE_CPU) != 0) {
+           kqd = kqdom_find(kq->kq_kqd, PCPU_GET(cpuid));
+       } else {
+           if (kn->kn_kqd == NULL) {
+               /* the first time knote is queued, record the kqdom */
+               kn->kn_kqd = kqdom_find(kq->kq_kqd, PCPU_GET(cpuid));
+
+               KASSERT(kn->kn_kqd != NULL, ("knote scheduled on an unidentified CPU2"));
+               CTR2(KTR_KQ, "knote_next_kevq: [QUEUE] knote %p attached to kqdom id %d", kn, kn->kn_kqd->id);
+           }
+           kqd = kn->kn_kqd;
+       }

        KQD_LOCK(kqd);
+       cur_kevq = kqd->kqd_ckevq;

-       KEVQ_NEXT_AVAIL_LOCKED(next_kevq, &kqd->kqd_kevqlist, kqd->kqd_ckevq, kqd_e);
-       kqd->kqd_ckevq = next_kevq;
+       while(1) {
+           if (kqd->kqd_kevqcnt == 0) {
+               break;
+           }
+           cur_kevq = (cur_kevq + 1) % kqd->kqd_kevqcnt;
+           next_kevq = kqd->kqd_kevqlist[cur_kevq];
+           if (KEVQ_AVAIL(next_kevq)) {
+               /* fast fail */
+               KEVQ_LOCK(next_kevq);
+               if (KEVQ_AVAIL(next_kevq)) {
+                   kqd->kqd_ckevq = cur_kevq;
+                   break;
+               }
+               KEVQ_UNLOCK(next_kevq);
+           }
+
+           if (cur_kevq == kqd->kqd_ckevq) {
+               next_kevq = NULL;
+               break;
+           }
+       }

        KQD_UNLOCK(kqd);
        CTR2(KTR_KQ, "knote_next_kevq: [QUEUE] next kevq %p for kn %p", next_kevq, kn);
    }

-   // generic round-robbin
+   // fall-back round-robbin
    if (next_kevq == NULL) {
        KQ_LOCK(kq);

@@ -283,7 +283,11 @@ struct filterops {
/*
 * KQ scheduler flags
 */
-#define KQ_SCHED_QUEUE 0x1 /* make kq affinitize the knote depending on the cpu it's scheduled */
+#define KQ_SCHED_QUEUE 0x01 /* make kq affinitize the knote depending on the first cpu it's scheduled to */
+#define KQ_SCHED_QUEUE_CPU 0x02 /* make kq affinitize the knote depending on the runtime cpu it's scheduled to */
+#define KQ_SCHED_WORK_STEALING 0x04
+#define KQ_SCHED_BEST_OF_N 0x08
+#define KQ_SCHED_GREEDY 0x16
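These flags are what userspace hands to the FKQMULTI ioctl (handled in kqueue_ioctl() above and exercised in the test file below). A minimal sketch of enabling multi-threaded mode with a chosen policy follows; it is illustrative only, assumes the patched headers provide FKQMULTI, and omits cleanup:

/* Illustrative sketch: create a kqueue, switch it to multi-threaded mode,
 * and select a scheduling policy; the flag ends up in kq_sched_flags. */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <err.h>

static int
make_multi_kq(int sched_flags)
{
    int kqfd = kqueue();

    if (kqfd < 0)
        err(1, "kqueue");
    /* e.g. sched_flags = KQ_SCHED_WORK_STEALING or KQ_SCHED_BEST_OF_N */
    if (ioctl(kqfd, FKQMULTI, &sched_flags) == -1)
        err(1, "ioctl(FKQMULTI)");
    return kqfd;
}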
/*
 * An in-flux knote cannot be dropped from its kq while the kq is
@@ -299,6 +303,7 @@ struct knote {
    SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */
    struct knlist *kn_knlist; /* f_attach populated */
    TAILQ_ENTRY(knote) kn_tqe;
+   TAILQ_ENTRY(knote) kn_wse; /* for work stealing queue */
    struct kqueue *kn_kq; /* which kqueue we are on */
    struct kevq *kn_kevq; /* the kevq the knote is on */
    /* used by the scheduler */
@@ -316,6 +321,7 @@ struct knote {
#define KN_MARKER 0x20 /* ignore this knote */
#define KN_KQUEUE 0x40 /* this knote belongs to a kq */
#define KN_SCAN 0x100 /* flux set in kqueue_scan() */
+#define KN_REQUEUE 0x200 /* knote has triggered and is requeued to the current queue */
    int kn_fluxwait;
    int kn_influx;
    struct mtx kn_fluxlock;


@@ -40,9 +40,10 @@
#define KQ_NEVENTS 8 /* minimize copy{in,out} calls */
#define KQEXTENT 256 /* linear growth by this amount */
+#define KQDOM_EXTENT_FACTOR 8 /* linear growth by this amount */

struct kevq {
    LIST_ENTRY(kevq) kevq_th_e; /* entry into kevq_thred's hashtable */
-   LIST_ENTRY(kevq) kqd_e; /* entry into kqdomain */
    LIST_ENTRY(kevq) kq_e; /* entry into kq */
    LIST_ENTRY(kevq) kevq_th_tqe; /* entry into kevq_thred's kevq_list */
    struct kqueue *kq; /* the kq that the kevq belongs to */
@@ -58,42 +59,50 @@ struct kevq {
    int kevq_refcnt;

    /* Used by the scheduler */
-   struct timespec kevq_avg_lat;
+   unsigned long kevq_avg_lat;
    struct timespec kevq_last_kev;
    int kevq_last_nkev;
};

/* TODO: assumed that threads don't get rescheduled across cores */
struct kqdom {
+   /* static */
    struct mtx kqd_lock;
-   TAILQ_ENTRY(kqdom) child_e;
    struct kqdom *parent;
    int id;
-   struct timespec kqd_avg_lat;
    cpuset_t cpu_mask;
    int num_children;
-   int num_kevq;
-   TAILQ_HEAD(, kqdom) children;
-   struct kevqlist kqd_kevqlist; /* list of kevqs on the kdomain, only set for leaf domains*/
-   struct kevq *kqd_ckevq;
+   struct kqdom **children;
+
+   /* statistics */
+   unsigned long avg_lat;
+   int num_active; /* total number of active children below this node */
+
+   /* dynamic members*/
+   struct kevq **kqd_kevqlist; /* array list of kevqs on the kdomain, only set for leaf domains */
+   int kqd_kevqcap;
+   int kqd_kevqcnt;
+   int kqd_ckevq;
};
struct kqueue {
    struct mtx kq_lock;
    int kq_refcnt;
    struct selinfo kq_sel;
    int kq_state;
#define KQ_SEL 0x01
#define KQ_ASYNC 0x02
#define KQ_TASKSCHED 0x04 /* task scheduled */
#define KQ_TASKDRAIN 0x08 /* waiting for task to drain */
#define KQ_CLOSING 0x10
-#define KQ_FLAG_INIT 0x20 /* kqueue has been initialized. this flag is set after the first kevent structure is processed */
-#define KQ_FLAG_MULTI 0x40 /* Multi-threaded mode */
+   int kq_flags;
+#define KQ_FLAG_INIT 0x01 /* kqueue has been initialized. this flag is set after the first kevent structure is processed */
+#define KQ_FLAG_MULTI 0x02 /* Multi-threaded mode */
    TAILQ_ENTRY(kqueue) kq_list;
    struct sigio *kq_sigio;
    struct filedesc *kq_fdp;
    int kq_knlistsize; /* size of knlist */
    struct klist *kq_knlist; /* list of knotes */
    u_long kq_knhashmask; /* size of knhash */
    struct klist *kq_knhash; /* hash table for knotes */
@@ -105,7 +114,7 @@ struct kqueue {
    struct kevqlist kq_kevqlist; /* list of kevqs for fall-back round robbin */
    struct kqdom *kq_kqd; /* root domain */
    struct kevq *kq_ckevq; /* current kevq for multithreaded kqueue, used for round robbin */
    int kq_sched_flags; /* Scheduler flag for the KQ */
};

#endif /* !_SYS_EVENTVAR_H_ */


@@ -33,13 +33,18 @@
/*
 * KQ scheduler flags
 */
-#define KQ_SCHED_QUEUE 0x1 /* make kq affinitize the knote depending on the cpu it's scheduled */
+#define KQ_SCHED_QUEUE 0x01 /* make kq affinitize the knote depending on the first cpu it's scheduled to */
+#define KQ_SCHED_QUEUE_CPU 0x02 /* make kq affinitize the knote depending on the runtime cpu it's scheduled to */
+#define KQ_SCHED_WORK_STEALING 0x04
+#define KQ_SCHED_BEST_OF_N 0x08
+#define KQ_SCHED_GREEDY 0x16

//#define TEST_DEBUG

struct thread_info {
    pthread_t thrd;
    int can_crash;
+   int ws_master;
    pthread_mutex_t lock;
    int group_id;
    int evcnt;
@@ -89,7 +94,7 @@ socket_pop(int sockfd)
    /* Drain the read buffer, then make sure there are no more events. */
#ifdef TEST_DEBUG
-   printf("READ_M: popping the read buffer\n");
+   printf("READ_M: popping the read buffer of sock %d\n", sockfd);
#endif
    if (read(sockfd, &buf, 1) < 1)
        err(1, "read(2)");
@@ -432,6 +437,139 @@ test_socket_queue(void)
    success();
}
/***************************
* WS test
***************************/
#define SOCK_WS_CNT (1000)
volatile int ws_good = 0;
static void*
test_socket_ws_worker(void* args)
{
struct thread_info *info = (struct thread_info *) args;
char dat;
int ws_num = 0;
struct kevent *ret;
while (1) {
#ifdef TEST_DEBUG
printf("READ_M: thread %d waiting for events\n", info->tid);
#endif
ret = kevent_get(g_kqfd);
#ifdef TEST_DEBUG
printf("READ_M: thread %d woke up\n", info->tid);
#endif
dat = socket_pop(ret->ident);
free(ret);
if (info->ws_master == 0) {
/*if we are the master, wait for slave to signal us*/
while(!ws_good) {
usleep(500);
}
break;
} else {
ws_num++;
if (ws_num == SOCK_WS_CNT - 1) {
ws_good = 1;
break;
}
}
}
#ifdef TEST_DEBUG
printf("READ_M: thread %d exiting\n", info->tid);
#endif
pthread_exit(0);
}
int ws_sockfd[SOCK_WS_CNT][2];
static void
test_socket_ws()
{
struct kevent kev;
struct thread_info thrd_info[2];
const char *test_id = "[Multi][WS]kevent(evfilt)";
cpuset_t cpuset;
test_begin(test_id);
for (int i = 0; i < SOCK_WS_CNT; i++) {
/* Create a connected pair of full-duplex sockets for testing socket events */
if (socketpair(AF_UNIX, SOCK_STREAM, 0, &ws_sockfd[i][0]) < 0) {
err(1, "kevent_socket");
}
EV_SET(&kev, ws_sockfd[i][0], EVFILT_READ, EV_ADD, 0, 0, &ws_sockfd[i][0]);
if (kevent(g_kqfd, &kev, 1, NULL, 0, NULL) == -1) {
err(1, "kevent_ws_add");
}
}
srand(time(NULL));
#ifdef TEST_DEBUG
printf("READ_M: creating master thread...\n");
#endif
for (int i = 0; i < 1; i++) {
thrd_info[i].tid = i;
thrd_info[i].ws_master = i;
pthread_create(&thrd_info[i].thrd, NULL, test_socket_ws_worker, &thrd_info[i]);
CPU_ZERO(&cpuset);
CPU_SET(0, &cpuset);
if (pthread_setaffinity_np(thrd_info[i].thrd, sizeof(cpuset_t), &cpuset) < 0) {
err(1, "thread_affinity");
}
}
sleep(3);
for(int i = 0; i < SOCK_WS_CNT; i++) {
#ifdef TEST_DEBUG
printf("READ_M: pusing 1 packet to sock %d\n", i);
#endif
socket_push(ws_sockfd[i][1], '.');
}
sleep(1);
for(int i = 1; i < 2; i++) {
#ifdef TEST_DEBUG
printf("READ_M: creating slave thread...\n");
#endif
thrd_info[i].tid = i;
thrd_info[i].ws_master = i;
pthread_create(&thrd_info[i].thrd, NULL, test_socket_ws_worker, &thrd_info[i]);
CPU_ZERO(&cpuset);
CPU_SET(0, &cpuset);
if (pthread_setaffinity_np(thrd_info[i].thrd, sizeof(cpuset_t), &cpuset) < 0) {
err(1, "thread_affinity");
}
}
/* shutdown the systems */
#ifdef TEST_DEBUG
printf("READ_M: waiting for threads to exit...\n");
#endif
for (int i = 0; i < 2; i++) {
pthread_join(thrd_info[i].thrd, NULL);
}
for (int i = 0; i < SOCK_WS_CNT; i++) {
EV_SET(&kev, ws_sockfd[i][0], EVFILT_READ, EV_DELETE, 0, 0, &ws_sockfd[i][0]);
if (kevent(g_kqfd, &kev, 1, NULL, 0, NULL) == -1) {
err(1, "kevent_ws_delete");
}
}
success();
}
/***************************
 * Brutal test
@@ -465,6 +603,9 @@ test_socket_brutal_worker(void* args)
#endif

        if ((rand() % 100) < THREAD_EXIT_PROB) {
+#ifdef TEST_DEBUG
+           printf("READ_M: thread %d checking fake crash\n", info->tid);
+#endif
            pthread_mutex_lock(&info->lock);
#ifdef TEST_DEBUG
            printf("READ_M: thread %d trying to fake crash. Can crash: %d\n", info->tid, info->can_crash);
@@ -478,6 +619,10 @@
            pthread_mutex_unlock(&info->lock);
        }

+#ifdef TEST_DEBUG
+       printf("READ_M: thread %d ident: %ld\n", info->tid, ret->ident);
+#endif
+
        dat = socket_pop(ret->ident);
        free(ret);
@@ -529,6 +674,7 @@ test_socket_brutal()
        brute_threadinfo[i].tid = i;
        brute_threadinfo[i].evcnt = 0;
        brute_threadinfo[i].can_crash = ((i % 10) != 0);
+       pthread_mutex_init(&brute_threadinfo[i].lock, NULL);
        pthread_create(&brute_threadinfo[i].thrd, NULL, test_socket_brutal_worker, &brute_threadinfo[i]);
    }
@@ -574,6 +720,7 @@ test_socket_brutal()
    for (int i = 0; i < THREAD_BRUTE_CNT; i++) {
        pthread_join(brute_threadinfo[i].thrd, NULL);
+       pthread_mutex_destroy(&brute_threadinfo[i].lock);
    }

    for (int i = 0; i < SOCK_BRUTE_CNT; i++) {
@@ -612,4 +759,21 @@ test_evfilt_read_m()
    test_socket_brutal();

    close(g_kqfd);
flags = KQ_SCHED_BEST_OF_N;
g_kqfd = kqueue();
error = ioctl(g_kqfd, FKQMULTI, &flags);
test_socket_brutal();
close(g_kqfd);
flags = KQ_SCHED_WORK_STEALING;
g_kqfd = kqueue();
error = ioctl(g_kqfd, FKQMULTI, &flags);
test_socket_ws();
test_socket_brutal();
close(g_kqfd);
} }