diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index e1e3d68b0339..4178b118987a 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -170,7 +170,7 @@ static int	kevq_acquire(struct kevq *kevq, int locked);
 static void	kevq_worksteal(struct kevq *kevq);
 static void	kevq_drain(struct kevq *kevq, struct thread *td);
 static void	kevq_activate(struct kevq *kevq, struct thread *td);
-static struct kevq * kevq_vec_select_kevq(struct veclist *lst, int num_rand);
+static struct kevq * kevq_vec_select_kevq(struct veclist *lst, int num_rand, u_long rand);
 static struct knote * kevq_peek_knote(struct kevq *kevq);
 static inline void kevq_delete_knote(struct kevq *kevq, struct knote *kn);
 static void kevq_insert_knote(struct kevq *kevq, struct knote *kn);
@@ -250,7 +250,7 @@ static void	kqdom_update_parents(struct kqdom *leaf, int direction);
 static void	kqdom_insert(struct kqdom *kqd, struct kevq *kevq);
 static void	kqdom_remove(struct kqdom *kqd, struct kevq *kevq);
 static void	kqdom_destroy(struct kqdom *root);
-static struct kevq * kqdom_random_kevq_locked(struct kqdom *kqd);
+//static struct kevq * kqdom_random_kevq_locked(struct kqdom *kqd);
 static void	kqdom_build_internal(struct kqdom *kqd_cur, struct cpu_group *cg_cur, int *kqd_id);
 static struct kqdom * kqdom_build(void);
 static struct kqdom * kqdom_find(struct kqdom *root, int cpuid);
@@ -418,10 +418,52 @@ SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
 #define KQTUNE_PARSE_ARGS(sf)	(((sf) >> 16) & 0xFFFF)
 #define KQTUNE_PARSE_OBJ(sf)	((sf) & 0xFFFF)
 
+#define NSHUFF 50
+
+
+/*
+ * Pseudo-random number generator, used here for randomized kevq
+ * selection and work stealing.  The result is uniform on
+ * [1, 2^31 - 2].
+ */
+static u_long
+kqueue_random(u_long *seed)
+{
+	long x, hi, lo, t;
+
+	/*
+	 * Compute x[n + 1] = (7^5 * x[n]) mod (2^31 - 1).
+	 * From "Random number generators: good ones are hard to find",
+	 * Park and Miller, Communications of the ACM, vol. 31, no. 10,
+	 * October 1988, p. 1195.
+	 */
+	/* Can't be initialized with 0, so use another value. */
+	if ((x = *seed) == 0)
+		x = 123459876;
+	hi = x / 127773;
+	lo = x % 127773;
+	t = 16807 * lo - 2836 * hi;
+	if (t < 0)
+		t += 0x7fffffff;
+	*seed = t;
+	//CTR1(KTR_KQ, "kqueue_random: generated %ld", t);
+	return (t);
+}
+
+static void
+kqueue_srandom(u_long *field, u_long seed)
+{
+	int i;
+
+	*field = seed;
+	for (i = 0; i < NSHUFF; i++)
+		kqueue_random(field);
+}
+
 static inline long
 kevq_exp_lat(struct kevq *kevq)
 {
-	return kevq->kevq_avg_lat * (kevq_total_knote(kevq) + 1) + kevq->kevq_last_kev - get_cyclecount();
+	return kevq->kevq_avg_lat * (kevq_total_knote(kevq) + 1) + kevq->kevq_last_kev;
 }
 
 static inline long
@@ -2040,13 +2082,15 @@ kevq_init(struct kevq *kevq) {
 	TAILQ_INIT(&kevq->kn_head);
 	TAILQ_INIT(&kevq->kn_rt_head);
-	kevq->kn_marker.kn_status = KN_MARKER;
-	kevq->kn_marker_rt.kn_status = KN_MARKER;
-	kevq->kn_marker_rt.kn_flags = EV_REALTIME;
-	kevq->kn_marker.kn_kevq = kevq;
-	kevq->kn_marker_rt.kn_kevq = kevq;
-	mtx_init(&kevq->kn_marker.kn_fluxlock, "kn_fluxlock", NULL, MTX_DEF | MTX_DUPOK);
-	mtx_init(&kevq->kn_marker_rt.kn_fluxlock, "kn_fluxlock", NULL, MTX_DEF | MTX_DUPOK);
+	kevq->kn_marker = knote_alloc(M_WAITOK);
+	kevq->kn_marker_rt = knote_alloc(M_WAITOK);
+	kevq->kn_marker->kn_status = KN_MARKER;
+	kevq->kn_marker_rt->kn_status = KN_MARKER;
+	kevq->kn_marker_rt->kn_flags = EV_REALTIME;
+	kevq->kn_marker->kn_kevq = kevq;
+	kevq->kn_marker_rt->kn_kevq = kevq;
+
+	kqueue_srandom(&kevq->kevq_rand_seed, (u_long)kevq);
 }
 
 static void
@@ -2414,6 +2458,7 @@ kevq_dump(struct sbuf *buf, struct kevq *kevq, int level)
 	sbuf_printf(buf, "%*c\n", level * DUMP_INDENT, ' ',
 	    kevq,
 	    kevq->kn_count,
 	    kevq->kn_rt_count,
+	    kevq->kevq_avg_rlimit,
 	    kevq->kevq_tot_time,
 	    kevq->kevq_tot_syscall,
 	    kevq->kevq_tot_ev,
@@ -2747,7 +2793,7 @@ kevq_worksteal(struct kevq *kevq)
 	KASSERT(tgt_count <= 8, ("too many kevq ws knotes"));
 
 	KVLST_RLOCK(kq);
-	other_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, 1);
+	other_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, 1, kqueue_random(&kevq->kevq_rand_seed));
 	/* fast fail */
 	if (other_kevq != kevq && kevq_stealable(other_kevq)) {
 		if (KEVQ_TRYLOCK(other_kevq)) {
@@ -2908,7 +2954,8 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
 	 *		 = (1 / kq->kq_tfreq) / (kevq->kevq_avg_lat / hz)
 	 *		 = (hz / (kevq->kevq_avg_lat * kq->kq_tfreq))
 	 */
-	evlimit = hz / (kevq->kevq_avg_lat * kq->kq_tfreq);
+	/* XXX: raw cycles per second, assumes a 2.1 GHz cycle counter */
+	evlimit = (2100 * 1000 * 1000) / (kevq->kevq_avg_lat * kq->kq_tfreq);
 
 	if (evlimit == 0) {
 		evlimit = 1;
@@ -2932,7 +2979,11 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
 	 */
 	rtlimit = (maxevents * kq->kq_rtshare + 99) / 100;
 	KASSERT(rtlimit > 0, ("the math above is fundamentally broken"));
-
+	if (kevq->kevq_avg_rlimit == 0) {
+		kevq->kevq_avg_rlimit = rtlimit;
+	} else {
+		kevq->kevq_avg_rlimit = calc_overtime_avg(kevq->kevq_avg_rlimit, rtlimit, 80);
+	}
 	rsbt = 0;
 	if (tsp != NULL) {
 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
@@ -2966,8 +3017,8 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
 	}
 
 	if (kq->kq_flags & KQ_FLAG_MULTI) {
-		marker = &kevq->kn_marker;
-		rtmarker = &kevq->kn_marker_rt;
+		marker = kevq->kn_marker;
+		rtmarker = kevq->kn_marker_rt;
 	} else {
 		marker = knote_alloc(M_WAITOK);
 		rtmarker = knote_alloc(M_WAITOK);
@@ -3188,7 +3239,7 @@ kqueue_scan(struct kevq *kevq, int maxevents, struct kevent_copyops *k_ops,
 		knl = kn_list_lock(kn);
 		fevent = kn->kn_fop->f_event(kn, 0);
 
-		/* return stolen knotes */
+		/* return ALL knotes */
 		if (kn->kn_status & KN_WS) {
 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN | KN_PROCESSING | KN_WS);
@@ -3533,6 +3584,8 @@ kevq_destroy(struct kevq *kevq)
 {
 	CTR1(KTR_KQ, "kevq_destroy for %p", kevq);
 	mtx_destroy(&kevq->lock);
+	knote_free(kevq->kn_marker);
+	knote_free(kevq->kn_marker_rt);
 	free(kevq, M_KQUEUE);
 }
 
@@ -4395,22 +4448,19 @@ knote_drop_detached(struct knote *kn, struct thread *td)
 }
 
 static struct kevq *
-kevq_vec_select_kevq(struct veclist *lst, int num_rand)
+kevq_vec_select_kevq(struct veclist *lst, int num_rand, u_long rand)
 {
 	int sz;
 	struct kevq *cur_kevq = NULL, *next_kevq;
-	u_long rand;
 
 	/* XXX: hack */
-	KASSERT(num_rand < 8, ("too much num_rand"));
+	KASSERT(num_rand <= 2, ("too large num_rand"));
 
 	//CTR1(KTR_KQ, "kevq_vec_select_kevq: num - %d", num_rand);
 
 	sz = veclist_size(lst);
 
 	if (sz > 0) {
-		rand = random();
-
 		for (int i = 0; i < num_rand; i++) {
 			next_kevq = veclist_at(lst, rand % sz);
@@ -4420,7 +4470,7 @@ kevq_vec_select_kevq(struct veclist *lst, int num_rand)
 				cur_kevq = next_kevq;
 			}
 
-			/* XXX: hack */
+			/* XXX: hack, 256 queues max */
 			rand = rand >> 8;
 		}
 	}
@@ -4430,51 +4480,51 @@ kevq_vec_select_kevq(struct veclist *lst, int num_rand)
 	return cur_kevq;
 }
 
-static struct kevq *
-kqdom_random_kevq_locked(struct kqdom *kqd)
-{
-	struct kevq *kevq;
-	struct kqdom *tkqd;
-	int num_active;
-	u_long rand;
+// static struct kevq *
+// kqdom_random_kevq_locked(struct kqdom *kqd)
+// {
+// 	struct kevq *kevq;
+// 	struct kqdom *tkqd;
+// 	int num_active;
+// 	u_long rand;
 
-	rand = random();
-	kevq = NULL;
+// 	rand = random();
+// 	kevq = NULL;
 
-	while (!kqdom_is_leaf(kqd)) {
-		KQD_RLOCK(kqd);
-		/* we only select active stuff inside this, need to be EXTREMELY fast */
-		num_active = veclist_size(&kqd->kqd_activelist);
-		CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected leaf kqdom %d", kqd->id);
-		if (num_active > 0) {
-			tkqd = veclist_at(&kqd->kqd_activelist, rand % num_active);
-		} else {
-			tkqd = NULL;
-		}
-		KQD_RUNLOCK(kqd);
-		kqd = tkqd;
-		/* XXX: hack */
-		rand = rand >> 8;
-	}
+// 	while (!kqdom_is_leaf(kqd)) {
+// 		KQD_RLOCK(kqd);
+// 		/* we only select active stuff inside this, need to be EXTREMELY fast */
+// 		num_active = veclist_size(&kqd->kqd_activelist);
+// 		CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected leaf kqdom %d", kqd->id);
+// 		if (num_active > 0) {
+// 			tkqd = veclist_at(&kqd->kqd_activelist, rand % num_active);
+// 		} else {
+// 			tkqd = NULL;
+// 		}
+// 		KQD_RUNLOCK(kqd);
+// 		kqd = tkqd;
+// 		/* XXX: hack */
+// 		rand = rand >> 8;
+// 	}
 
-	if (kqd != NULL) {
-		CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected leaf kqdom %d", kqd->id);
-		KQD_RLOCK(kqd);
+// 	if (kqd != NULL) {
+// 		CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected leaf kqdom %d", kqd->id);
+// 		KQD_RLOCK(kqd);
 
-		kevq = kevq_vec_select_kevq(&kqd->kqd_kevqs, 1);
-		kevq = kevq_lock_check_avail(kevq);
+// 		kevq = kevq_vec_select_kevq(&kqd->kqd_kevqs, 1);
+// 		kevq = kevq_lock_check_avail(kevq);
 
-		KQD_RUNLOCK(kqd);
-	}
+// 		KQD_RUNLOCK(kqd);
+// 	}
 
-	if (kevq != NULL) {
-		KEVQ_OWNED(kevq);
-	}
+// 	if (kevq != NULL) {
+// 		KEVQ_OWNED(kevq);
+// 	}
 
-	CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected kevq %p", kevq);
+// 	CTR1(KTR_KQ, "kqdom_random_kevq_locked: randomly selected kevq %p", kevq);
 
-	return kevq;
-}
+// 	return kevq;
+// }
 
 /* select the next kevq based on knote and scheduler flags and locks the returned kevq */
@@ -4540,11 +4590,11 @@ knote_next_kevq(struct knote *kn)
 		KASSERT(kqdom_is_leaf(kqd), ("found kqdom not leaf"));
 
 		KQD_RLOCK(kqd);
-		next_kevq = kevq_vec_select_kevq(&kqd->kqd_kevqs, 1);
+		next_kevq = kevq_vec_select_kevq(&kqd->kqd_kevqs, 1, kqueue_random(&kn->kn_rand_seed));
 
 		if (sargs > 0) {
 			KVLST_RLOCK(kq);
-			other_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, sargs);
+			other_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, sargs, kqueue_random(&kn->kn_rand_seed));
 
 			if (next_kevq == NULL || (other_kevq != NULL && kevq_lat_wcmp(next_kevq, other_kevq, 90) > 0)) {
 				next_kevq = other_kevq;
@@ -4569,7 +4619,7 @@ knote_next_kevq(struct knote *kn)
 
 	case KQ_SCHED_BEST:
 		KVLST_RLOCK(kq);
-		next_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, sargs);
+		next_kevq = kevq_vec_select_kevq(&kq->kevq_vlist, sargs, kqueue_random(&kn->kn_rand_seed));
 		next_kevq = kevq_lock_check_avail(next_kevq);
 		KVLST_RUNLOCK(kq);
 
@@ -4582,7 +4632,7 @@ knote_next_kevq(struct knote *kn)
 
-	/* fall-back rand robbin*/
+	/* fall-back round robin */
 	if (next_kevq == NULL) {
-		rand = random();
+		rand = kqueue_random(&kn->kn_rand_seed);
 
 		KVLST_RLOCK(kq);
 		sz = veclist_size(&kq->kevq_vlist);
@@ -4782,6 +4832,7 @@ knote_alloc(int mflag)
 	struct knote *ret = uma_zalloc(knote_zone, mflag | M_ZERO);
 	/* CTR1(KTR_KQ, "knote_alloc: allocating knote %p", ret); */
 	mtx_init(&ret->kn_fluxlock, "kn_fluxlock", NULL, MTX_DEF | MTX_DUPOK);
+	kqueue_srandom(&ret->kn_rand_seed, (u_long)ret);
 	return ret;
 }
 
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 97af4d1cdb46..5dc4f1459250 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -313,6 +313,7 @@ struct knote {
 #define KN_WS		0x100	/* the knote is stolen from another kevq */
 	int kn_fluxwait;
 	int kn_influx;
+	u_long kn_rand_seed;
 	struct mtx kn_fluxlock;
 	int kn_sfflags;		/* saved filter flags */
 	int64_t kn_sdata;	/* saved data field */
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index ae506f5d8cde..240e3d71ad7e 100644
--- a/sys/sys/eventvar.h
+++ b/sys/sys/eventvar.h
@@ -51,11 +51,11 @@ struct kevq {
 
 	/* 1st cacheline */
 	/* Sched stats */
+	u_long kevq_rand_seed;
 	uint64_t kevq_avg_lat;
 	uint64_t kevq_avg_ev;
 	uint64_t kevq_tot_ev;
 	uint64_t kevq_tot_time;
-	uint64_t kevq_tot_syscall;
 	uint64_t kevq_last_kev;
 	uint32_t kevq_last_nkev;
 #define KEVQ_SLEEP	0x01
@@ -65,14 +65,7 @@ struct kevq {
 	int kevq_state;
 	int kn_count;		/* number of pending knotes */
 	int kn_rt_count;	/* number of runtime knotes */
-
-	/* 2nd cacheline */
-	uint64_t kevq_tot_ws;
-	/* TODO: maybe these should be in kqdomain or global */
-	uint64_t kevq_tot_fallback;
-	uint64_t kevq_tot_kqd_mismatch;
-	uint64_t kevq_tot_sched;
-	uint64_t kevq_tot_realtime;
+	/* end 1st cache line */
 
 	LIST_ENTRY(kevq) kevq_th_e;	/* entry into kevq_thred's hashtable */
 	LIST_ENTRY(kevq) kq_e;		/* entry into kq */
@@ -83,10 +76,19 @@ struct kevq {
 	struct kevq_thred *kevq_th;	/* the thread that the kevq belongs to */
 	struct mtx lock;		/* the lock for the kevq */
 	struct ktailq kn_head;		/* list of pending knotes */
-	struct knote kn_marker;
+	struct knote *kn_marker;
 	struct ktailq kn_rt_head;	/* list of pending knotes with runtime priority */
-	struct knote kn_marker_rt;
+	struct knote *kn_marker_rt;
 	int kevq_refcnt;
+
+	/* TODO: maybe these should be in kqdomain or global */
+	uint64_t kevq_tot_fallback;
+	uint64_t kevq_tot_kqd_mismatch;
+	uint64_t kevq_tot_sched;
+	uint64_t kevq_tot_realtime;
+	uint64_t kevq_tot_syscall;
+	uint64_t kevq_tot_ws;
+	uint64_t kevq_avg_rlimit;
 };
 
 /* TODO: assumed that threads don't get rescheduled across cores */
diff --git a/tests/sys/kqueue/libkqueue/read_m.c b/tests/sys/kqueue/libkqueue/read_m.c
index b4a78d6afe21..403ae4da23f8 100644
--- a/tests/sys/kqueue/libkqueue/read_m.c
+++ b/tests/sys/kqueue/libkqueue/read_m.c
@@ -928,6 +928,18 @@ test_evfilt_read_m()
 	test_socket_brutal("rand");
 	close(g_kqfd);
 
+	/* BO2 */
+	flags = KQSCHED_MAKE(KQ_SCHED_BEST,2,0,0);
+	g_kqfd = kqueue();
+	error = ioctl(g_kqfd, FKQMULTI, &flags);
+	if (error == -1) {
+		err(1, "ioctl");
+	}
+
+	test_socket_read(1);
+	test_socket_brutal("best2");
+	close(g_kqfd);
+
 	/* Queue + bo0 */
 	flags = KQSCHED_MAKE(KQ_SCHED_QUEUE,0,0,0);
 	g_kqfd = kqueue();
@@ -981,18 +993,6 @@ test_evfilt_read_m()
 	test_socket_brutal("cpu2");
 	close(g_kqfd);
 
-	/* BO2 */
-	flags = KQSCHED_MAKE(KQ_SCHED_BEST,2,0,0);
-	g_kqfd = kqueue();
-	error = ioctl(g_kqfd, FKQMULTI, &flags);
-	if (error == -1) {
-		err(1, "ioctl");
-	}
-
-	test_socket_read(1);
-	test_socket_brutal("best2");
-	close(g_kqfd);
-
 	/* WS */
 	flags = KQSCHED_MAKE(0,0,KQ_SCHED_FEAT_WS,1);
 	g_kqfd = kqueue();
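
Reviewer's note: the generator added above is the classic Park-Miller "minimal standard" PRNG (the same recurrence as libkern's random()), re-implemented with a caller-supplied seed so each kevq and knote can draw random numbers without shared state or locking. Below is a minimal userspace sketch for sanity-checking it, with u_long spelled as unsigned long and the KTR tracing dropped; the main() harness and its literal seed are illustrative only, not part of the patch.

#include <stdio.h>

#define NSHUFF 50	/* same warm-up count as the patch */

/* x[n + 1] = (7^5 * x[n]) mod (2^31 - 1); output is in [1, 2^31 - 2]. */
static unsigned long
kqueue_random(unsigned long *seed)
{
	long x, hi, lo, t;

	if ((x = *seed) == 0)	/* the recurrence has no zero state */
		x = 123459876;
	hi = x / 127773;
	lo = x % 127773;
	t = 16807 * lo - 2836 * hi;	/* Schrage's trick: no 32-bit overflow */
	if (t < 0)
		t += 0x7fffffff;
	*seed = t;
	return (t);
}

/* Seed, then discard NSHUFF outputs to decorrelate similar seeds. */
static void
kqueue_srandom(unsigned long *field, unsigned long seed)
{
	int i;

	*field = seed;
	for (i = 0; i < NSHUFF; i++)
		kqueue_random(field);
}

int
main(void)
{
	unsigned long seed;

	/* kevq_init()/knote_alloc() seed from the object's address instead. */
	kqueue_srandom(&seed, 20240131);
	for (int i = 0; i < 5; i++)
		printf("%lu\n", kqueue_random(&seed));
	return (0);
}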
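The tightened KASSERT (num_rand <= 2) and the "256 queues max" comment follow from how kevq_vec_select_kevq() now consumes randomness: the caller passes in a single 31-bit draw and each pick shifts 8 bits off it, so one draw only covers two picks over lists of at most 256 kevqs. A hedged userspace sketch of that selection loop follows; the struct field and the lowest-expected-latency criterion are stand-ins for the kernel's veclist/kevq_exp_lat machinery, since the comparison body is elided in the hunk above.

#include <stdio.h>

struct kevq {
	long expected_latency;	/* hypothetical stand-in for kevq_exp_lat() */
};

/* Power-of-two-choices pick: sample num_rand kevqs, keep the cheapest. */
static struct kevq *
select_kevq(struct kevq **lst, int sz, int num_rand, unsigned long rand)
{
	struct kevq *cur = NULL, *next;

	if (sz <= 0)
		return (NULL);
	for (int i = 0; i < num_rand; i++) {
		next = lst[rand % sz];
		if (cur == NULL || next->expected_latency < cur->expected_latency)
			cur = next;
		rand >>= 8;	/* 8 bits consumed per pick: 256 queues max */
	}
	return (cur);
}

int
main(void)
{
	struct kevq a = { 300 }, b = { 100 }, c = { 200 };
	struct kevq *lst[] = { &a, &b, &c };

	/* 0x0102: first pick 258 % 3 = 0 (a:300), then 1 % 3 = 1 (b:100). */
	struct kevq *best = select_kevq(lst, 3, 2, 0x0102);
	printf("picked latency %ld\n", best->expected_latency);
	return (0);
}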
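One more note on the new kevq_avg_rlimit statistic: the patch smooths the per-scan realtime limit with calc_overtime_avg(old, new, 80), whose definition is not in this diff. From its use here it reads as an exponentially weighted moving average that keeps 80% of the old value, i.e. presumably something equivalent to:

	avg_rlimit = (avg_rlimit * 80 + rtlimit * (100 - 80)) / 100;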