From f91aa773bea2bd0dff2d021c2d78fe78da78d52d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 20 Jun 2019 01:15:33 +0000 Subject: [PATCH] Add wakeup_any(), cheaper wakeup_one() for taskqueue(9). wakeup_one() and underlying sleepq_signal() spend additional time trying to be fair, waking thread with highest priority, sleeping longest time. But in case of taskqueue there are many absolutely identical threads, and any fairness between them is quite pointless. It makes even worse, since round-robin wakeups not only make previous CPU affinity in scheduler quite useless, but also hide from user chance to see CPU bottlenecks, when sequential workload with one request at a time looks evenly distributed between multiple threads. This change adds new SLEEPQ_UNFAIR flag to sleepq_signal(), making it wakeup thread that went to sleep last, but no longer in context switch (to avoid immediate spinning on the thread lock). On top of that new wakeup_any() function is added, equivalent to wakeup_one(), but setting the flag. On top of that taskqueue(9) is switchied to wakeup_any() to wakeup its threads. As result, on 72-core Xeon v4 machine sequential ZFS write to 12 ZVOLs with 16KB block size spend 34% less time in wakeup_any() and descendants then it was spending in wakeup_one(), and total write throughput increased by ~10% with the same as before CPU usage. Reviewed by: markj, mmacy MFC after: 2 weeks Sponsored by: iXsystems, Inc. Differential Revision: https://reviews.freebsd.org/D20669 --- share/man/man9/Makefile | 3 ++- share/man/man9/sleep.9 | 30 ++++++++++++++++++++------- share/man/man9/sleepqueue.9 | 5 +++-- sys/kern/kern_synch.c | 13 ++++++++++++ sys/kern/subr_sleepqueue.c | 41 +++++++++++++++++++++++++++---------- sys/kern/subr_taskqueue.c | 2 +- sys/sys/queue.h | 4 ++++ sys/sys/sleepqueue.h | 1 + sys/sys/systm.h | 1 + 9 files changed, 78 insertions(+), 22 deletions(-) diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index 5ae3206f48dd..8931249df623 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -1880,7 +1880,8 @@ MLINKS+=sleep.9 msleep.9 \ sleep.9 tsleep.9 \ sleep.9 tsleep_sbt.9 \ sleep.9 wakeup.9 \ - sleep.9 wakeup_one.9 + sleep.9 wakeup_one.9 \ + sleep.9 wakeup_any.9 MLINKS+=sleepqueue.9 init_sleepqueues.9 \ sleepqueue.9 sleepq_abort.9 \ sleepqueue.9 sleepq_add.9 \ diff --git a/share/man/man9/sleep.9 b/share/man/man9/sleep.9 index 82aa25d14931..05e281664e38 100644 --- a/share/man/man9/sleep.9 +++ b/share/man/man9/sleep.9 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 4, 2018 +.Dd June 19, 2019 .Dt SLEEP 9 .Os .Sh NAME @@ -38,7 +38,9 @@ .Nm pause_sbt , .Nm tsleep , .Nm tsleep_sbt , -.Nm wakeup +.Nm wakeup , +.Nm wakeup_one , +.Nm wakeup_any .Nd wait for events .Sh SYNOPSIS .In sys/param.h @@ -70,6 +72,8 @@ .Fn wakeup "void *chan" .Ft void .Fn wakeup_one "void *chan" +.Ft void +.Fn wakeup_any "void *chan" .Sh DESCRIPTION The functions .Fn tsleep , @@ -79,8 +83,9 @@ The functions .Fn pause_sig , .Fn pause_sbt , .Fn wakeup , +.Fn wakeup_one , and -.Fn wakeup_one +.Fn wakeup_any handle event-based thread blocking. If a thread must wait for an external event, it is put to sleep by @@ -252,9 +257,10 @@ function is a wrapper around .Fn tsleep that suspends execution of the current thread for the indicated timeout. The thread can not be awakened early by signals or calls to -.Fn wakeup +.Fn wakeup , +.Fn wakeup_one or -.Fn wakeup_one . +.Fn wakeup_any . The .Fn pause_sig function is a variant of @@ -263,8 +269,8 @@ which can be awakened early by signals. .Pp The .Fn wakeup_one -function makes the first thread in the queue that is sleeping on the -parameter +function makes the first highest priority thread in the queue that is +sleeping on the parameter .Fa chan runnable. This reduces the load when a large number of threads are sleeping on @@ -293,6 +299,16 @@ pay particular attention to ensure that no other threads wait on the same .Fa chan . .Pp +The +.Fn wakeup_any +function is similar to +.Fn wakeup_one , +except that it makes runnable last thread on the queue (sleeping less), +ignoring fairness. +It can be used when threads sleeping on the +.Fa chan +are known to be identical and there is no reason to be fair. +.Pp If the timeout given by .Fa timo or diff --git a/share/man/man9/sleepqueue.9 b/share/man/man9/sleepqueue.9 index 6d3b98009329..68c1e9b66e93 100644 --- a/share/man/man9/sleepqueue.9 +++ b/share/man/man9/sleepqueue.9 @@ -22,7 +22,7 @@ .\" .\" $FreeBSD$ .\" -.Dd September 22, 2014 +.Dd June 19, 2019 .Dt SLEEPQUEUE 9 .Os .Sh NAME @@ -290,7 +290,8 @@ and functions. The .Fn sleepq_signal -function awakens the highest priority thread sleeping on a wait channel while +function awakens the highest priority thread sleeping on a wait channel +(if SLEEPQ_UNFAIR flag is set, thread that went to sleep recently) while .Fn sleepq_broadcast awakens all of the threads sleeping on a wait channel. The diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 7d6340cba411..9e775037b6cc 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -368,6 +368,19 @@ wakeup_one(void *ident) kick_proc0(); } +void +wakeup_any(void *ident) +{ + int wakeup_swapper; + + sleepq_lock(ident); + wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, + 0, 0); + sleepq_release(ident); + if (wakeup_swapper) + kick_proc0(); +} + static void kdb_switch(void) { diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index 7f8a66eb8f5b..16e9bbfa3abc 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -123,7 +123,7 @@ CTASSERT(powerof2(SC_TABLESIZE)); * c - sleep queue chain lock */ struct sleepqueue { - TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ + struct threadqueue sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ @@ -889,12 +889,14 @@ sleepq_init(void *mem, int size, int flags) } /* - * Find the highest priority thread sleeping on a wait channel and resume it. + * Find thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; + struct threadqueue *head; struct thread *td, *besttd; int wakeup_swapper; @@ -907,16 +909,33 @@ sleepq_signal(void *wchan, int flags, int pri, int queue) KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); - /* - * Find the highest priority thread on the queue. If there is a - * tie, use the thread that first appears in the queue as it has - * been sleeping the longest since threads are always added to - * the tail of sleep queues. - */ - besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); - TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { - if (td->td_priority < besttd->td_priority) + head = &sq->sq_blocked[queue]; + if (flags & SLEEPQ_UNFAIR) { + /* + * Find the most recently sleeping thread, but try to + * skip threads still in process of context switch to + * avoid spinning on the thread lock. + */ + sc = SC_LOOKUP(wchan); + besttd = TAILQ_LAST_FAST(head, thread, td_slpq); + while (besttd->td_lock != &sc->sc_lock) { + td = TAILQ_PREV_FAST(besttd, head, thread, td_slpq); + if (td == NULL) + break; besttd = td; + } + } else { + /* + * Find the highest priority thread on the queue. If there + * is a tie, use the thread that first appears in the queue + * as it has been sleeping the longest since threads are + * always added to the tail of sleep queues. + */ + besttd = td = TAILQ_FIRST(head); + while ((td = TAILQ_NEXT(td, td_slpq)) != NULL) { + if (td->td_priority < besttd->td_priority) + besttd = td; + } } MPASS(besttd != NULL); thread_lock(besttd); diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c index 51ceab775333..e070272adf9b 100644 --- a/sys/kern/subr_taskqueue.c +++ b/sys/kern/subr_taskqueue.c @@ -804,7 +804,7 @@ taskqueue_thread_enqueue(void *context) tqp = context; tq = *tqp; - wakeup_one(tq); + wakeup_any(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, diff --git a/sys/sys/queue.h b/sys/sys/queue.h index 94fd1ce7b66b..3d28a6c92454 100644 --- a/sys/sys/queue.h +++ b/sys/sys/queue.h @@ -829,6 +829,10 @@ struct { \ #define TAILQ_PREV(elm, headname, field) \ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) +#define TAILQ_PREV_FAST(elm, head, type, field) \ + ((elm)->field.tqe_prev == &(head)->tqh_first ? NULL : \ + __containerof((elm)->field.tqe_prev, QUEUE_TYPEOF(type), field.tqe_next)) + #define TAILQ_REMOVE(head, elm, field) do { \ QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ diff --git a/sys/sys/sleepqueue.h b/sys/sys/sleepqueue.h index 08eec5cd4381..8974869fb616 100644 --- a/sys/sys/sleepqueue.h +++ b/sys/sys/sleepqueue.h @@ -84,6 +84,7 @@ struct thread; #define SLEEPQ_SX 0x03 /* Used by an sx lock. */ #define SLEEPQ_LK 0x04 /* Used by a lockmgr. */ #define SLEEPQ_INTERRUPTIBLE 0x100 /* Sleep is interruptible. */ +#define SLEEPQ_UNFAIR 0x200 /* Unfair wakeup order. */ void init_sleepqueues(void); int sleepq_abort(struct thread *td, int intrval); diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 097b5af9a8eb..733348c4fae9 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -489,6 +489,7 @@ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(void * chan); void wakeup_one(void * chan); +void wakeup_any(void * chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning