MFC r200447,201703,201709-201710:

In current code, threads performing an interruptible sleep will leave the waiters flag on forcing the owner to do a wakeup even when the waiter queue is empty. That operation may lead to a deadlock in the case of doing a fake wakeup on the "preferred" queue while the other queue has real waiters on it, because nobody is going to wakeup the 2nd queue waiters and they will sleep indefinitively. A similar bug, is present, for lockmgr in the case the waiters are sleeping with LK_SLEEPFAIL on. Add a sleepqueue interface which does report the actual number of waiters on a specified queue of a waitchannel and track if at least one sleepfail waiter is present or not. In presence of this or empty "preferred" queue, wakeup both waiters queues. Discussed with: kib Tested by: Pete French <petefrench at ticketswitch dot com>, Justin Head <justin at encarnate dot com>
2010-01-18 14:43:44 +00:00 · 2010-01-18 14:43:44 +00:00 · 5fcd322a25
commit 5fcd322a25
parent 6cec00d040
6 changed files with 194 additions and 21 deletions
--- a/share/man/man9/sleepqueue.9
+++ b/share/man/man9/sleepqueue.9
@ -23,7 +23,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd December 12, 2009
+.Dd January 18, 2010
 .Dt SLEEPQUEUE 9
 .Os
 .Sh NAME
@ -41,6 +41,7 @@
 .Nm sleepq_remove ,
 .Nm sleepq_signal ,
 .Nm sleepq_set_timeout ,
+.Nm sleepq_sleepcnt ,
 .Nm sleepq_timedwait ,
 .Nm sleepq_timedwait_sig ,
 .Nm sleepq_wait ,
@ -77,6 +78,8 @@
 .Fn sleepq_signal "void *wchan" "int flags" "int pri" "int queue"
 .Ft void
 .Fn sleepq_set_timeout "void *wchan" "int timo"
+.Ft u_int
+.Fn sleepq_sleepcnt "void *wchan" "int queue"
 .Ft int
 .Fn sleepq_timedwait "void *wchan"
 .Ft int
@ -355,6 +358,14 @@ One possible use is waking up a specific thread from a widely shared sleep
 channel.
 .Pp
 The
+.Fn sleepq_sleepcnt
+function offer a simple way to retrieve the number of threads sleeping for
+the specified
+.Fa queue ,
+given a
+.Fa wchan .
+.Pp
+The
 .Fn sleepq_abort ,
 .Fn sleepq_broadcast ,
 and
--- a/sys/kern/kern_lock.c
+++ b/sys/kern/kern_lock.c
@ -54,8 +54,8 @@ __FBSDID("$FreeBSD$");
 #include <ddb/ddb.h>
 #endif

-CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
-    (LK_ADAPTIVE | LK_NOSHARE));
+CTASSERT(((LK_ADAPTIVE | LK_EXSLPFAIL | LK_NOSHARE) & LO_CLASSFLAGS) ==
+    (LK_ADAPTIVE | LK_EXSLPFAIL | LK_NOSHARE));
 CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
    ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));

@ -194,6 +194,14 @@ sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,

 	if (flags & LK_INTERLOCK)
 		class->lc_unlock(ilk);
+
+	/*
+	 * LK_EXSLPFAIL is not invariant during the lock pattern but it is
+	 * always protected by the sleepqueue spinlock, thus it is safe to
+	 * handle within the lo_flags.
+	 */
+	if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
+		lk->lock_object.lo_flags |= LK_EXSLPFAIL;
 	GIANT_SAVE();
 	sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
 	    SLEEPQ_INTERRUPTIBLE : 0), queue);
@ -222,6 +230,7 @@ static __inline int
 wakeupshlk(struct lock *lk, const char *file, int line)
 {
 	uintptr_t v, x;
+	u_int realexslp;
 	int queue, wakeup_swapper;

 	TD_LOCKS_DEC(curthread);
@ -267,13 +276,45 @@ wakeupshlk(struct lock *lk, const char *file, int line)
 		/*
 		 * If the lock has exclusive waiters, give them preference in
 		 * order to avoid deadlock with shared runners up.
+		 * If interruptible sleeps left the exclusive queue empty
+		 * avoid a starvation for the threads sleeping on the shared
+		 * queue by giving them precedence and cleaning up the
+		 * exclusive waiters bit anyway.
+		 * Please note that the LK_EXSLPFAIL flag may be lying about
+		 * the real presence of waiters with the LK_SLEEPFAIL flag on
+		 * because they may be used in conjuction with interruptible
+		 * sleeps.
 		 */
-		if (x & LK_EXCLUSIVE_WAITERS) {
-			queue = SQ_EXCLUSIVE_QUEUE;
-			v |= (x & LK_SHARED_WAITERS);
+		realexslp = sleepq_sleepcnt(&lk->lock_object,
+		    SQ_EXCLUSIVE_QUEUE);
+		if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+			if ((lk->lock_object.lo_flags & LK_EXSLPFAIL) == 0) {
+				lk->lock_object.lo_flags &= ~LK_EXSLPFAIL;
+				queue = SQ_EXCLUSIVE_QUEUE;
+				v |= (x & LK_SHARED_WAITERS);
+			} else {
+				lk->lock_object.lo_flags &= ~LK_EXSLPFAIL;
+				LOCK_LOG2(lk,
+				    "%s: %p has only LK_SLEEPFAIL sleepers",
+				    __func__, lk);
+				LOCK_LOG2(lk,
+			    "%s: %p waking up threads on the exclusive queue",
+				    __func__, lk);
+				wakeup_swapper =
+				    sleepq_broadcast(&lk->lock_object,
+				    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+				queue = SQ_SHARED_QUEUE;
+			}
+				
 		} else {
-			MPASS((x & ~LK_EXCLUSIVE_SPINNERS) ==
-			    LK_SHARED_WAITERS);
+
+			/*
+			 * Exclusive waiters sleeping with LK_SLEEPFAIL on
+			 * and using interruptible sleeps/timeout may have
+			 * left spourious LK_EXSLPFAIL flag on, so clean
+			 * it up anyway.
+			 */
+			lk->lock_object.lo_flags &= ~LK_EXSLPFAIL;
 			queue = SQ_SHARED_QUEUE;
 		}

@ -285,7 +326,7 @@ wakeupshlk(struct lock *lk, const char *file, int line)
 		LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
 		    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 		    "exclusive");
-		wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
+		wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
 		    0, queue);
 		sleepq_release(&lk->lock_object);
 		break;
@ -362,6 +403,8 @@ lockdestroy(struct lock *lk)

 	KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
 	KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
+	KASSERT((lk->lock_object.lo_flags & LK_EXSLPFAIL) == 0,
+	    ("lockmgr still exclusive waiters"));
 	lock_destroy(&lk->lock_object);
 }

@ -373,7 +416,7 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
 	struct lock_class *class;
 	const char *iwmesg;
 	uintptr_t tid, v, x;
-	u_int op;
+	u_int op, realexslp;
 	int error, ipri, itimo, queue, wakeup_swapper;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
@ -903,14 +946,48 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
 		 	 * If the lock has exclusive waiters, give them
 			 * preference in order to avoid deadlock with
 			 * shared runners up.
+			 * If interruptible sleeps left the exclusive queue
+			 * empty avoid a starvation for the threads sleeping
+			 * on the shared queue by giving them precedence
+			 * and cleaning up the exclusive waiters bit anyway.
+			 * Please note that the LK_EXSLPFAIL flag may be lying
+			 * about the real presence of waiters with the
+			 * LK_SLEEPFAIL flag on because they may be used in
+			 * conjuction with interruptible sleeps.
 			 */
 			MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
-			if (x & LK_EXCLUSIVE_WAITERS) {
-				queue = SQ_EXCLUSIVE_QUEUE;
-				v |= (x & LK_SHARED_WAITERS);
+			realexslp = sleepq_sleepcnt(&lk->lock_object,
+			    SQ_EXCLUSIVE_QUEUE);
+			if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+				if ((lk->lock_object.lo_flags &
+				    LK_EXSLPFAIL) == 0) {
+					lk->lock_object.lo_flags &=
+					    ~LK_EXSLPFAIL;
+					queue = SQ_EXCLUSIVE_QUEUE;
+					v |= (x & LK_SHARED_WAITERS);
+				} else {
+					lk->lock_object.lo_flags &=
+					    ~LK_EXSLPFAIL;
+					LOCK_LOG2(lk,
+					"%s: %p has only LK_SLEEPFAIL sleepers",
+					    __func__, lk);
+					LOCK_LOG2(lk,
+			"%s: %p waking up threads on the exclusive queue",
+					    __func__, lk);
+					wakeup_swapper =
+					    sleepq_broadcast(&lk->lock_object,
+					    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+					queue = SQ_SHARED_QUEUE;
+				}
 			} else {
-				MPASS((x & LK_ALL_WAITERS) ==
-				    LK_SHARED_WAITERS);
+
+				/*
+				 * Exclusive waiters sleeping with LK_SLEEPFAIL
+				 * on and using interruptible sleeps/timeout
+				 * may have left spourious LK_EXSLPFAIL flag
+				 * on, so clean it up anyway.
+				 */
+				lk->lock_object.lo_flags &= ~LK_EXSLPFAIL;
 				queue = SQ_SHARED_QUEUE;
 			}

@ -919,7 +996,7 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
 			    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 			    "exclusive");
 			atomic_store_rel_ptr(&lk->lk_lock, v);
-			wakeup_swapper = sleepq_broadcast(&lk->lock_object,
+			wakeup_swapper |= sleepq_broadcast(&lk->lock_object,
 			    SLEEPQ_LK, 0, queue);
 			sleepq_release(&lk->lock_object);
 			break;
@ -976,14 +1053,64 @@ __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
 			v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 			if ((x & ~v) == LK_UNLOCKED) {
 				v = (x & ~LK_EXCLUSIVE_SPINNERS);
+
+				/*
+				 * If interruptible sleeps left the exclusive
+				 * queue empty avoid a starvation for the
+				 * threads sleeping on the shared queue by
+				 * giving them precedence and cleaning up the
+				 * exclusive waiters bit anyway.
+				 * Please note that the LK_EXSLPFAIL flag may
+				 * be lying about the real presence of waiters
+				 * with the LK_SLEEPFAIL flag on because they
+				 * may be used in conjuction with interruptible
+				 * sleeps.
+				 */
 				if (v & LK_EXCLUSIVE_WAITERS) {
 					queue = SQ_EXCLUSIVE_QUEUE;
 					v &= ~LK_EXCLUSIVE_WAITERS;
 				} else {
+
+					/*
+					 * Exclusive waiters sleeping with
+					 * LK_SLEEPFAIL on and using
+					 * interruptible sleeps/timeout may
+					 * have left spourious LK_EXSLPFAIL
+					 * flag on, so clean it up anyway.
+					 */
 					MPASS(v & LK_SHARED_WAITERS);
+					lk->lock_object.lo_flags &=
+					    ~LK_EXSLPFAIL;
 					queue = SQ_SHARED_QUEUE;
 					v &= ~LK_SHARED_WAITERS;
 				}
+				if (queue == SQ_EXCLUSIVE_QUEUE) {
+					realexslp =
+					    sleepq_sleepcnt(&lk->lock_object,
+					    SQ_EXCLUSIVE_QUEUE);
+					if ((lk->lock_object.lo_flags &
+					    LK_EXSLPFAIL) == 0) {
+						lk->lock_object.lo_flags &=
+						    ~LK_EXSLPFAIL;
+						queue = SQ_SHARED_QUEUE;
+						v &= ~LK_SHARED_WAITERS;
+						if (realexslp != 0) {
+							LOCK_LOG2(lk,
+					"%s: %p has only LK_SLEEPFAIL sleepers",
+							    __func__, lk);
+							LOCK_LOG2(lk,
+			"%s: %p waking up threads on the exclusive queue",
+							    __func__, lk);
+							wakeup_swapper =
+							    sleepq_broadcast(
+							    &lk->lock_object,
+							    SLEEPQ_LK, 0,
+							    SQ_EXCLUSIVE_QUEUE);
+						}
+					} else
+						lk->lock_object.lo_flags &=
+						    ~LK_EXSLPFAIL;
+				}
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
 					sleepq_release(&lk->lock_object);
 					continue;
--- a/sys/kern/kern_sx.c
+++ b/sys/kern/kern_sx.c
@ -702,8 +702,12 @@ _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
 	 * ideal.  It gives precedence to shared waiters if they are
 	 * present.  For this condition, we have to preserve the
 	 * state of the exclusive waiters flag.
+	 * If interruptible sleeps left the shared queue empty avoid a
+	 * starvation for the threads sleeping on the exclusive queue by giving
+	 * them precedence and cleaning up the shared waiters bit anyway.
 	 */
-	if (sx->sx_lock & SX_LOCK_SHARED_WAITERS) {
+	if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 &&
+	    sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) {
 		queue = SQ_SHARED_QUEUE;
 		x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
 	} else
--- a/sys/kern/subr_sleepqueue.c
+++ b/sys/kern/subr_sleepqueue.c
@ -118,6 +118,7 @@ __FBSDID("$FreeBSD$");
 */
 struct sleepqueue {
 	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS];	/* (c) Blocked threads. */
+	u_int sq_blockedcnt[NR_SLEEPQS];	/* (c) N. of blocked threads. */
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
@ -306,9 +307,12 @@ sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
 		int i;

 		sq = td->td_sleepqueue;
-		for (i = 0; i < NR_SLEEPQS; i++)
+		for (i = 0; i < NR_SLEEPQS; i++) {
 			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
-				("thread's sleep queue %d is not empty", i));
+			    ("thread's sleep queue %d is not empty", i));
+			KASSERT(sq->sq_blockedcnt[i] == 0,
+			    ("thread's sleep queue %d count mismatches", i));
+		}
 		KASSERT(LIST_EMPTY(&sq->sq_free),
 		    ("thread's sleep queue has a non-empty free list"));
 		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
@ -334,6 +338,7 @@ sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
 	}
 	thread_lock(td);
 	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
+	sq->sq_blockedcnt[queue]++;
 	td->td_sleepqueue = NULL;
 	td->td_sqqueue = queue;
 	td->td_wchan = wchan;
@ -366,6 +371,22 @@ sleepq_set_timeout(void *wchan, int timo)
 	callout_reset_curcpu(&td->td_slpcallout, timo, sleepq_timeout, td);
 }

+/*
+ * Return the number of actual sleepers for the specified queue.
+ */
+u_int
+sleepq_sleepcnt(void *wchan, int queue)
+{
+	struct sleepqueue *sq;
+
+	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+	sq = sleepq_lookup(wchan);
+	if (sq == NULL)
+		return (0);
+	return (sq->sq_blockedcnt[queue]);
+}
+
 /*
 * Marks the pending sleep of the current thread as interruptible and
 * makes an initial check for pending signals before putting a thread
@ -665,6 +686,7 @@ sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
 	mtx_assert(&sc->sc_lock, MA_OWNED);

 	/* Remove the thread from the queue. */
+	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);

 	/*
@ -720,8 +742,10 @@ sleepq_dtor(void *mem, int size, void *arg)
 	int i;

 	sq = mem;
-	for (i = 0; i < NR_SLEEPQS; i++)
+	for (i = 0; i < NR_SLEEPQS; i++) {
 		MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
+		MPASS(sq->sq_blockedcnt[i] == 0);
+	}
 }
 #endif

@ -736,8 +760,10 @@ sleepq_init(void *mem, int size, int flags)

 	bzero(mem, size);
 	sq = mem;
-	for (i = 0; i < NR_SLEEPQS; i++)
+	for (i = 0; i < NR_SLEEPQS; i++) {
 		TAILQ_INIT(&sq->sq_blocked[i]);
+		sq->sq_blockedcnt[i] = 0;
+	}
 	LIST_INIT(&sq->sq_free);
 	return (0);
 }
@ -1170,6 +1196,7 @@ found:
 					  td->td_tid, td->td_proc->p_pid,
 					  td->td_name);
 			}
+		db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
 	}
 }

--- a/sys/sys/lockmgr.h
+++ b/sys/sys/lockmgr.h
@ -144,6 +144,9 @@ _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk,
 #define	LK_QUIET	0x000020
 #define	LK_ADAPTIVE	0x000040

+/* LK_EXSLPFAIL to follow, even if not used in lockinit() */
+#define	LK_EXSLPFAIL	0x000080
+
 /*
 * Additional attributes to be used in lockmgr().
 */
--- a/sys/sys/sleepqueue.h
+++ b/sys/sys/sleepqueue.h
@ -109,6 +109,7 @@ void	sleepq_release(void *wchan);
 void	sleepq_remove(struct thread *td, void *wchan);
 int	sleepq_signal(void *wchan, int flags, int pri, int queue);
 void	sleepq_set_timeout(void *wchan, int timo);
+u_int	sleepq_sleepcnt(void *wchan, int queue);
 int	sleepq_timedwait(void *wchan, int pri);
 int	sleepq_timedwait_sig(void *wchan, int pri);
 void	sleepq_wait(void *wchan, int pri);