Implement preemption of kernel threads natively in the scheduler rather
than as one-off hacks in various other parts of the kernel:
- Add a function maybe_preempt() that is called from sched_add() to
  determine if a thread about to be added to a run queue should be
  preempted to directly.  If it is not safe to preempt or if the new
  thread does not have a high enough priority, then the function returns
  false and sched_add() adds the thread to the run queue.  If the thread
  should be preempted to but the current thread is in a nested critical
  section, then the flag TDF_OWEPREEMPT is set and the thread is added
  to the run queue.  Otherwise, mi_switch() is called immediately and the
  thread is never added to the run queue since it is switched to directly.
  When exiting an outermost critical section, if TDF_OWEPREEMPT is set,
  then clear it and call mi_switch() to perform the deferred preemption.
  (A standalone sketch of this flow follows the description below.)
- Remove explicit preemption from ithread_schedule() as calling
  setrunqueue() now does all the correct work.  This also removes the
  do_switch argument from ithread_schedule().
- Do not use the manual preemption code in mtx_unlock if the architecture
  supports native preemption.
- Don't call mi_switch() in a loop during shutdown to give ithreads a
  chance to run if the architecture supports native preemption since
  the ithreads will just preempt DELAY().
- Don't call mi_switch() from the page zeroing idle thread for
  architectures that support native preemption as it is unnecessary.
- Native preemption is enabled on the same archs that supported ithread
  preemption, namely alpha, i386, and amd64.

This change should largely be a NOP for the default case as committed
except that we will do fewer context switches in a few cases and will
avoid the run queues completely when preempting.
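
The interplay between maybe_preempt(), TDF_OWEPREEMPT, and critical_exit()
is easiest to see in isolation.  The following is a minimal, hypothetical
user-space sketch of that state machine, not code from this commit: the
struct, the *_model() functions, and the priorities are invented for
illustration, and only the field and flag names (td_critnest,
TDF_OWEPREEMPT) mirror the kernel's.  In the kernel, maybe_preempt() runs
with sched_lock held, which itself counts as one level of critical
nesting, so the "nested" test there is td_critnest > 1; the lock-free
model below uses > 0.

/*
 * Hypothetical user-space model of the deferred-preemption state machine.
 * Not the committed kernel code; only the names mirror the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

#define TDF_OWEPREEMPT  0x000200        /* deferred preemption pending */

struct thread {
        int td_critnest;                /* critical section nesting level */
        int td_flags;                   /* TDF_* flags */
        int td_priority;                /* lower value = higher priority */
};

static struct thread curthr = { 0, 0, 100 };

static void
mi_switch_model(const char *why)
{
        printf("mi_switch: %s\n", why);
}

/* Called when a thread with priority newpri becomes runnable (cf. sched_add()). */
static bool
maybe_preempt_model(int newpri)
{
        if (newpri >= curthr.td_priority)
                return (false);         /* not a high enough priority */
        if (curthr.td_critnest > 0) {
                /* Defer: remember that a preemption is owed. */
                curthr.td_flags |= TDF_OWEPREEMPT;
                return (false);         /* caller puts thread on a run queue */
        }
        mi_switch_model("immediate preemption");
        return (true);                  /* run queue bypassed entirely */
}

static void
critical_enter_model(void)
{
        curthr.td_critnest++;
}

static void
critical_exit_model(void)
{
        if (curthr.td_critnest == 1 &&
            (curthr.td_flags & TDF_OWEPREEMPT) != 0) {
                /* In the kernel the flag is cleared in sched_switch(). */
                curthr.td_flags &= ~TDF_OWEPREEMPT;
                mi_switch_model("deferred preemption");
        }
        curthr.td_critnest--;
}

int
main(void)
{
        critical_enter_model();
        (void)maybe_preempt_model(10);  /* higher priority: deferred */
        critical_exit_model();          /* deferred switch happens here */
        (void)maybe_preempt_model(10);  /* outside critical section: immediate */
        return (0);
}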

Approved by:	scottl (with his re@ hat)
John Baldwin 2004-07-02 20:21:44 +00:00
parent 5a66986def
commit 0c0b25ae91
21 changed files with 174 additions and 46 deletions


@@ -455,7 +455,7 @@ alpha_dispatch_intr(void *frame, unsigned long vector)
 * thread to the current CPU until we return from the interrupt.
 */
 sched_pin();
-error = ithread_schedule(ithd, !cold);
+error = ithread_schedule(ithd);
 KASSERT(error == 0, ("got an impossible stray interrupt"));
 sched_unpin();
 }


@@ -113,6 +113,8 @@
 #define SSIZE 1 /* initial stack size/NBPG */
 #define SINCR 1 /* increment of stack/NBPG */
+#define PREEMPTION
 #ifndef KSTACK_PAGES
 #define KSTACK_PAGES 2 /* pages of kstack (with pcb) */
 #endif


@@ -215,7 +215,7 @@ intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe)
 if (ih == NULL)
 error = EINVAL;
 else
-error = ithread_schedule(it, !cold);
+error = ithread_schedule(it);
 }
 if (error == EINVAL) {
 atomic_add_long(isrc->is_straycount, 1);


@@ -119,6 +119,8 @@
 #define NBPML4 (1ul<<PML4SHIFT)/* bytes/page map lev4 table */
 #define PML4MASK (NBPML4-1)
+#define PREEMPTION
 #define IOPAGES 2 /* pages of i/o permission bitmap */
 #ifndef KSTACK_PAGES


@@ -186,6 +186,11 @@ options MUTEX_WAKE_ALL
 # SMP Debugging Options:
 #
+# FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel
+# threads. Its sole use is to expose race conditions and other
+# bugs during development. Enabling this option will reduce
+# performance and increase the frequency of kernel panics by
+# design. If you aren't sure that you need it then you don't.
 # MUTEX_DEBUG enables various extra assertions in the mutex code.
 # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table
 # used to hold active sleep queues.
@@ -197,6 +202,7 @@ options MUTEX_WAKE_ALL
 # a lock hierarchy violation occurs or if locks are held when going to
 # sleep.
 # WITNESS_SKIPSPIN disables the witness checks on spin mutexes.
+options FULL_PREEMPTION
 options MUTEX_DEBUG
 options WITNESS
 options WITNESS_DDB


@@ -61,6 +61,7 @@ DDB_NUMSYM opt_ddb.h
 DDB_TRACE
 DDB_UNATTENDED
 DIRECTIO opt_directio.h
+FULL_PREEMPTION
 GDB_REMOTE_CHAT opt_ddb.h
 GDBSPEED opt_ddb.h
 GEOM_AES opt_geom.h


@@ -215,7 +215,7 @@ intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe)
 if (ih == NULL)
 error = EINVAL;
 else
-error = ithread_schedule(it, !cold);
+error = ithread_schedule(it);
 }
 if (error == EINVAL) {
 atomic_add_long(isrc->is_straycount, 1);


@@ -97,6 +97,8 @@
 #define NBPDR (1<<PDRSHIFT) /* bytes/page dir */
 #define PDRMASK (NBPDR-1)
+#define PREEMPTION
 #define IOPAGES 2 /* pages of i/o permission bitmap */
 #ifndef KSTACK_PAGES


@@ -384,7 +384,7 @@ ia64_dispatch_intr(void *frame, unsigned long vector)
 return;
 }
-error = ithread_schedule(ithd, 0); /* XXX:no preemption for now */
+error = ithread_schedule(ithd);
 KASSERT(error == 0, ("got an impossible stray interrupt"));
 }


@@ -365,7 +365,7 @@ ithread_remove_handler(void *cookie)
 }
 int
-ithread_schedule(struct ithd *ithread, int do_switch)
+ithread_schedule(struct ithd *ithread)
 {
 struct int_entropy entropy;
 struct thread *td;
@@ -399,10 +399,7 @@ ithread_schedule(struct ithd *ithread, int do_switch)
 /*
 * Set it_need to tell the thread to keep running if it is already
 * running. Then, grab sched_lock and see if we actually need to
-* put this thread on the runqueue. If so and the do_switch flag is
-* true and it is safe to switch, then switch to the ithread
-* immediately. Otherwise, set the needresched flag to guarantee
-* that this ithread will run before any userland processes.
+* put this thread on the runqueue.
 */
 ithread->it_need = 1;
 mtx_lock_spin(&sched_lock);
@@ -410,16 +407,6 @@ ithread_schedule(struct ithd *ithread, int do_switch)
 CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid);
 TD_CLR_IWAIT(td);
 setrunqueue(td);
-if (do_switch &&
-(ctd->td_critnest == 1) ) {
-KASSERT((TD_IS_RUNNING(ctd)),
-("ithread_schedule: Bad state for curthread."));
-if (ctd->td_flags & TDF_IDLETD)
-ctd->td_state = TDS_CAN_RUN; /* XXXKSE */
-mi_switch(SW_INVOL, NULL);
-} else {
-curthread->td_flags |= TDF_NEEDRESCHED;
-}
 } else {
 CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d",
 __func__, p->p_pid, ithread->it_need, td->td_state);
@@ -480,7 +467,7 @@ swi_sched(void *cookie, int flags)
 */
 atomic_store_rel_int(&ih->ih_need, 1);
 if (!(flags & SWI_DELAY)) {
-error = ithread_schedule(it, !cold && !dumping);
+error = ithread_schedule(it);
 KASSERT(error == 0, ("stray software interrupt"));
 }
 }


@@ -621,7 +621,9 @@ void
 _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 {
 struct turnstile *ts;
+#ifndef PREEMPTION
 struct thread *td, *td1;
+#endif
 if (mtx_recursed(m)) {
 if (--(m->mtx_recurse) == 0)
@@ -646,8 +648,10 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 #else
 MPASS(ts != NULL);
 #endif
+#ifndef PREEMPTION
 /* XXX */
 td1 = turnstile_head(ts);
+#endif
 #ifdef MUTEX_WAKE_ALL
 turnstile_broadcast(ts);
 _release_lock_quick(m);
@@ -665,6 +669,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 #endif
 turnstile_unpend(ts);
+#ifndef PREEMPTION
 /*
 * XXX: This is just a hack until preemption is done. However,
 * once preemption is done we need to either wrap the
@@ -701,6 +706,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 m, (void *)m->mtx_lock);
 }
 mtx_unlock_spin(&sched_lock);
+#endif
 return;
 }


@@ -269,7 +269,9 @@ boot(int howto)
 if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
 register struct buf *bp;
 int iter, nbusy, pbusy;
+#ifndef PREEMPTION
 int subiter;
+#endif
 waittime = 0;
 printf("\nsyncing disks, buffers remaining... ");
@@ -300,20 +302,29 @@ boot(int howto)
 iter = 0;
 pbusy = nbusy;
 sync(&thread0, NULL);
-if (curthread != NULL) {
-DROP_GIANT();
-for (subiter = 0; subiter < 50 * iter; subiter++) {
-mtx_lock_spin(&sched_lock);
-/*
-* Allow interrupt threads to run
-*/
-mi_switch(SW_VOL, NULL);
-mtx_unlock_spin(&sched_lock);
-DELAY(1000);
-}
-PICKUP_GIANT();
-} else
+#ifdef PREEMPTION
+/*
+* Drop Giant and spin for a while to allow
+* interrupt threads to run.
+*/
+DROP_GIANT();
+DELAY(50000 * iter);
+PICKUP_GIANT();
+#else
+/*
+* Drop Giant and context switch several times to
+* allow interrupt threads to run.
+*/
+DROP_GIANT();
+for (subiter = 0; subiter < 50 * iter; subiter++) {
+mtx_lock_spin(&sched_lock);
+mi_switch(SW_VOL, NULL);
+mtx_unlock_spin(&sched_lock);
+DELAY(1000);
+}
+PICKUP_GIANT();
+#endif
 }
 printf("\n");
 /*


@@ -88,6 +88,8 @@ reassigned to keep this true.
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
+#include "opt_full_preemption.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -423,10 +425,10 @@ setrunqueue(struct thread *td)
 }
 }
-/************************************************************************
-* Critical section marker functions *
-************************************************************************/
-/* Critical sections that prevent preemption. */
+/*
+* Kernel thread preemption implementation. Critical sections mark
+* regions of code in which preemptions are not allowed.
+*/
 void
 critical_enter(void)
 {
@@ -447,6 +449,13 @@ critical_exit(void)
 KASSERT(td->td_critnest != 0,
 ("critical_exit: td_critnest == 0"));
 if (td->td_critnest == 1) {
+#ifdef PREEMPTION
+if (td->td_flags & TDF_OWEPREEMPT) {
+mtx_lock_spin(&sched_lock);
+mi_switch(SW_INVOL, NULL);
+mtx_unlock_spin(&sched_lock);
+}
+#endif
 td->td_critnest = 0;
 cpu_critical_exit();
 } else {
@@ -454,6 +463,86 @@ critical_exit(void)
 }
 }
+/*
+* This function is called when a thread is about to be put on run queue
+* because it has been made runnable or its priority has been adjusted. It
+* determines if the new thread should be immediately preempted to. If so,
+* it switches to it and eventually returns true. If not, it returns false
+* so that the caller may place the thread on an appropriate run queue.
+*/
+int
+maybe_preempt(struct thread *td)
+{
+struct thread *ctd;
+int cpri, pri;
+mtx_assert(&sched_lock, MA_OWNED);
+#ifdef PREEMPTION
+/*
+* The new thread should not preempt the current thread if any of the
+* following conditions are true:
+*
+* - The current thread has a higher (numerically lower) priority.
+* - It is too early in the boot for context switches (cold is set).
+* - The current thread has an inhibitor set or is in the process of
+* exiting. In this case, the current thread is about to switch
+* out anyways, so there's no point in preempting. If we did,
+* the current thread would not be properly resumed as well, so
+* just avoid that whole landmine.
+* - If the new thread's priority is not a realtime priority and
+* the current thread's priority is not an idle priority and
+* FULL_PREEMPTION is disabled.
+*
+* If all of these conditions are false, but the current thread is in
+* a nested critical section, then we have to defer the preemption
+* until we exit the critical section. Otherwise, switch immediately
+* to the new thread.
+*/
+ctd = curthread;
+pri = td->td_priority;
+cpri = ctd->td_priority;
+if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
+td->td_kse->ke_state != KES_THREAD)
+return (0);
+#ifndef FULL_PREEMPTION
+if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
+!(cpri >= PRI_MIN_IDLE))
+return (0);
+#endif
+if (ctd->td_critnest > 1) {
+CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
+ctd->td_critnest);
+ctd->td_flags |= TDF_OWEPREEMPT;
+return (0);
+}
+/*
+* Our thread state says that we are already on a run queue, so
+* update our state as if we had been dequeued by choosethread().
+*/
+MPASS(TD_ON_RUNQ(td));
+TD_SET_RUNNING(td);
+CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
+td->td_proc->p_pid, td->td_proc->p_comm);
+mi_switch(SW_INVOL, td);
+return (1);
+#else
+return (0);
+#endif
+}
+#ifndef PREEMPTION
+/* XXX: There should be a non-static version of this. */
+static void
+printf_caddr_t(void *data)
+{
+printf("%s", (char *)data);
+}
+static char preempt_warning[] =
+"WARNING: Kernel preemption is disabled, expect reduced performance.\n";
+SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
+preempt_warning)
+#endif
 /************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests *


@@ -299,7 +299,9 @@ mi_switch(int flags, struct thread *newtd)
 if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
 mtx_assert(&Giant, MA_NOTOWNED);
 #endif
-KASSERT(td->td_critnest == 1,
+KASSERT(td->td_critnest == 1 || (td->td_critnest == 2 &&
+(td->td_flags & TDF_OWEPREEMPT) != 0 && (flags & SW_INVOL) != 0 &&
+newtd == NULL),
 ("mi_switch: switch in a critical section"));
 KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
 ("mi_switch: switch must be voluntary or involuntary"));
@@ -308,6 +310,7 @@ mi_switch(int flags, struct thread *newtd)
 p->p_stats->p_ru.ru_nvcsw++;
 else
 p->p_stats->p_ru.ru_nivcsw++;
 /*
 * Compute the amount of time during which the current
 * process was running, and add that to its total so far.


@@ -654,7 +654,7 @@ sched_switch(struct thread *td, struct thread *newtd)
 sched_tdcnt++;
 td->td_lastcpu = td->td_oncpu;
 td->td_last_kse = ke;
-td->td_flags &= ~TDF_NEEDRESCHED;
+td->td_flags &= ~(TDF_NEEDRESCHED | TDF_OWEPREEMPT);
 td->td_oncpu = NOCPU;
 /*
 * At the last moment, if this thread is still marked RUNNING,
@@ -712,6 +712,16 @@ sched_add(struct thread *td)
 ke->ke_proc->p_comm));
 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 ("sched_add: process swapped out"));
+#ifdef SMP
+/*
+* Only try to preempt if the thread is unpinned or pinned to the
+* current CPU.
+*/
+if (KSE_CAN_MIGRATE(ke) || ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)])
+#endif
+if (maybe_preempt(td))
+return;
 ke->ke_ksegrp->kg_runq_kses++;
 ke->ke_state = KES_ONRUNQ;


@@ -1139,7 +1139,7 @@ sched_switch(struct thread *td, struct thread *newtd)
 td->td_last_kse = ke;
 td->td_lastcpu = td->td_oncpu;
 td->td_oncpu = NOCPU;
-td->td_flags &= ~TDF_NEEDRESCHED;
+td->td_flags &= ~(TDF_NEEDRESCHED | TDF_OWEPREEMPT);
 /*
 * If the KSE has been assigned it may be in the process of switching
@@ -1623,6 +1623,15 @@ sched_add(struct thread *td)
 if (td->td_priority < curthread->td_priority)
 curthread->td_flags |= TDF_NEEDRESCHED;
+#ifdef SMP
+/*
+* Only try to preempt if the thread is unpinned or pinned to the
+* current CPU.
+*/
+if (KSE_CAN_MIGRATE(ke) || ke->ke_cpu == PCPU_GET(cpuid))
+#endif
+if (maybe_preempt(td))
+return;
 ke->ke_ksegrp->kg_runq_kses++;
 ke->ke_state = KES_ONRUNQ;


@@ -308,7 +308,7 @@ sched_ithd(void *cookie)
 ih = (struct intr_handler *)cookie;
-error = ithread_schedule(ih->ih_ithd, 0);
+error = ithread_schedule(ih->ih_ithd);
 if (error == EINVAL)
 intr_stray_handler(ih);


@@ -230,11 +230,7 @@ sched_ithd(void *cookie)
 int error;
 iv = cookie;
-#ifdef notyet
-error = ithread_schedule(iv->iv_ithd);
-#else
-error = ithread_schedule(iv->iv_ithd, 0);
-#endif
+error = ithread_schedule(iv->iv_ithd);
 if (error == EINVAL)
 intr_stray_vector(iv);
 }


@@ -122,7 +122,7 @@ int ithread_add_handler(struct ithd *ithread, const char *name,
 driver_intr_t handler, void *arg, u_char pri, enum intr_type flags,
 void **cookiep);
 int ithread_remove_handler(void *cookie);
-int ithread_schedule(struct ithd *ithread, int do_switch);
+int ithread_schedule(struct ithd *ithread);
 int swi_add(struct ithd **ithdp, const char *name,
 driver_intr_t handler, void *arg, int pri, enum intr_type flags,
 void **cookiep);


@@ -346,6 +346,7 @@ struct thread {
 #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads. */
 #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */
 #define TDF_TSNOBLOCK 0x000100 /* Don't block on a turnstile due to race. */
+#define TDF_OWEPREEMPT 0x000200 /* Thread has a pending preemption. */
 #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */
 #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */
 #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */
@@ -850,6 +851,7 @@ void fork_exit(void (*)(void *, struct trapframe *), void *,
 void fork_return(struct thread *, struct trapframe *);
 int inferior(struct proc *p);
 int leavepgrp(struct proc *p);
+int maybe_preempt(struct thread *td);
 void mi_switch(int flags, struct thread *newtd);
 int p_candebug(struct thread *td, struct proc *p);
 int p_cansee(struct thread *td, struct proc *p);


@@ -151,12 +151,14 @@ vm_pagezero(void __unused *arg)
 for (;;) {
 if (vm_page_zero_check()) {
 pages += vm_page_zero_idle();
+#ifndef PREEMPTION
 if (pages > idlezero_maxrun || sched_runnable()) {
 mtx_lock_spin(&sched_lock);
 mi_switch(SW_VOL, NULL);
 mtx_unlock_spin(&sched_lock);
 pages = 0;
 }
+#endif
 } else {
 tsleep(&zero_state, pri, "pgzero", hz * 300);
 pages = 0;