Implement preemption of kernel threads natively in the scheduler rather
than as one-off hacks in various other parts of the kernel:
- Add a function maybe_preempt() that is called from sched_add() to
  determine if a thread about to be added to a run queue should be
  preempted to directly.  If it is not safe to preempt or if the new
  thread does not have a high enough priority, then the function returns
  false and sched_add() adds the thread to the run queue.  If the thread
  should be preempted to but the current thread is in a nested critical
  section, then the flag TDF_OWEPREEMPT is set and the thread is added
  to the run queue.  Otherwise, mi_switch() is called immediately and the
  thread is never added to the run queue since it is switched to directly.
  When exiting an outermost critical section, if TDF_OWEPREEMPT is set,
  then clear it and call mi_switch() to perform the deferred preemption.
  (A standalone sketch of this flow follows the description below.)
- Remove explicit preemption from ithread_schedule() as calling
  setrunqueue() now does all the correct work.  This also removes the
  do_switch argument from ithread_schedule().
- Do not use the manual preemption code in mtx_unlock if the architecture
  supports native preemption.
- Don't call mi_switch() in a loop during shutdown to give ithreads a
  chance to run if the architecture supports native preemption since
  the ithreads will just preempt DELAY().
- Don't call mi_switch() from the page zeroing idle thread for
  architectures that support native preemption as it is unnecessary.
- Native preemption is enabled on the same archs that supported ithread
  preemption, namely alpha, i386, and amd64.

This change should largely be a NOP for the default case as committed
except that we will do fewer context switches in a few cases and will
avoid the run queues completely when preempting.
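
The interplay between maybe_preempt(), TDF_OWEPREEMPT, and critical_exit()
is easiest to see in isolation.  The following is a minimal, hypothetical
user-space sketch of that state machine, not code from this commit: the
struct, the *_model() functions, and the priorities are invented for
illustration, and only the field and flag names (td_critnest,
TDF_OWEPREEMPT) mirror the kernel's.  In the kernel, maybe_preempt() runs
with sched_lock held, which itself counts as one level of critical
nesting, so the "nested" test there is td_critnest > 1; the lock-free
model below uses > 0.

/*
 * Hypothetical user-space model of the deferred-preemption state machine.
 * Not the committed kernel code; only the names mirror the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

#define TDF_OWEPREEMPT  0x000200        /* deferred preemption pending */

struct thread {
        int td_critnest;                /* critical section nesting level */
        int td_flags;                   /* TDF_* flags */
        int td_priority;                /* lower value = higher priority */
};

static struct thread curthr = { 0, 0, 100 };

static void
mi_switch_model(const char *why)
{
        printf("mi_switch: %s\n", why);
}

/* Called when a thread with priority newpri becomes runnable (cf. sched_add()). */
static bool
maybe_preempt_model(int newpri)
{
        if (newpri >= curthr.td_priority)
                return (false);         /* not a high enough priority */
        if (curthr.td_critnest > 0) {
                /* Defer: remember that a preemption is owed. */
                curthr.td_flags |= TDF_OWEPREEMPT;
                return (false);         /* caller puts thread on a run queue */
        }
        mi_switch_model("immediate preemption");
        return (true);                  /* run queue bypassed entirely */
}

static void
critical_enter_model(void)
{
        curthr.td_critnest++;
}

static void
critical_exit_model(void)
{
        if (curthr.td_critnest == 1 &&
            (curthr.td_flags & TDF_OWEPREEMPT) != 0) {
                /* In the kernel the flag is cleared in sched_switch(). */
                curthr.td_flags &= ~TDF_OWEPREEMPT;
                mi_switch_model("deferred preemption");
        }
        curthr.td_critnest--;
}

int
main(void)
{
        critical_enter_model();
        (void)maybe_preempt_model(10);  /* higher priority: deferred */
        critical_exit_model();          /* deferred switch happens here */
        (void)maybe_preempt_model(10);  /* outside critical section: immediate */
        return (0);
}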

Approved by:	scottl (with his re@ hat)
John Baldwin 2004-07-02 20:21:44 +00:00
parent 5a66986def
commit 0c0b25ae91
21 changed files with 174 additions and 46 deletions


@@ -455,7 +455,7 @@ alpha_dispatch_intr(void *frame, unsigned long vector)
 * thread to the current CPU until we return from the interrupt.
 */
 sched_pin();
-error = ithread_schedule(ithd, !cold);
+error = ithread_schedule(ithd);
 KASSERT(error == 0, ("got an impossible stray interrupt"));
 sched_unpin();
 }


@@ -113,6 +113,8 @@
 #define SSIZE 1 /* initial stack size/NBPG */
 #define SINCR 1 /* increment of stack/NBPG */
+#define PREEMPTION
 #ifndef KSTACK_PAGES
 #define KSTACK_PAGES 2 /* pages of kstack (with pcb) */
 #endif


@@ -215,7 +215,7 @@ intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe)
 if (ih == NULL)
 error = EINVAL;
 else
-error = ithread_schedule(it, !cold);
+error = ithread_schedule(it);
 }
 if (error == EINVAL) {
 atomic_add_long(isrc->is_straycount, 1);


@@ -119,6 +119,8 @@
 #define NBPML4 (1ul<<PML4SHIFT)/* bytes/page map lev4 table */
 #define PML4MASK (NBPML4-1)
+#define PREEMPTION
 #define IOPAGES 2 /* pages of i/o permission bitmap */
 #ifndef KSTACK_PAGES


@@ -186,6 +186,11 @@ options MUTEX_WAKE_ALL
 # SMP Debugging Options:
 #
+# FULL_PREEMPTION instructs the kernel to preempt non-realtime kernel
+# threads. Its sole use is to expose race conditions and other
+# bugs during development. Enabling this option will reduce
+# performance and increase the frequency of kernel panics by
+# design. If you aren't sure that you need it then you don't.
 # MUTEX_DEBUG enables various extra assertions in the mutex code.
 # SLEEPQUEUE_PROFILING enables rudimentary profiling of the hash table
 # used to hold active sleep queues.
@@ -197,6 +202,7 @@ options MUTEX_WAKE_ALL
 # a lock hierarchy violation occurs or if locks are held when going to
 # sleep.
 # WITNESS_SKIPSPIN disables the witness checks on spin mutexes.
+options FULL_PREEMPTION
 options MUTEX_DEBUG
 options WITNESS
 options WITNESS_DDB


@@ -61,6 +61,7 @@ DDB_NUMSYM opt_ddb.h
 DDB_TRACE
 DDB_UNATTENDED
 DIRECTIO opt_directio.h
+FULL_PREEMPTION
 GDB_REMOTE_CHAT opt_ddb.h
 GDBSPEED opt_ddb.h
 GEOM_AES opt_geom.h


@@ -215,7 +215,7 @@ intr_execute_handlers(struct intsrc *isrc, struct intrframe *iframe)
 if (ih == NULL)
 error = EINVAL;
 else
-error = ithread_schedule(it, !cold);
+error = ithread_schedule(it);
 }
 if (error == EINVAL) {
 atomic_add_long(isrc->is_straycount, 1);


@@ -97,6 +97,8 @@
 #define NBPDR (1<<PDRSHIFT) /* bytes/page dir */
 #define PDRMASK (NBPDR-1)
+#define PREEMPTION
 #define IOPAGES 2 /* pages of i/o permission bitmap */
 #ifndef KSTACK_PAGES


@@ -384,7 +384,7 @@ ia64_dispatch_intr(void *frame, unsigned long vector)
 return;
 }
-error = ithread_schedule(ithd, 0); /* XXX:no preemption for now */
+error = ithread_schedule(ithd);
 KASSERT(error == 0, ("got an impossible stray interrupt"));
 }


@@ -365,7 +365,7 @@ ithread_remove_handler(void *cookie)
 }
 int
-ithread_schedule(struct ithd *ithread, int do_switch)
+ithread_schedule(struct ithd *ithread)
 {
 struct int_entropy entropy;
 struct thread *td;
@@ -399,10 +399,7 @@ ithread_schedule(struct ithd *ithread, int do_switch)
 /*
 * Set it_need to tell the thread to keep running if it is already
 * running. Then, grab sched_lock and see if we actually need to
-* put this thread on the runqueue. If so and the do_switch flag is
-* true and it is safe to switch, then switch to the ithread
-* immediately. Otherwise, set the needresched flag to guarantee
-* that this ithread will run before any userland processes.
+* put this thread on the runqueue.
 */
 ithread->it_need = 1;
 mtx_lock_spin(&sched_lock);
@@ -410,16 +407,6 @@ ithread_schedule(struct ithd *ithread, int do_switch)
 CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid);
 TD_CLR_IWAIT(td);
 setrunqueue(td);
-if (do_switch &&
-(ctd->td_critnest == 1) ) {
-KASSERT((TD_IS_RUNNING(ctd)),
-("ithread_schedule: Bad state for curthread."));
-if (ctd->td_flags & TDF_IDLETD)
-ctd->td_state = TDS_CAN_RUN; /* XXXKSE */
-mi_switch(SW_INVOL, NULL);
-} else {
-curthread->td_flags |= TDF_NEEDRESCHED;
-}
 } else {
 CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d",
 __func__, p->p_pid, ithread->it_need, td->td_state);
@@ -480,7 +467,7 @@ swi_sched(void *cookie, int flags)
 */
 atomic_store_rel_int(&ih->ih_need, 1);
 if (!(flags & SWI_DELAY)) {
-error = ithread_schedule(it, !cold && !dumping);
+error = ithread_schedule(it);
 KASSERT(error == 0, ("stray software interrupt"));
 }
 }


@@ -621,7 +621,9 @@ void
 _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 {
 struct turnstile *ts;
+#ifndef PREEMPTION
 struct thread *td, *td1;
+#endif
 if (mtx_recursed(m)) {
 if (--(m->mtx_recurse) == 0)
@@ -646,8 +648,10 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 #else
 MPASS(ts != NULL);
 #endif
+#ifndef PREEMPTION
 /* XXX */
 td1 = turnstile_head(ts);
+#endif
 #ifdef MUTEX_WAKE_ALL
 turnstile_broadcast(ts);
 _release_lock_quick(m);
@@ -665,6 +669,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 #endif
 turnstile_unpend(ts);
+#ifndef PREEMPTION
 /*
 * XXX: This is just a hack until preemption is done. However,
 * once preemption is done we need to either wrap the
@@ -701,6 +706,7 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 m, (void *)m->mtx_lock);
 }
 mtx_unlock_spin(&sched_lock);
+#endif
 return;
 }


@@ -269,7 +269,9 @@ boot(int howto)
 if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
 register struct buf *bp;
 int iter, nbusy, pbusy;
+#ifndef PREEMPTION
 int subiter;
+#endif
 waittime = 0;
 printf("\nsyncing disks, buffers remaining... ");
@@ -300,20 +302,29 @@ boot(int howto)
 iter = 0;
 pbusy = nbusy;
 sync(&thread0, NULL);
-if (curthread != NULL) {
-DROP_GIANT();
-for (subiter = 0; subiter < 50 * iter; subiter++) {
-mtx_lock_spin(&sched_lock);
-/*
-* Allow interrupt threads to run
-*/
-mi_switch(SW_VOL, NULL);
-mtx_unlock_spin(&sched_lock);
-DELAY(1000);
-}
-PICKUP_GIANT();
-} else
+#ifdef PREEMPTION
+/*
+* Drop Giant and spin for a while to allow
+* interrupt threads to run.
+*/
+DROP_GIANT();
+DELAY(50000 * iter);
+PICKUP_GIANT();
+#else
+/*
+* Drop Giant and context switch several times to
+* allow interrupt threads to run.
+*/
+DROP_GIANT();
+for (subiter = 0; subiter < 50 * iter; subiter++) {
+mtx_lock_spin(&sched_lock);
+mi_switch(SW_VOL, NULL);
+mtx_unlock_spin(&sched_lock);
+DELAY(1000);
+}
+PICKUP_GIANT();
+#endif
 }
 printf("\n");
 /*


@@ -88,6 +88,8 @@ reassigned to keep this true.
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
+#include "opt_full_preemption.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -423,10 +425,10 @@ setrunqueue(struct thread *td)
 }
 }
-/************************************************************************
-* Critical section marker functions *
-************************************************************************/
-/* Critical sections that prevent preemption. */
+/*
+* Kernel thread preemption implementation. Critical sections mark
+* regions of code in which preemptions are not allowed.
+*/
 void
 critical_enter(void)
 {
@@ -447,6 +449,13 @@ critical_exit(void)
 KASSERT(td->td_critnest != 0,
 ("critical_exit: td_critnest == 0"));
 if (td->td_critnest == 1) {
+#ifdef PREEMPTION
+if (td->td_flags & TDF_OWEPREEMPT) {
+mtx_lock_spin(&sched_lock);
+mi_switch(SW_INVOL, NULL);
+mtx_unlock_spin(&sched_lock);
+}
+#endif
 td->td_critnest = 0;
 cpu_critical_exit();
 } else {
@@ -454,6 +463,86 @@ critical_exit(void)
 }
 }
+/*
+* This function is called when a thread is about to be put on run queue
+* because it has been made runnable or its priority has been adjusted. It
+* determines if the new thread should be immediately preempted to. If so,
+* it switches to it and eventually returns true. If not, it returns false
+* so that the caller may place the thread on an appropriate run queue.
+*/
+int
+maybe_preempt(struct thread *td)
+{
+struct thread *ctd;
+int cpri, pri;
+mtx_assert(&sched_lock, MA_OWNED);
+#ifdef PREEMPTION
+/*
+* The new thread should not preempt the current thread if any of the
+* following conditions are true:
+*
+* - The current thread has a higher (numerically lower) priority.
+* - It is too early in the boot for context switches (cold is set).
+* - The current thread has an inhibitor set or is in the process of
+* exiting. In this case, the current thread is about to switch
+* out anyways, so there's no point in preempting. If we did,
+* the current thread would not be properly resumed as well, so
+* just avoid that whole landmine.
+* - If the new thread's priority is not a realtime priority and
+* the current thread's priority is not an idle priority and
+* FULL_PREEMPTION is disabled.
+*
+* If all of these conditions are false, but the current thread is in
+* a nested critical section, then we have to defer the preemption
+* until we exit the critical section. Otherwise, switch immediately
+* to the new thread.
+*/
+ctd = curthread;
+pri = td->td_priority;
+cpri = ctd->td_priority;
+if (pri >= cpri || cold /* || dumping */ || TD_IS_INHIBITED(ctd) ||
+td->td_kse->ke_state != KES_THREAD)
+return (0);
+#ifndef FULL_PREEMPTION
+if (!(pri >= PRI_MIN_ITHD && pri <= PRI_MAX_ITHD) &&
+!(cpri >= PRI_MIN_IDLE))
+return (0);
+#endif
+if (ctd->td_critnest > 1) {
+CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
+ctd->td_critnest);
+ctd->td_flags |= TDF_OWEPREEMPT;
+return (0);
+}
+/*
+* Our thread state says that we are already on a run queue, so
+* update our state as if we had been dequeued by choosethread().
+*/
+MPASS(TD_ON_RUNQ(td));
+TD_SET_RUNNING(td);
+CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
+td->td_proc->p_pid, td->td_proc->p_comm);
+mi_switch(SW_INVOL, td);
+return (1);
+#else
+return (0);
+#endif
+}
+#ifndef PREEMPTION
+/* XXX: There should be a non-static version of this. */
+static void
+printf_caddr_t(void *data)
+{
+printf("%s", (char *)data);
+}
+static char preempt_warning[] =
+"WARNING: Kernel preemption is disabled, expect reduced performance.\n";
+SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
+preempt_warning)
+#endif
 /************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests *


@@ -299,7 +299,9 @@ mi_switch(int flags, struct thread *newtd)
 if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
 mtx_assert(&Giant, MA_NOTOWNED);
 #endif
-KASSERT(td->td_critnest == 1,
+KASSERT(td->td_critnest == 1 || (td->td_critnest == 2 &&
+(td->td_flags & TDF_OWEPREEMPT) != 0 && (flags & SW_INVOL) != 0 &&
+newtd == NULL),
 ("mi_switch: switch in a critical section"));
 KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
 ("mi_switch: switch must be voluntary or involuntary"));
@@ -308,6 +310,7 @@ mi_switch(int flags, struct thread *newtd)
 p->p_stats->p_ru.ru_nvcsw++;
 else
 p->p_stats->p_ru.ru_nivcsw++;
 /*
 * Compute the amount of time during which the current
 * process was running, and add that to its total so far.


@@ -654,7 +654,7 @@ sched_switch(struct thread *td, struct thread *newtd)
 sched_tdcnt++;
 td->td_lastcpu = td->td_oncpu;
 td->td_last_kse = ke;
-td->td_flags &= ~TDF_NEEDRESCHED;
+td->td_flags &= ~(TDF_NEEDRESCHED | TDF_OWEPREEMPT);
 td->td_oncpu = NOCPU;
 /*
 * At the last moment, if this thread is still marked RUNNING,
@@ -712,6 +712,16 @@ sched_add(struct thread *td)
 ke->ke_proc->p_comm));
 KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 ("sched_add: process swapped out"));
+#ifdef SMP
+/*
+* Only try to preempt if the thread is unpinned or pinned to the
+* current CPU.
+*/
+if (KSE_CAN_MIGRATE(ke) || ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)])
+#endif
+if (maybe_preempt(td))
+return;
 ke->ke_ksegrp->kg_runq_kses++;
 ke->ke_state = KES_ONRUNQ;


@@ -1139,7 +1139,7 @@ sched_switch(struct thread *td, struct thread *newtd)
 td->td_last_kse = ke;
 td->td_lastcpu = td->td_oncpu;
 td->td_oncpu = NOCPU;
-td->td_flags &= ~TDF_NEEDRESCHED;
+td->td_flags &= ~(TDF_NEEDRESCHED | TDF_OWEPREEMPT);
 /*
 * If the KSE has been assigned it may be in the process of switching
@@ -1623,6 +1623,15 @@ sched_add(struct thread *td)
 if (td->td_priority < curthread->td_priority)
 curthread->td_flags |= TDF_NEEDRESCHED;
+#ifdef SMP
+/*
+* Only try to preempt if the thread is unpinned or pinned to the
+* current CPU.
+*/
+if (KSE_CAN_MIGRATE(ke) || ke->ke_cpu == PCPU_GET(cpuid))
+#endif
+if (maybe_preempt(td))
+return;
 ke->ke_ksegrp->kg_runq_kses++;
 ke->ke_state = KES_ONRUNQ;


@@ -308,7 +308,7 @@ sched_ithd(void *cookie)
 ih = (struct intr_handler *)cookie;
-error = ithread_schedule(ih->ih_ithd, 0);
+error = ithread_schedule(ih->ih_ithd);
 if (error == EINVAL)
 intr_stray_handler(ih);


@@ -230,11 +230,7 @@ sched_ithd(void *cookie)
 int error;
 iv = cookie;
-#ifdef notyet
-error = ithread_schedule(iv->iv_ithd);
-#else
-error = ithread_schedule(iv->iv_ithd, 0);
-#endif
+error = ithread_schedule(iv->iv_ithd);
 if (error == EINVAL)
 intr_stray_vector(iv);
 }


@@ -122,7 +122,7 @@ int ithread_add_handler(struct ithd *ithread, const char *name,
 driver_intr_t handler, void *arg, u_char pri, enum intr_type flags,
 void **cookiep);
 int ithread_remove_handler(void *cookie);
-int ithread_schedule(struct ithd *ithread, int do_switch);
+int ithread_schedule(struct ithd *ithread);
 int swi_add(struct ithd **ithdp, const char *name,
 driver_intr_t handler, void *arg, int pri, enum intr_type flags,
 void **cookiep);


@@ -346,6 +346,7 @@ struct thread {
 #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads. */
 #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */
 #define TDF_TSNOBLOCK 0x000100 /* Don't block on a turnstile due to race. */
+#define TDF_OWEPREEMPT 0x000200 /* Thread has a pending preemption. */
 #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */
 #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */
 #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */
@@ -850,6 +851,7 @@ void fork_exit(void (*)(void *, struct trapframe *), void *,
 void fork_return(struct thread *, struct trapframe *);
 int inferior(struct proc *p);
 int leavepgrp(struct proc *p);
+int maybe_preempt(struct thread *td);
 void mi_switch(int flags, struct thread *newtd);
 int p_candebug(struct thread *td, struct proc *p);
 int p_cansee(struct thread *td, struct proc *p);


@@ -151,12 +151,14 @@ vm_pagezero(void __unused *arg)
 for (;;) {
 if (vm_page_zero_check()) {
 pages += vm_page_zero_idle();
+#ifndef PREEMPTION
 if (pages > idlezero_maxrun || sched_runnable()) {
 mtx_lock_spin(&sched_lock);
 mi_switch(SW_VOL, NULL);
 mtx_unlock_spin(&sched_lock);
 pages = 0;
 }
+#endif
 } else {
 tsleep(&zero_state, pri, "pgzero", hz * 300);
 pages = 0;