diff --git a/bin/ps/print.c b/bin/ps/print.c index a49c8fa656b2..7f7898f7669f 100644 --- a/bin/ps/print.c +++ b/bin/ps/print.c @@ -185,7 +185,7 @@ state(k, ve) break; case SSLEEP: - if (flag & P_SINTR) /* interuptable (long) */ + if (flag & P_SINTR) /* interruptable (long) */ *cp = p->p_slptime >= MAXSLP ? 'I' : 'S'; else *cp = 'D'; @@ -196,6 +196,14 @@ state(k, ve) *cp = 'R'; break; + case SWAIT: + *cp = 'W'; + break; + + case SMTX: + *cp = 'M'; + break; + case SZOMB: *cp = 'Z'; break; diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile index d378f7c41379..9a1ceefdc872 100644 --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -14,8 +14,8 @@ MAN9= CONDSPLASSERT.9 KASSERT.9 MD5.9 SPLASSERT.9 \ at_exit.9 at_fork.9 bios.9 boot.9 buf.9 cd.9 copy.9 \ devfs_add_devswf.9 devfs_link.9 devfs_remove_dev.9 devstat.9 \ devtoname.9 fetch.9 ifnet.9 inittodr.9 intro.9 kernacc.9 malloc.9 \ - make_dev.9 microseq.9 mi_switch.9 namei.9 panic.9 physio.9 posix4.9 \ - psignal.9 resettodr.9 rtalloc.9 rtentry.9 sleep.9 spl.9 \ + make_dev.9 microseq.9 mi_switch.9 mutex.9 namei.9 panic.9 physio.9 \ + posix4.9 psignal.9 resettodr.9 rtalloc.9 rtentry.9 sleep.9 spl.9 \ store.9 style.9 suser.9 time.9 timeout.9 uio.9 \ vget.9 vnode.9 vput.9 vref.9 vrele.9 vslock.9 \ microtime.9 microuptime.9 tvtohz.9 diff --git a/share/man/man9/mutex.9 b/share/man/man9/mutex.9 new file mode 100644 index 000000000000..ac1b78f2ff28 --- /dev/null +++ b/share/man/man9/mutex.9 @@ -0,0 +1,222 @@ +.\" +.\" Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Berkeley Software Design Inc's name may not be used to endorse or +.\" promote products derived from this software without specific prior +.\" written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" from BSDI $Id: mutex.4,v 1.1.2.3 1998/04/27 22:53:13 ewv Exp $ +.\" $FreeBSD$ +.\" +.Dd April 20, 1998 +.Dt MUTEX 9 +.Sh NAME +.Nm mutex, +.Nm mtx_enter, +.Nm mtx_exit +.Nd kernel synchronization primitives +.Sh SYNOPSIS +.Ft void +.Fn mtx_enter "mtx_t *mutex" "int flags" +.Ft void +.Fn mtx_exit "mtx_t *mutex" "int flags" +.Ft int +.Fn mtx_owned "mtx_t *mutex" +.Sh DESCRIPTION +The +.Fn mtx_enter +function acquires a mutual exclusion lock +on behalf of the currently running kernel thread. +If another kernel thread is holding the mutex, +the caller will be disconnected from the CPU +until the mutex is available +(i.e. it will sleep), +spin wait for the mutex, +or possibly a combination of both. +.Pp +It is possible for the same thread to recursively acquire a mutex +with no ill effects; +if recursion on a given mutex can be avoided, +faster and smaller code will usually be generated. +.Pp +The +.Fn mtx_exit +function releases a mutual exclusion lock; +if a higher priority thread is waiting for the mutex, +the releasing thread may be disconnected +to allow the higher priority thread to acquire the mutex and run. +.Pp +The type of a mutex is not an attribute of the mutex, +but instead a function of the +.Fa flags +argument passed to +.Fn mtx_enter +and +.Fn mtx_exit ; +this allows code to be generated for the specific mutex type +at compile time +and avoids wasting run time on the determination of lock features. +This does place on the programmer, +the burden of using matching forms of the +.Fn mtx_enter +and +.Fn mtx_exit +functions for a given mutex. +It is an error to acquire a mutex in one mode (e.g. spin) +and release it in another (e.g. default). +It is also an error to get the lock in one mode +and allow another thread to attempt to get the lock in another mode. +A good general rule is to always use a given mutex in one mode only. +.Pp +The +.Fn mtx_owned +function returns a non-zero value +if the mutex pointed to is already held by the current thread. +.Ss The default Mutex Type +Most kernel code should use the default lock type; +the default lock type will allow the thread +to be disconnected from the CPU +if it cannot get the lock. +The machine dependent implementation +may treat the lock as a short term spin lock +under some circumstances. +However, it is always safe to use these forms of locks +in an interrupt thread +without fear of deadlock +against an interrupted thread on the same CPU. +.Ss The spin Mutex Type +A spin mutex will not relinquish the CPU +when it cannot immediately get the requested lock, +but will loop, waiting for the mutex to be released by another CPU. +This could result in deadlock +if a thread interrupted the thread which held a mutex +and then tried to acquire the mutex; +for this reason spin locks will disable all interrupts +(on the local CPU only) +by default. +.Pp +Spin locks are fairly specialized locks +that are intended to be held for very short periods of time; +their primary purpose is to protect portions of the code +that implement default (i.e. sleep) locks. +.Ss Flags +The flags passed to the +.Fn mtx_enter +and +.Fn mtx_exit +functions determine what type of mutex is being used +and also provide various options +used to generate more efficient code under certain circumstances. +.Pp +Both lock types (default and spin) +can be acquired recursively by the same thread. +This behavior can be changed with flags. 
+.Pp +The type of the mutex must always be specified: +.Bl -tag -width MTX_NORECURSE +.It Dv MTX_DEF +Default lock type; +will always allow the current thread to be suspended +to avoid deadlock conditions against interrupt threads. +The machine dependent implementation of this lock type +may spin for a while before suspending the current thread. +Most locks should be of this type. +.It Dv MTX_SPIN +Spin lock; +will never relinquish the CPU. +By default all interrupts are disabled on the local CPU +while any spin lock is held. +.El +.Pp +Options that modify mutex behavior: +.Bl -tag -width MTX_NORECURSE +.It Dv MTX_NORECURSE +If it is known, absolutely, +that the mutex will not be recursively acquired at this invocation +then this flag should be specified. +.Pp +If the lock is already held by the current thread, +then a kernel with +.Dv SMP_DEBUG +defined will panic; +without debugging enabled, +the thread may deadlock against itself +or leave the mutex in a corrupted state. +.Pp +This flag prevents generation of additional inline code +to deal with recursive lock acquisitions +and should be specified whenever possible +in the interests of efficiency. +Not specifying this flag will only cause the generated code +to be a little larger than necessary; +it will still operate correctly. +.It Dv MTX_RLIKELY +This provides a hint that it is likely that this mutex +will be held recursively at this invocation. +The actual optimization used is machine dependent; +generally, this will inline code to handle recursion +where a function call would otherwise be needed. +.Pp +This is a hint only; +leaving it out or specifying it inappropriately +will not cause any great harm other than +possibly generating less efficient code. +.It Dv MTX_TOPHALF +This option applies to spin locks only. +It indicates that the mutex is never acquired +from an interrupt thread, +so it is safe to leave interrupts enabled while holding the lock. +Since an interrupt may occur while holding the lock, +this may be detrimental to other processors +spin waiting for the lock. +Do not forget to include this option when the lock is released. +.Pp +This option should not be used in new code; +it is documented here for completeness only. +.It Dv MTX_FIRST +This option applies to spin locks only. +It indicates this is the first spin lock acquired by the thread. +No other spin locks may be held, +and the requested lock also may not be currently held. +Do not forget to include this option when the lock is released. +.It Dv MTX_NOSWITCH +When releasing a mutex, +this flag prevents a thread switch that might occur +if another higher priority thread was waiting for the mutex. +This may cause priority inversion and should be used carefully. +.Pp +This flag is used internally by the lock code. +It should not be used in general kernel code +and is documented here for completeness only. +.It Dv MTX_NOSPIN +For default locks, +this hint will prevent spinning before relinquishing the CPU. +This should be specified when it is known +that the lock will usually remain unavailable for some time +when it is not immediately available +(i.e.: coarse grained locks protecting large subsystems). +.El +.Sh HISTORY +These +functions appeared in BSD/OS 4.1 and +.Fx 5.0 . 
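
For illustration only, not part of the patch above: a minimal sketch of how the mtx_enter(9)/mtx_exit(9) interface documented in the new mutex.9 page is meant to be used. The header name shown and the "foo" names are assumptions made for the example; mtx_init(), mtx_enter(), mtx_exit(), mtx_owned() and the MTX_DEF/MTX_SPIN flags follow their use elsewhere in this commit (e.g. mtx_init(&Giant, "Giant", MTX_DEF) in alpha_init()).

/*
 * Hedged usage sketch for the mutex primitives documented above.
 * <machine/mutex.h> and struct/function names prefixed "foo" are
 * assumptions for the example, not taken from the commit.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/mutex.h>		/* assumed location of mtx_t and MTX_* */

static mtx_t foo_mtx;			/* default (sleep) mutex */
static int foo_count;			/* state protected by foo_mtx */

static void
foo_init(void)
{
	/* Same form as mtx_init(&Giant, "Giant", MTX_DEF) in this commit. */
	mtx_init(&foo_mtx, "foo lock", MTX_DEF);
}

static void
foo_bump(void)
{
	/*
	 * Acquire and release with the same type flag; per the man page it
	 * is an error to take a mutex as MTX_DEF and drop it as MTX_SPIN.
	 */
	mtx_enter(&foo_mtx, MTX_DEF);
	foo_count++;
	mtx_exit(&foo_mtx, MTX_DEF);
}

static void
foo_assert_locked(void)
{
	/* mtx_owned() is non-zero when the current thread holds the mutex. */
	if (!mtx_owned(&foo_mtx))
		panic("foo_assert_locked: foo_mtx not held");
}

A spin variant would pass MTX_SPIN to mtx_init(), mtx_enter() and mtx_exit() instead (as the commit does for sched_lock), and per the man page should protect only very short critical sections since interrupts are disabled on the local CPU while it is held.
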
diff --git a/sys/alpha/alpha/clock.c b/sys/alpha/alpha/clock.c index 88adaa41a007..500d1694510f 100644 --- a/sys/alpha/alpha/clock.c +++ b/sys/alpha/alpha/clock.c @@ -43,6 +43,8 @@ * @(#)clock.c 8.1 (Berkeley) 6/10/93 */ +#include "opt_clock.h" + #include /* RCS ID & Copyright macro defns */ #include @@ -80,8 +82,23 @@ int disable_rtc_set; /* disable resettodr() if != 0 */ int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; -extern int cycles_per_sec; +#define TIMER_DIV(x) ((timer_freq + (x) / 2) / (x)) +#ifndef TIMER_FREQ +#define TIMER_FREQ 1193182 +#endif +u_int32_t timer_freq = TIMER_FREQ; +int timer0_max_count; + +static u_int32_t i8254_lastcount; +static u_int32_t i8254_offset; +static int i8254_ticked; +static int clkintr_pending = 0; + +extern int cycles_per_sec; +extern int ncpus; + +static timecounter_get_t i8254_get_timecount; static timecounter_get_t alpha_get_timecount; static struct timecounter alpha_timecounter = { @@ -95,6 +112,17 @@ static struct timecounter alpha_timecounter = { SYSCTL_OPAQUE(_debug, OID_AUTO, alpha_timecounter, CTLFLAG_RD, &alpha_timecounter, sizeof(alpha_timecounter), "S,timecounter", ""); +static struct timecounter i8254_timecounter = { + i8254_get_timecount, /* get_timecount */ + 0, /* no poll_pps */ + ~0u, /* counter_mask */ + 0, /* frequency */ + "i8254" /* name */ +}; + +SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, + &i8254_timecounter, sizeof(i8254_timecounter), "S,timecounter", ""); + /* Values for timerX_state: */ #define RELEASED 0 #define RELEASE_PENDING 1 @@ -120,11 +148,14 @@ static u_int32_t max_cycles_per_tick; static u_int32_t last_time; static void handleclock(void* arg); -static u_int32_t calibrate_clocks(u_int32_t firmware_freq); +static void calibrate_clocks(u_int32_t firmware_freq, + u_int32_t *pcc, u_int32_t *timer); +static void set_timer_freq(u_int freq, int intr_freq); void clockattach(device_t dev) { + u_int32_t pcc, freq, delta; /* * Just bookkeeping. @@ -132,7 +163,33 @@ clockattach(device_t dev) if (clockdev) panic("clockattach: multiple clocks"); clockdev = dev; - cycles_per_sec = calibrate_clocks(cycles_per_sec); + + calibrate_clocks(cycles_per_sec, &pcc, &freq); + cycles_per_sec = pcc; + + /* + * Use the calibrated i8254 frequency if it seems reasonable. + * Otherwise use the default, and don't use the calibrated i586 + * frequency. + */ + delta = freq > timer_freq ? 
freq - timer_freq : timer_freq - freq; + if (delta < timer_freq / 100) { +#ifndef CLK_USE_I8254_CALIBRATION + if (bootverbose) + printf( +"CLK_USE_I8254_CALIBRATION not specified - using default frequency\n"); + freq = timer_freq; +#endif + timer_freq = freq; + } else { + if (bootverbose) + printf( + "%d Hz differs from default of %d Hz by more than 1%%\n", + freq, timer_freq); + } + set_timer_freq(timer_freq, hz); + i8254_timecounter.tc_frequency = timer_freq; + #ifdef EVCNT_COUNTERS evcnt_attach(dev, "intr", &clock_intr_evcnt); #endif @@ -190,8 +247,12 @@ cpu_initclocks() scaled_ticks_per_cycle = ((u_int64_t)hz << FIX_SHIFT) / freq; max_cycles_per_tick = 2*freq / hz; - alpha_timecounter.tc_frequency = freq; - tc_init(&alpha_timecounter); + tc_init(&i8254_timecounter); + + if (ncpus == 1) { + alpha_timecounter.tc_frequency = freq; + tc_init(&alpha_timecounter); + } stathz = 128; platform.clockintr = (void (*) __P((void *))) handleclock; @@ -202,15 +263,36 @@ cpu_initclocks() CLOCK_INIT(clockdev); } -static u_int32_t -calibrate_clocks(u_int32_t firmware_freq) +static int +getit(void) +{ + int high, low; + int s; + + s = splhigh(); + + /* Select timer0 and latch counter value. */ + outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); + + low = inb(TIMER_CNTR0); + high = inb(TIMER_CNTR0); + + splx(s); + return ((high << 8) | low); +} + +static void +calibrate_clocks(u_int32_t firmware_freq, u_int32_t *pcc, u_int32_t *timer) { u_int32_t start_pcc, stop_pcc; + u_int count, prev_count, tot_count; int sec, start_sec; if (bootverbose) printf("Calibrating clock(s) ... "); + set_timer_freq(timer_freq, hz); + /* Read the mc146818A seconds counter. */ if (CLOCK_GETSECS(clockdev, &sec)) goto fail; @@ -224,16 +306,36 @@ calibrate_clocks(u_int32_t firmware_freq) break; } - /* Start keeping track of the PCC. */ + /* Start keeping track of the PCC and i8254. */ + prev_count = getit(); + if (prev_count == 0) + goto fail; + tot_count = 0; + start_pcc = alpha_rpcc(); /* - * Wait for the mc146818A seconds counter to change. + * Wait for the mc146818A seconds counter to change. Read the i8254 + * counter for each iteration since this is convenient and only + * costs a few usec of inaccuracy. The timing of the final reads + * of the counters almost matches the timing of the initial reads, + * so the main cause of inaccuracy is the varying latency from + * inside getit() or rtcin(RTC_STATUSA) to the beginning of the + * rtcin(RTC_SEC) that returns a changed seconds count. The + * maximum inaccuracy from this cause is < 10 usec on 486's. 
*/ start_sec = sec; for (;;) { if (CLOCK_GETSECS(clockdev, &sec)) goto fail; + count = getit(); + if (count == 0) + goto fail; + if (count > prev_count) + tot_count += prev_count - (count - timer0_max_count); + else + tot_count += prev_count - count; + prev_count = count; if (sec != start_sec) break; } @@ -246,29 +348,55 @@ calibrate_clocks(u_int32_t firmware_freq) if (bootverbose) { printf("PCC clock: %u Hz (firmware %u Hz)\n", stop_pcc - start_pcc, firmware_freq); + printf("i8254 clock: %u Hz\n", tot_count); } - return (stop_pcc - start_pcc); + *pcc = stop_pcc - start_pcc; + *timer = tot_count; + return; fail: if (bootverbose) printf("failed, using firmware default of %u Hz\n", firmware_freq); - return (firmware_freq); + + *pcc = firmware_freq; + *timer = 0; + return; +} + +static void +set_timer_freq(u_int freq, int intr_freq) +{ + int new_timer0_max_count; + int s; + + s = splhigh(); + timer_freq = freq; + new_timer0_max_count = TIMER_DIV(intr_freq); + if (new_timer0_max_count != timer0_max_count) { + timer0_max_count = new_timer0_max_count; + outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); + outb(TIMER_CNTR0, timer0_max_count & 0xff); + outb(TIMER_CNTR0, timer0_max_count >> 8); + } + splx(s); } static void handleclock(void* arg) { - u_int32_t now = alpha_rpcc(); - u_int32_t delta = now - last_time; - last_time = now; - - if (delta > max_cycles_per_tick) { - int i, missed_ticks; - missed_ticks = (delta * scaled_ticks_per_cycle) >> FIX_SHIFT; - for (i = 0; i < missed_ticks; i++) - hardclock(arg); + if (timecounter->tc_get_timecount == i8254_get_timecount) { + int s = splhigh(); + if (i8254_ticked) + i8254_ticked = 0; + else { + i8254_offset += timer0_max_count; + i8254_lastcount = 0; + } + clkintr_pending = 0; + splx(s); } + hardclock(arg); setdelayed(); } @@ -432,6 +560,35 @@ resettodr() CLOCK_SET(clockdev, &ct); } +static unsigned +i8254_get_timecount(struct timecounter *tc) +{ + u_int count; + u_int high, low; + int s; + + s = splhigh(); + + /* Select timer0 and latch counter value. */ + outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); + + low = inb(TIMER_CNTR0); + high = inb(TIMER_CNTR0); + count = timer0_max_count - ((high << 8) | low); + if (count < i8254_lastcount || + (!i8254_ticked && (clkintr_pending || + ((count < 20) && (inb(IO_ICU1) & 1))) + )) { + i8254_ticked = 1; + i8254_offset += timer0_max_count; + } + i8254_lastcount = count; + count += i8254_offset; + + splx(s); + return (count); +} + static unsigned alpha_get_timecount(struct timecounter* tc) { @@ -477,15 +634,6 @@ sysbeepstop(void *chan) beeping = 0; } -/* - * Frequency of all three count-down timers; (TIMER_FREQ/freq) is the - * appropriate count to generate a frequency of freq hz. 
- */ -#ifndef TIMER_FREQ -#define TIMER_FREQ 1193182 -#endif -#define TIMER_DIV(x) ((TIMER_FREQ+(x)/2)/(x)) - int sysbeep(int pitch, int period) { diff --git a/sys/alpha/alpha/genassym.c b/sys/alpha/alpha/genassym.c index a67f2d11275c..066d87b4fee3 100644 --- a/sys/alpha/alpha/genassym.c +++ b/sys/alpha/alpha/genassym.c @@ -51,8 +51,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -66,6 +69,21 @@ #include #include +#include "opt_smp.h" + +ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); +ASSYM(GD_FPCURPROC, offsetof(struct globaldata, gd_fpcurproc)); +ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); +ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); +ASSYM(GD_CPUNO, offsetof(struct globaldata, gd_cpuno)); +ASSYM(GD_IDLEPCBPHYS, offsetof(struct globaldata, gd_idlepcbphys)); +ASSYM(GD_ASTPENDING, offsetof(struct globaldata, gd_astpending)); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSE, offsetof(struct mtx, mtx_recurse)); +ASSYM(MTX_SAVEIPL, offsetof(struct mtx, mtx_saveipl)); +ASSYM(MTX_UNOWNED, MTX_UNOWNED); + ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_MD_FLAGS, offsetof(struct proc, p_md.md_flags)); ASSYM(P_MD_PCBPADDR, offsetof(struct proc, p_md.md_pcbpaddr)); @@ -81,6 +99,7 @@ ASSYM(PTESIZE, PTESIZE); ASSYM(U_PCB_ONFAULT, offsetof(struct user, u_pcb.pcb_onfault)); ASSYM(U_PCB_HWPCB_KSP, offsetof(struct user, u_pcb.pcb_hw.apcb_ksp)); ASSYM(U_PCB_CONTEXT, offsetof(struct user, u_pcb.pcb_context)); +ASSYM(U_PCB_SCHEDNEST, offsetof(struct user, u_pcb.pcb_schednest)); ASSYM(PCB_HW, offsetof(struct pcb, pcb_hw)); diff --git a/sys/alpha/alpha/interrupt.c b/sys/alpha/alpha/interrupt.c index deedefeb9710..20f621e0bd29 100644 --- a/sys/alpha/alpha/interrupt.c +++ b/sys/alpha/alpha/interrupt.c @@ -33,6 +33,8 @@ * notice. */ +#include "opt_ddb.h" + #include /* RCS ID & Copyright macro defns */ /* __KERNEL_RCSID(0, "$NetBSD: interrupt.c,v 1.23 1998/02/24 07:38:01 thorpej Exp $");*/ @@ -43,12 +45,15 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #ifdef EVCNT_COUNTERS struct evcnt clock_intr_evcnt; /* event counter for clock intrs. */ @@ -56,8 +61,11 @@ struct evcnt clock_intr_evcnt; /* event counter for clock intrs. */ #include #endif +#ifdef DDB +#include +#endif + volatile int mc_expected, mc_received; -u_int32_t intr_nesting_level; static void dummy_perf(unsigned long vector, struct trapframe *framep) @@ -75,13 +83,19 @@ interrupt(a0, a1, a2, framep) unsigned long a0, a1, a2; struct trapframe *framep; { + /* + * Find our per-cpu globals. 
+ */ + globalp = (struct globaldata *) alpha_pal_rdval(); - atomic_add_int(&intr_nesting_level, 1); + atomic_add_int(&PCPU_GET(intr_nesting_level), 1); { struct proc* p = curproc; if (!p) p = &proc0; - if ((caddr_t) framep < (caddr_t) p->p_addr + 1024) + if ((caddr_t) framep < (caddr_t) p->p_addr + 1024) { + mtx_enter(&Giant, MTX_DEF); panic("possible stack overflow\n"); + } } framep->tf_regs[FRAME_TRAPARG_A0] = a0; @@ -89,10 +103,18 @@ interrupt(a0, a1, a2, framep) framep->tf_regs[FRAME_TRAPARG_A2] = a2; switch (a0) { case ALPHA_INTR_XPROC: /* interprocessor interrupt */ - printf("interprocessor interrupt!\n"); + CTR0(KTR_INTR|KTR_SMP, "interprocessor interrupt"); + smp_handle_ipi(framep); /* note: lock not taken */ break; case ALPHA_INTR_CLOCK: /* clock interrupt */ + CTR0(KTR_INTR, "clock interrupt"); + if (PCPU_GET(cpuno) != hwrpb->rpb_primary_cpu_id) { + CTR0(KTR_INTR, "ignoring clock on secondary"); + return; + } + + mtx_enter(&Giant, MTX_DEF); cnt.v_intr++; #ifdef EVCNT_COUNTERS clock_intr_evcnt.ev_count++; @@ -105,24 +127,31 @@ interrupt(a0, a1, a2, framep) if((++schedclk2 & 0x7) == 0) statclock((struct clockframe *)framep); } + mtx_exit(&Giant, MTX_DEF); break; case ALPHA_INTR_ERROR: /* Machine Check or Correctable Error */ + mtx_enter(&Giant, MTX_DEF); a0 = alpha_pal_rdmces(); if (platform.mcheck_handler) (*platform.mcheck_handler)(a0, framep, a1, a2); else machine_check(a0, framep, a1, a2); + mtx_exit(&Giant, MTX_DEF); break; case ALPHA_INTR_DEVICE: /* I/O device interrupt */ + mtx_enter(&Giant, MTX_DEF); cnt.v_intr++; if (platform.iointr) (*platform.iointr)(framep, a1); + mtx_exit(&Giant, MTX_DEF); break; case ALPHA_INTR_PERF: /* interprocessor interrupt */ + mtx_enter(&Giant, MTX_DEF); perf_irq(a1, framep); + mtx_exit(&Giant, MTX_DEF); break; case ALPHA_INTR_PASSIVE: @@ -132,11 +161,12 @@ interrupt(a0, a1, a2, framep) break; default: + mtx_enter(&Giant, MTX_DEF); panic("unexpected interrupt: type 0x%lx vec 0x%lx a2 0x%lx\n", a0, a1, a2); /* NOTREACHED */ } - atomic_subtract_int(&intr_nesting_level, 1); + atomic_subtract_int(&PCPU_GET(intr_nesting_level), 1); } void @@ -204,6 +234,7 @@ machine_check(mces, framep, vector, param) printf(" pid = %d, comm = %s\n", curproc->p_pid, curproc->p_comm); printf("\n"); + kdb_trap(mces, vector, param, ALPHA_KENTRY_MM, framep); panic("machine check"); } diff --git a/sys/alpha/alpha/ipl_funcs.c b/sys/alpha/alpha/ipl_funcs.c index 8c2cb67c27fa..6642bce13dc3 100644 --- a/sys/alpha/alpha/ipl_funcs.c +++ b/sys/alpha/alpha/ipl_funcs.c @@ -30,9 +30,13 @@ #include #include #include +#include #include #include #include +#include +#include +#include #include #include "sio.h" @@ -129,7 +133,9 @@ do_sir() u_int32_t pend; int i; - atomic_add_int(&intr_nesting_level, 1); + mtx_enter(&Giant, MTX_DEF); + + atomic_add_int(&PCPU_GET(intr_nesting_level), 1); splsoft(); while ((pend = atomic_readandclear(&ipending)) != 0) { for (i = 0; pend && i < 32; i++) { @@ -142,7 +148,9 @@ do_sir() } } } - atomic_subtract_int(&intr_nesting_level, 1); + atomic_subtract_int(&PCPU_GET(intr_nesting_level), 1); + + mtx_exit(&Giant, MTX_DEF); } #define GENSET(name, ptr, bit) \ diff --git a/sys/alpha/alpha/locore.s b/sys/alpha/alpha/locore.s index 221c6731dd5d..2da1315bb856 100644 --- a/sys/alpha/alpha/locore.s +++ b/sys/alpha/alpha/locore.s @@ -77,7 +77,7 @@ */ #define SWITCH_CONTEXT \ /* Make a note of the context we're running on. */ \ - stq a0, curpcb ; \ + stq a0, GD_CURPCB(globalp); \ \ /* Swap in the new context. 
*/ \ call_pal PAL_OSF1_swpctx @@ -106,6 +106,12 @@ ldiq a0, VPTBASE call_pal PAL_OSF1_wrvptptr /* clobbers a0, t0, t8-t11 */ + /* + * Initialise globalp. + */ + call_pal PAL_OSF1_rdval /* clobbers t0, t8-t11 */ + mov v0, globalp + /* * Switch to proc0's PCB, which is at U_PCB off of proc0paddr. */ @@ -126,18 +132,50 @@ * Note that setregs() is responsible for setting its contents * to 'reasonable' values. */ - lda sp,-(FRAME_SIZE * 8)(sp) /* space for struct trapframe */ + lda sp,-288(sp) /* space for struct trapframe */ mov sp, a0 /* arg is frame ptr */ CALL(mi_startup) /* go to mi_startup()! */ - /* - * Call exception_return, to simulate return from (fake) - * exception to user-land, running process 1, init! - */ - jmp zero, exception_return /* "And that's all she wrote." */ + /* NOTREACHED */ + END(locorestart) + /* + * Secondary processors start executing here. They will have their + * unique value set to point at the per-cpu structure and will + * be executing on their private idle stack. + */ + NESTED(smp_init_secondary_glue, 1, 0, ra, 0, 0) + mov pv, globalp + ldiq a0, ALPHA_PSL_IPL_HIGH /* disable all interrupts */ + call_pal PAL_OSF1_swpipl + + br pv, 1f +1: LDGP(pv) + + mov gp, a0 + call_pal PAL_OSF1_wrkgp /* clobbers a0, t0, t8-t11 */ + + ldiq a0, -2 /* TBIA */ + call_pal PAL_OSF1_tbi + call_pal PAL_imb + + ldq a0, GD_IDLEPCBPHYS(globalp) /* switch to idle ctx */ + call_pal PAL_OSF1_swpctx + + CALL(smp_init_secondary) /* initialise the rest */ + + /* + * After initialising, we start idling for real. + * We have the kernel lock at this point. + */ + CALL(cpu_switch) /* never returns */ + + call_pal PAL_halt + + END(smp_init_secondary_glue) + /**************************************************************************/ /* diff --git a/sys/alpha/alpha/machdep.c b/sys/alpha/alpha/machdep.c index 598362d31acb..9af44415134b 100644 --- a/sys/alpha/alpha/machdep.c +++ b/sys/alpha/alpha/machdep.c @@ -97,6 +97,8 @@ #include #include #include +#include +#include #include #include #include @@ -127,6 +129,8 @@ #include #include #include +#include +#include #include #include #include @@ -140,18 +144,17 @@ #include #include -struct proc* curproc; -struct proc* fpcurproc; -struct pcb* curpcb; u_int64_t cycles_per_usec; u_int32_t cycles_per_sec; -int whichqs, whichrtqs, whichidqs; int cold = 1; struct platform platform; alpha_chipset_t chipset; struct bootinfo_kernel bootinfo; -struct timeval switchtime; -int switchticks; + +struct cpuhead cpuhead; + +mtx_t sched_lock; +mtx_t Giant; struct user *proc0paddr; @@ -419,6 +422,14 @@ cpu_startup(dummy) vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, alpha_srm_shutdown, 0, SHUTDOWN_PRI_LAST); + +#ifdef SMP + /* + * OK, enough kmem_alloc/malloc state should be up, lets get on with it! + */ + mp_start(); /* fire up the secondaries */ + mp_announce(); +#endif /* SMP */ } int @@ -977,12 +988,26 @@ alpha_init(pfn, ptb, bim, bip, biv) proc0.p_addr = proc0paddr = (struct user *)pmap_steal_memory(UPAGES * PAGE_SIZE); + /* + * Setup the global data for the bootstrap cpu. + */ + { + size_t sz = round_page(UPAGES * PAGE_SIZE); + globalp = (struct globaldata *) pmap_steal_memory(sz); + globaldata_init(globalp, alpha_pal_whami(), sz); + alpha_pal_wrval((u_int64_t) globalp); + PCPU_GET(next_asn) = 1; /* 0 used for proc0 pmap */ + } + /* * Initialize the virtual memory system, and set the * page table base register in proc 0's PCB. 
*/ pmap_bootstrap(ALPHA_PHYS_TO_K0SEG(alpha_ptob(ptb)), hwrpb->rpb_max_asn); + hwrpb->rpb_vptb = VPTBASE; + hwrpb->rpb_checksum = hwrpb_checksum(); + /* * Initialize the rest of proc 0's PCB, and cache its physical @@ -999,6 +1024,29 @@ alpha_init(pfn, ptb, bim, bip, biv) (u_int64_t)proc0paddr + USPACE - sizeof(struct trapframe); proc0.p_md.md_tf = (struct trapframe *)proc0paddr->u_pcb.pcb_hw.apcb_ksp; + PCPU_SET(curproc, &proc0); + + /* + * Get the right value for the boot cpu's idle ptbr. + */ + globalp->gd_idlepcb.apcb_ptbr = proc0.p_addr->u_pcb.pcb_hw.apcb_ptbr; + + /* + * Record all cpus in a list. + */ + SLIST_INIT(&cpuhead); + SLIST_INSERT_HEAD(&cpuhead, GLOBALP, gd_allcpu); + + /* + * Initialise the kernel lock. + */ + mtx_init(&Giant, "Giant", MTX_DEF); + mtx_init(&sched_lock, "sched lock", MTX_SPIN); + + /* + * Enable interrupts on first release (in switch_trampoline). + */ + sched_lock.mtx_saveipl = ALPHA_PSL_IPL_0; /* * Look at arguments passed to us and compute boothowto. @@ -1118,6 +1166,8 @@ alpha_init(pfn, ptb, bim, bip, biv) #endif } + hwrpb_restart_setup(); + alpha_pal_wrfen(0); } @@ -2034,9 +2084,14 @@ SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, void alpha_fpstate_check(struct proc *p) { + /* + * For SMP, we should check the fpcurproc of each cpu. + */ +#ifndef SMP if (p->p_addr->u_pcb.pcb_hw.apcb_flags & ALPHA_PCB_FLAGS_FEN) if (p != fpcurproc) panic("alpha_check_fpcurproc: bogus"); +#endif } #define SET_FEN(p) \ diff --git a/sys/alpha/alpha/mem.c b/sys/alpha/alpha/mem.c index 940d8271d62a..196ed143aacd 100644 --- a/sys/alpha/alpha/mem.c +++ b/sys/alpha/alpha/mem.c @@ -261,9 +261,12 @@ mem_modevent(module_t mod, int type, void *data) case MOD_LOAD: if (bootverbose) printf("mem: \n"); +/* XXX - ??? */ +#if 0 /* Initialise memory range handling */ if (mem_range_softc.mr_op != NULL) mem_range_softc.mr_op->init(&mem_range_softc); +#endif memdev = make_dev(&mem_cdevsw, 0, UID_ROOT, GID_KMEM, 0640, "mem"); diff --git a/sys/alpha/alpha/mp_machdep.c b/sys/alpha/alpha/mp_machdep.c new file mode 100644 index 000000000000..367b57e813bc --- /dev/null +++ b/sys/alpha/alpha/mp_machdep.c @@ -0,0 +1,1115 @@ +/*- + * Copyright (c) 2000 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define CHECKSTATE_USER 0 +#define CHECKSTATE_SYS 1 +#define CHECKSTATE_INTR 2 + +volatile u_int stopped_cpus; +volatile u_int started_cpus; +volatile u_int checkstate_probed_cpus; +volatile u_int checkstate_need_ast; +volatile u_int checkstate_pending_ast; +struct proc* checkstate_curproc[NCPUS]; +int checkstate_cpustate[NCPUS]; +u_long checkstate_pc[NCPUS]; +volatile u_int resched_cpus; +void (*cpustop_restartfunc) __P((void)); +int mp_ncpus; + +int smp_started; +int boot_cpu_id; +u_int32_t all_cpus; + +static struct globaldata *cpuno_to_globaldata[NCPUS]; + +int smp_active = 0; /* are the APs allowed to run? */ +SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, ""); + +/* Is forwarding of a interrupt to the CPU holding the ISR lock enabled ? */ +int forward_irq_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_irq_enabled, CTLFLAG_RW, + &forward_irq_enabled, 0, ""); + +/* Enable forwarding of a signal to a process running on a different CPU */ +static int forward_signal_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, + &forward_signal_enabled, 0, ""); + +/* Enable forwarding of roundrobin to all other cpus */ +static int forward_roundrobin_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, + &forward_roundrobin_enabled, 0, ""); + +/* + * Communicate with a console running on a secondary processor. + * Return 1 on failure. + */ +static int +smp_send_secondary_command(const char *command, int cpuno) +{ + u_int64_t mask = 1L << cpuno; + struct pcs *cpu = LOCATE_PCS(hwrpb, cpuno); + int i, len; + + /* + * Sanity check. + */ + len = strlen(command); + if (len > sizeof(cpu->pcs_buffer.rxbuf)) { + printf("smp_send_secondary_command: command '%s' too long\n", + command); + return 0; + } + + /* + * Wait for the rx bit to clear. + */ + for (i = 0; i < 100000; i++) { + if (!(hwrpb->rpb_rxrdy & mask)) + break; + DELAY(10); + } + if (hwrpb->rpb_rxrdy & mask) + return 0; + + /* + * Write the command into the processor's buffer. + */ + bcopy(command, cpu->pcs_buffer.rxbuf, len); + cpu->pcs_buffer.rxlen = len; + + /* + * Set the bit in the rxrdy mask and let the secondary try to + * handle the command. + */ + atomic_set_64(&hwrpb->rpb_rxrdy, mask); + + /* + * Wait for the rx bit to clear. + */ + for (i = 0; i < 100000; i++) { + if (!(hwrpb->rpb_rxrdy & mask)) + break; + DELAY(10); + } + if (hwrpb->rpb_rxrdy & mask) + return 0; + + return 1; +} + +void +smp_init_secondary(void) +{ + /* + * Record the globaldata pointer in the per-cpu system value. + */ + alpha_pal_wrval((u_int64_t) globalp); + + /* + * Point interrupt/exception vectors to our own. + */ + alpha_pal_wrent(XentInt, ALPHA_KENTRY_INT); + alpha_pal_wrent(XentArith, ALPHA_KENTRY_ARITH); + alpha_pal_wrent(XentMM, ALPHA_KENTRY_MM); + alpha_pal_wrent(XentIF, ALPHA_KENTRY_IF); + alpha_pal_wrent(XentUna, ALPHA_KENTRY_UNA); + alpha_pal_wrent(XentSys, ALPHA_KENTRY_SYS); + + mtx_enter(&Giant, MTX_DEF); + + printf("smp_init_secondary: called\n"); + CTR0(KTR_SMP, "smp_init_secondary"); + + /* + * Add to mask. 
+ */ + smp_started = 1; + if (PCPU_GET(cpuno) + 1 > mp_ncpus) + mp_ncpus = PCPU_GET(cpuno) + 1; + spl0(); + smp_ipi_all(0); + + mtx_exit(&Giant, MTX_DEF); +} + +extern void smp_init_secondary_glue(void); + +static int +smp_start_secondary(int cpuno) +{ + struct pcs *cpu = LOCATE_PCS(hwrpb, cpuno); + struct pcs *bootcpu = LOCATE_PCS(hwrpb, hwrpb->rpb_primary_cpu_id); + struct alpha_pcb *pcb = (struct alpha_pcb *) cpu->pcs_hwpcb; + struct globaldata *globaldata; + int i; + size_t sz; + + if ((cpu->pcs_flags & PCS_PV) == 0) { + printf("smp_start_secondary: cpu %d PALcode invalid\n", cpuno); + return 0; + } + + printf("smp_start_secondary: starting cpu %d\n", cpuno); + + sz = round_page(UPAGES * PAGE_SIZE); + globaldata = malloc(sz, M_TEMP, M_NOWAIT); + if (!globaldata) { + printf("smp_start_secondary: can't allocate memory\n"); + return 0; + } + + globaldata_init(globaldata, cpuno, sz); + + /* + * Copy the idle pcb and setup the address to start executing. + * Use the pcb unique value to point the secondary at its globaldata + * structure. + */ + *pcb = globaldata->gd_idlepcb; + hwrpb->rpb_restart = (u_int64_t) smp_init_secondary_glue; + hwrpb->rpb_restart_val = (u_int64_t) globaldata; + hwrpb->rpb_checksum = hwrpb_checksum(); + + /* + * Tell the cpu to start with the same PALcode as us. + */ + bcopy(&bootcpu->pcs_pal_rev, &cpu->pcs_pal_rev, + sizeof cpu->pcs_pal_rev); + + /* + * Set flags in cpu structure and push out write buffers to + * make sure the secondary sees it. + */ + cpu->pcs_flags |= PCS_CV|PCS_RC; + cpu->pcs_flags &= ~PCS_BIP; + alpha_mb(); + + /* + * Fire it up and hope for the best. + */ + if (!smp_send_secondary_command("START\r\n", cpuno)) { + printf("smp_init_secondary: can't send START command\n"); + free(globaldata, M_TEMP); + return 0; + } + + /* + * Wait for the secondary to set the BIP flag in its structure. + */ + for (i = 0; i < 100000; i++) { + if (cpu->pcs_flags & PCS_BIP) + break; + DELAY(10); + } + if (!(cpu->pcs_flags & PCS_BIP)) { + printf("smp_init_secondary: secondary did not respond\n"); + free(globaldata, M_TEMP); + } + + /* + * It worked (I think). + */ + /* if (bootverbose) */ + printf("smp_init_secondary: cpu %d started\n", cpuno); + + return 1; +} + +/* + * Initialise a struct globaldata. + */ +void +globaldata_init(struct globaldata *globaldata, int cpuno, size_t sz) +{ + bzero(globaldata, sz); + globaldata->gd_idlepcbphys = vtophys((vm_offset_t) &globaldata->gd_idlepcb); + globaldata->gd_idlepcb.apcb_ksp = (u_int64_t) + ((caddr_t) globaldata + sz - sizeof(struct trapframe)); + globaldata->gd_idlepcb.apcb_ptbr = proc0.p_addr->u_pcb.pcb_hw.apcb_ptbr; + globaldata->gd_cpuno = cpuno; + globaldata->gd_other_cpus = all_cpus & ~(1 << cpuno); + globaldata->gd_next_asn = 0; + globaldata->gd_current_asngen = 1; + cpuno_to_globaldata[cpuno] = globaldata; +} + +struct globaldata * +globaldata_find(int cpuno) +{ + return cpuno_to_globaldata[cpuno]; +} + +/* Implementation of simplelocks */ + +/* + * Atomically swap the value of *p with val. Return the old value of *p. 
+ */ +static __inline int +atomic_xchg(volatile u_int *p, u_int val) +{ + u_int32_t oldval, temp; + __asm__ __volatile__ ( + "1:\tldl_l %0,%3\n\t" /* load current value */ + "mov %4,%1\n\t" /* value to store */ + "stl_c %1,%2\n\t" /* attempt to store */ + "beq %1,2f\n\t" /* if the store failed, spin */ + "br 3f\n" /* it worked, exit */ + "2:\tbr 1b\n" /* *p not updated, loop */ + "3:\n" /* it worked */ + : "=&r"(oldval), "=r"(temp), "=m" (*p) + : "m"(*p), "r"(val) + : "memory"); + return oldval; +} + +void +s_lock_init(struct simplelock *lkp) +{ + lkp->lock_data = 0; +} + +void +s_lock(struct simplelock *lkp) +{ + for (;;) { + if (s_lock_try(lkp)) + return; + + /* + * Spin until clear. + */ + while (lkp->lock_data) + ; + } +} + +int +s_lock_try(struct simplelock *lkp) +{ + u_int32_t oldval, temp; + + __asm__ __volatile__ ( + "1:\tldl_l %0,%3\n\t" /* load current value */ + "blbs %0,2f\n" /* if set, give up now */ + "mov 1,%1\n\t" /* value to store */ + "stl_c %1,%2\n\t" /* attempt to store */ + "beq %1,3f\n\t" /* if the store failed, spin */ + "2:" /* exit */ + ".section .text2,\"ax\"\n" /* improve branch prediction */ + "3:\tbr 1b\n" /* *p not updated, loop */ + ".previous\n" + : "=&r"(oldval), "=r"(temp), "=m" (lkp->lock_data) + : "m"(lkp->lock_data) + : "memory"); + + if (!oldval) { + /* + * It was clear, return success. + */ + alpha_mb(); + return 1; + } + return 0; +} + +/* Other stuff */ + +/* lock around the MP rendezvous */ +static struct simplelock smp_rv_lock; + +static void +init_locks(void) +{ + s_lock_init(&smp_rv_lock); +} + +void +mp_start() +{ + int i; + int cpuno = PCPU_GET(cpuno); + + init_locks(); + + if (cpuno + 1 > mp_ncpus) + mp_ncpus = cpuno + 1; + + all_cpus = 1<rpb_pcs_cnt; i++) { + struct pcs *pcsp; + + if (i == cpuno) + continue; + pcsp = (struct pcs *)((char *)hwrpb + hwrpb->rpb_pcs_off + + (i * hwrpb->rpb_pcs_size)); + if ((pcsp->pcs_flags & PCS_PP) != 0) { + all_cpus |= 1<rpb_pcs_cnt; i++) { + struct pcs *pcsp; + + if (i == cpuno) + continue; + pcsp = (struct pcs *)((char *)hwrpb + hwrpb->rpb_pcs_off + + (i * hwrpb->rpb_pcs_size)); + if ((pcsp->pcs_flags & PCS_PP) != 0) { + smp_active = 1; + smp_start_secondary(i); + break; /* only one for now */ + } + } +} + +void +mp_announce() +{ +} + +void +smp_invltlb() +{ +} + +#define GD_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +extern long cp_time[CPUSTATES]; + +static void +addugd_intr_forwarded(struct proc *p, int id, int *astmap) +{ + int i; + struct uprof *prof; + u_long pc; + + pc = checkstate_pc[id]; + prof = &p->p_stats->p_prof; + if (pc >= prof->pr_off && + (i = GD_TO_INDEX(pc, prof)) < prof->pr_size) { + if ((p->p_flag & P_OWEUPC) == 0) { + prof->pr_addr = pc; + prof->pr_ticks = 1; + p->p_flag |= P_OWEUPC; + } + *astmap |= (1 << id); + } +} + +static void +forwarded_statclock(int id, int pscnt, int *astmap) +{ + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + int cpustate; + struct proc *p; +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + + p = checkstate_curproc[id]; + cpustate = checkstate_cpustate[id]; + + switch (cpustate) { + case CHECKSTATE_USER: + if (p->p_flag & P_PROFIL) + addugd_intr_forwarded(p, id, astmap); + if (pscnt > 1) + return; + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + break; + case CHECKSTATE_SYS: +#ifdef GPROF + /* + * Kernel statistics are just like addugd_intr, only easier. 
+ */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + + if (!p) + cp_time[CP_IDLE]++; + else { + p->p_sticks++; + cp_time[CP_SYS]++; + } + break; + case CHECKSTATE_INTR: + default: +#ifdef GPROF + /* + * Kernel statistics are just like addugd_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + if (p) + p->p_iticks++; + cp_time[CP_INTR]++; + } + if (p != NULL) { + schedclock(p); + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += pgtok(vm->vm_tsize); + ru->ru_idrss += pgtok(vm->vm_dsize); + ru->ru_isrss += pgtok(vm->vm_ssize); + rss = pgtok(vmspace_resident_count(vm)); + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +#define BETTER_CLOCK_DIAGNOSTIC + +void +forward_statclock(int pscnt) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + CTR1(KTR_SMP, "forward_statclock(%d)", pscnt); + + if (!smp_started || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle ) */ + + map = PCPU_GET(other_cpus) & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + smp_ipi_selected(map, IPI_CHECKSTATE); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + break; + } + } + + /* + * Step 2: walk through other processors processes, update ticks and + * profiling info. + */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + forwarded_statclock(id, pscnt, &map); + } + if (map != 0) { + checkstate_need_ast |= map; + smp_ipi_selected(map, IPI_AST); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +void +forward_hardclock(int pscnt) +{ + int map; + int id; + struct proc *p; + struct pstats *pstats; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. 
+ */ + + CTR1(KTR_SMP, "forward_hardclock(%d)", pscnt); + + if (!smp_started || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle) */ + + map = PCPU_GET(other_cpus) & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + smp_ipi_selected(map, IPI_CHECKSTATE); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + breakpoint(); + break; + } + } + + /* + * Step 2: walk through other processors processes, update virtual + * timer and profiling timer. If stathz == 0, also update ticks and + * profiling info. + */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + p = checkstate_curproc[id]; + if (p) { + pstats = p->p_stats; + if (checkstate_cpustate[id] == CHECKSTATE_USER && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + psignal(p, SIGVTALRM); + map |= (1 << id); + } + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + psignal(p, SIGPROF); + map |= (1 << id); + } + } + if (stathz == 0) { + forwarded_statclock( id, pscnt, &map); + } + } + if (map != 0) { + checkstate_need_ast |= map; + smp_ipi_selected(map, IPI_AST); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +void +forward_signal(struct proc *p) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + CTR1(KTR_SMP, "forward_signal(%p)", p); + + if (!smp_started || cold || panicstr) + return; + if (!forward_signal_enabled) + return; + while (1) { + if (p->p_stat != SRUN) + return; + id = p->p_oncpu; + if (id == 0xff) + return; + map = (1< 100000) { +#if 0 + printf("forward_signal: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + if (id == p->p_oncpu) + return; + } +} + +void +forward_roundrobin(void) +{ + u_int map; + int i; + + CTR0(KTR_SMP, "forward_roundrobin()"); + + if (!smp_started || cold || panicstr) + return; + if (!forward_roundrobin_enabled) + return; + resched_cpus |= PCPU_GET(other_cpus); + map = PCPU_GET(other_cpus) & ~stopped_cpus ; + smp_ipi_selected(map, IPI_AST); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#if 0 + printf("forward_roundrobin: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } +} + +/* + * When called the executing CPU will send an IPI to all other CPUs + * requesting that they halt execution. + * + * Usually (but not necessarily) called with 'other_cpus' as its arg. + * + * - Signals all CPUs in map to stop. + * - Waits for each to stop. 
+ * + * Returns: + * -1: error + * 0: NA + * 1: ok + * + * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs + * from executing at same time. + */ +int +stop_cpus(u_int map) +{ + int i; + + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "stop_cpus(%x)", map); + + /* send the stop IPI to all CPUs in map */ + smp_ipi_selected(map, IPI_STOP); + + i = 0; + while ((stopped_cpus & map) != map) { + /* spin */ + i++; + if (i == 100000) { + printf("timeout stopping cpus\n"); + break; + } + alpha_mb(); + } + + printf("stopped_cpus=%x\n", stopped_cpus); + + return 1; +} + + +/* + * Called by a CPU to restart stopped CPUs. + * + * Usually (but not necessarily) called with 'stopped_cpus' as its arg. + * + * - Signals all CPUs in map to restart. + * - Waits for each to restart. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + */ +int +restart_cpus(u_int map) +{ + if (!smp_started) + return 0; + + CTR1(KTR_SMP, "restart_cpus(%x)", map); + + started_cpus = map; /* signal other cpus to restart */ + alpha_mb(); + + while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */ + alpha_mb(); + + return 1; +} + +/* + * All-CPU rendezvous. CPUs are signalled, all execute the setup function + * (if specified), rendezvous, execute the action function (if specified), + * rendezvous again, execute the teardown function (if specified), and then + * resume. + * + * Note that the supplied external functions _must_ be reentrant and aware + * that they are running in parallel and in an unknown lock context. + */ +static void (*smp_rv_setup_func)(void *arg); +static void (*smp_rv_action_func)(void *arg); +static void (*smp_rv_teardown_func)(void *arg); +static void *smp_rv_func_arg; +static volatile int smp_rv_waiters[2]; + +void +smp_rendezvous_action(void) +{ + /* setup function */ + if (smp_rv_setup_func != NULL) + smp_rv_setup_func(smp_rv_func_arg); + /* spin on entry rendezvous */ + atomic_add_int(&smp_rv_waiters[0], 1); + while (smp_rv_waiters[0] < mp_ncpus) + ; + /* action function */ + if (smp_rv_action_func != NULL) + smp_rv_action_func(smp_rv_func_arg); + /* spin on exit rendezvous */ + atomic_add_int(&smp_rv_waiters[1], 1); + while (smp_rv_waiters[1] < mp_ncpus) + ; + /* teardown function */ + if (smp_rv_teardown_func != NULL) + smp_rv_teardown_func(smp_rv_func_arg); +} + +void +smp_rendezvous(void (* setup_func)(void *), + void (* action_func)(void *), + void (* teardown_func)(void *), + void *arg) +{ + int s; + + /* disable interrupts on this CPU, save interrupt status */ + s = splhigh(); + + /* obtain rendezvous lock */ + s_lock(&smp_rv_lock); /* XXX sleep here? NOWAIT flag? */ + + /* set static function pointers */ + smp_rv_setup_func = setup_func; + smp_rv_action_func = action_func; + smp_rv_teardown_func = teardown_func; + smp_rv_func_arg = arg; + smp_rv_waiters[0] = 0; + smp_rv_waiters[1] = 0; + + /* signal other processors, which will enter the IPI with interrupts off */ + smp_ipi_all_but_self(IPI_RENDEZVOUS); + + /* call executor function */ + smp_rendezvous_action(); + + /* release lock */ + s_unlock(&smp_rv_lock); + + /* restore interrupt flag */ + splx(s); +} + +/* + * send an IPI to a set of cpus. 
+ */ +void +smp_ipi_selected(u_int32_t cpus, u_int64_t ipi) +{ + struct globaldata *globaldata; + + CTR2(KTR_SMP, "smp_ipi_selected", cpus, ipi); + alpha_mb(); + while (cpus) { + int cpuno = ffs(cpus) - 1; + cpus &= ~(1 << cpuno); + + globaldata = cpuno_to_globaldata[cpuno]; + if (globaldata) { + atomic_set_64(&globaldata->gd_pending_ipis, ipi); + alpha_mb(); + CTR1(KTR_SMP, "calling alpha_pal_wripir(%d)", cpuno); + alpha_pal_wripir(cpuno); + } + } +} + +/* + * send an IPI INTerrupt containing 'vector' to all CPUs, including myself + */ +void +smp_ipi_all(u_int64_t ipi) +{ + smp_ipi_selected(all_cpus, ipi); +} + +/* + * send an IPI to all CPUs EXCEPT myself + */ +void +smp_ipi_all_but_self(u_int64_t ipi) +{ + smp_ipi_selected(PCPU_GET(other_cpus), ipi); +} + +/* + * send an IPI to myself + */ +void +smp_ipi_self(u_int64_t ipi) +{ + smp_ipi_selected(1 << PCPU_GET(cpuno), ipi); +} + +static u_int64_t +atomic_readandclear(u_int64_t* p) +{ + u_int64_t v, temp; + __asm__ __volatile__ ( + "wmb\n" /* ensure pending writes have drained */ + "1:\tldq_l %0,%3\n\t" /* load current value, asserting lock */ + "ldiq %1,0\n\t" /* value to store */ + "stq_c %1,%2\n\t" /* attempt to store */ + "beq %1,2f\n\t" /* if the store failed, spin */ + "br 3f\n" /* it worked, exit */ + "2:\tbr 1b\n" /* *p not updated, loop */ + "3:\tmb\n" /* it worked */ + : "=&r"(v), "=&r"(temp), "=m" (*p) + : "m"(*p) + : "memory"); + return v; +} + +/* + * Handle an IPI sent to this processor. + */ +void +smp_handle_ipi(struct trapframe *frame) +{ + u_int64_t ipis = atomic_readandclear(&PCPU_GET(pending_ipis)); + u_int64_t ipi; + int cpuno = PCPU_GET(cpuno); + + CTR1(KTR_SMP, "smp_handle_ipi(), ipis=%x", ipis); + while (ipis) { + /* + * Find the lowest set bit. + */ + ipi = ipis & ~(ipis - 1); + switch (ipi) { + case IPI_INVLTLB: + break; + + case IPI_RENDEZVOUS: + CTR0(KTR_SMP, "IPI_RENDEZVOUS"); + smp_rendezvous_action(); + break; + + case IPI_AST: + CTR0(KTR_SMP, "IPI_AST"); + atomic_clear_int(&checkstate_need_ast, 1<tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) + ast(frame); /* XXX */ + break; + + case IPI_CHECKSTATE: + CTR0(KTR_SMP, "IPI_CHECKSTATE"); + if (frame->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) + checkstate_cpustate[cpuno] = CHECKSTATE_USER; + else if (PCPU_GET(intr_nesting_level) == 1) + checkstate_cpustate[cpuno] = CHECKSTATE_SYS; + else + checkstate_cpustate[cpuno] = CHECKSTATE_INTR; + checkstate_curproc[cpuno] = PCPU_GET(curproc); + atomic_set_int(&checkstate_probed_cpus, 1<rpb_primary_cpu_id + && hwrpb->rpb_txrdy != 0) { + hwrpb->rpb_txrdy = 0; + alpha_mb(); + } +} + +#if 0 + +/* + * Atomically compare the value stored at *p with cmpval and if the + * two values are equal, update the value of *p with newval. Returns + * zero if the compare failed, nonzero otherwise. 
+ */ +u_int64_t +atomic_cmpset_64(volatile u_int64_t* p, u_int64_t cmpval, u_int64_t newval) +{ + u_int64_t ret, temp; + + + printf("atomic_cmpset_64: *p=%lx, cmpval=%lx, newval=%lx\n", + *p, cmpval, newval); + __asm __volatile ( + "1:\tldq_l %1, %5\n\t" /* load old value */ + "cmpeq %1, %3, %0\n\t" /* compare */ + "beq %0, 2f\n\t" /* exit if not equal */ + "mov %4, %1\n\t" /* value to store */ + "stq_c %1, %2\n\t" /* attempt to store */ + "beq %1, 3f\n\t" /* if it failed, spin */ + "2:\n" /* done */ + ".section .text3,\"ax\"\n" /* improve branch prediction */ + "3:\tbr 1b\n" /* try again */ + ".previous\n" + : "=&r" (ret), "=r" (temp), "=m" (*p) + : "r" (cmpval), "r" (newval), "m" (*p) + : "memory"); + printf("atomic_cmpset_64: *p=%lx\n", *p); + + return ret; +} + +#endif diff --git a/sys/alpha/alpha/pmap.c b/sys/alpha/alpha/pmap.c index 7cdf67e02ea2..2a4852f15725 100644 --- a/sys/alpha/alpha/pmap.c +++ b/sys/alpha/alpha/pmap.c @@ -171,6 +171,7 @@ #include #include +#include #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 @@ -325,9 +326,7 @@ vm_offset_t kernel_vm_end; * Data for the ASN allocator */ static int pmap_maxasn; -static int pmap_nextasn = 0; -static u_int pmap_current_asngen = 1; -static pmap_t pmap_active = 0; +static pmap_t pmap_active[NCPUS]; /* * Data for the pv entry allocation mechanism @@ -456,16 +455,13 @@ void pmap_bootstrap(vm_offset_t ptaddr, u_int maxasn) { pt_entry_t newpte; - pt_entry_t* pte; - vm_offset_t va; int i; /* - * Setup ASNs + * Setup ASNs. PCPU_GET(next_asn) and PCPU_GET(current_asngen) are set + * up already. */ - pmap_nextasn = 0; pmap_maxasn = maxasn; - pmap_current_asngen = 1; /* * Allocate a level 1 map for the kernel. @@ -550,26 +546,13 @@ pmap_bootstrap(vm_offset_t ptaddr, u_int maxasn) kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_lev1 = Lev1map; kernel_pmap->pm_count = 1; - kernel_pmap->pm_active = 1; - kernel_pmap->pm_asn = 0; - kernel_pmap->pm_asngen = pmap_current_asngen; - pmap_nextasn = 1; + kernel_pmap->pm_active = ~0; + kernel_pmap->pm_asn[alpha_pal_whami()].asn = 0; + kernel_pmap->pm_asn[alpha_pal_whami()].gen = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); nklev3 = NKPT; nklev2 = 1; - /* - * Reserve some special page table entries/VA space for temporary - * mapping of pages. - */ -#define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); - - va = virtual_avail; - pte = pmap_lev3pte(kernel_pmap, va); - - virtual_avail = va; - /* * Set up proc0's PCB such that the ptbr points to the right place * and has the kernel pmap's. @@ -663,14 +646,44 @@ pmap_init2() static void pmap_invalidate_asn(pmap_t pmap) { - pmap->pm_asngen = 0; + pmap->pm_asn[PCPU_GET(cpuno)].gen = 0; +} + +struct pmap_invalidate_page_arg { + pmap_t pmap; + vm_offset_t va; +}; + +static void +pmap_invalidate_page_action(void *arg) +{ + pmap_t pmap = ((struct pmap_invalidate_page_arg *) arg)->pmap; + vm_offset_t va = ((struct pmap_invalidate_page_arg *) arg)->va; + + if (pmap->pm_active & (1 << PCPU_GET(cpuno))) { + ALPHA_TBIS(va); + alpha_pal_imb(); /* XXX overkill? 
*/ + } else { + pmap_invalidate_asn(pmap); + } } static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap_isactive(pmap)) { - ALPHA_TBIS(va); + struct pmap_invalidate_page_arg arg; + arg.pmap = pmap; + arg.va = va; + smp_rendezvous(0, pmap_invalidate_page_action, 0, (void *) &arg); +} + +static void +pmap_invalidate_all_action(void *arg) +{ + pmap_t pmap = (pmap_t) arg; + + if (pmap->pm_active & (1 << PCPU_GET(cpuno))) { + ALPHA_TBIA(); alpha_pal_imb(); /* XXX overkill? */ } else pmap_invalidate_asn(pmap); @@ -679,32 +692,29 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) static void pmap_invalidate_all(pmap_t pmap) { - if (pmap_isactive(pmap)) { - ALPHA_TBIA(); - alpha_pal_imb(); /* XXX overkill? */ - } else - pmap_invalidate_asn(pmap); + smp_rendezvous(0, pmap_invalidate_all_action, 0, (void *) pmap); } static void pmap_get_asn(pmap_t pmap) { - if (pmap->pm_asngen != pmap_current_asngen) { - if (pmap_nextasn > pmap_maxasn) { + if (pmap->pm_asn[PCPU_GET(cpuno)].gen != PCPU_GET(current_asngen)) { + if (PCPU_GET(next_asn) > pmap_maxasn) { /* * Start a new ASN generation. * * Invalidate all per-process mappings and I-cache */ - pmap_nextasn = 0; - pmap_current_asngen++; + PCPU_GET(next_asn) = 0; + PCPU_GET(current_asngen)++; + PCPU_GET(current_asngen) &= (1 << 24) - 1; - if (pmap_current_asngen == 0) { + if (PCPU_GET(current_asngen) == 0) { /* - * Clear the pm_asngen of all pmaps. + * Clear the pm_asn[].gen of all pmaps. * This is safe since it is only called from * pmap_activate after it has deactivated - * the old pmap. + * the old pmap and it only affects this cpu. */ struct proc *p; pmap_t tpmap; @@ -712,11 +722,11 @@ pmap_get_asn(pmap_t pmap) #ifdef PMAP_DIAGNOSTIC printf("pmap_get_asn: generation rollover\n"); #endif - pmap_current_asngen = 1; + PCPU_GET(current_asngen) = 1; LIST_FOREACH(p, &allproc, p_list) { if (p->p_vmspace) { tpmap = vmspace_pmap(p->p_vmspace); - tpmap->pm_asngen = 0; + tpmap->pm_asn[PCPU_GET(cpuno)].gen = 0; } } } @@ -729,8 +739,8 @@ pmap_get_asn(pmap_t pmap) ALPHA_TBIAP(); alpha_pal_imb(); /* XXX overkill? 
*/ } - pmap->pm_asn = pmap_nextasn++; - pmap->pm_asngen = pmap_current_asngen; + pmap->pm_asn[PCPU_GET(cpuno)].asn = PCPU_GET(next_asn)++; + pmap->pm_asn[PCPU_GET(cpuno)].gen = PCPU_GET(current_asngen); } } @@ -1163,13 +1173,17 @@ void pmap_pinit0(pmap) struct pmap *pmap; { + int i; + pmap->pm_lev1 = Lev1map; pmap->pm_flags = 0; pmap->pm_count = 1; pmap->pm_ptphint = NULL; pmap->pm_active = 0; - pmap->pm_asn = 0; - pmap->pm_asngen = 0; + for (i = 0; i < NCPUS; i++) { + pmap->pm_asn[i].asn = 0; + pmap->pm_asn[i].gen = 0; + } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } @@ -1183,6 +1197,7 @@ pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t lev1pg; + int i; /* * allocate object for the ptes @@ -1215,8 +1230,10 @@ pmap_pinit(pmap) pmap->pm_count = 1; pmap->pm_ptphint = NULL; pmap->pm_active = 0; - pmap->pm_asn = 0; - pmap->pm_asngen = 0; + for (i = 0; i < NCPUS; i++) { + pmap->pm_asn[i].asn = 0; + pmap->pm_asn[i].gen = 0; + } TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } @@ -2994,21 +3011,22 @@ pmap_activate(struct proc *p) pmap = vmspace_pmap(p->p_vmspace); - if (pmap_active && pmap != pmap_active) { - pmap_active->pm_active = 0; - pmap_active = 0; + if (pmap_active[PCPU_GET(cpuno)] && pmap != pmap_active[PCPU_GET(cpuno)]) { + atomic_clear_32(&pmap_active[PCPU_GET(cpuno)]->pm_active, + 1 << PCPU_GET(cpuno)); + pmap_active[PCPU_GET(cpuno)] = 0; } p->p_addr->u_pcb.pcb_hw.apcb_ptbr = ALPHA_K0SEG_TO_PHYS((vm_offset_t) pmap->pm_lev1) >> PAGE_SHIFT; - if (pmap->pm_asngen != pmap_current_asngen) + if (pmap->pm_asn[PCPU_GET(cpuno)].gen != PCPU_GET(current_asngen)) pmap_get_asn(pmap); - pmap_active = pmap; - pmap->pm_active = 1; /* XXX use bitmap for SMP */ + pmap_active[PCPU_GET(cpuno)] = pmap; + atomic_set_32(&pmap->pm_active, 1 << PCPU_GET(cpuno)); - p->p_addr->u_pcb.pcb_hw.apcb_asn = pmap->pm_asn; + p->p_addr->u_pcb.pcb_hw.apcb_asn = pmap->pm_asn[PCPU_GET(cpuno)].asn; if (p == curproc) { alpha_pal_swpctx((u_long)p->p_md.md_pcbpaddr); @@ -3020,8 +3038,8 @@ pmap_deactivate(struct proc *p) { pmap_t pmap; pmap = vmspace_pmap(p->p_vmspace); - pmap->pm_active = 0; - pmap_active = 0; + atomic_clear_32(&pmap->pm_active, 1 << PCPU_GET(cpuno)); + pmap_active[PCPU_GET(cpuno)] = 0; } vm_offset_t diff --git a/sys/alpha/alpha/prom.c b/sys/alpha/alpha/prom.c index 5880b2c2de6a..805539a571d0 100644 --- a/sys/alpha/alpha/prom.c +++ b/sys/alpha/alpha/prom.c @@ -57,7 +57,6 @@ int prom_mapped = 1; /* Is PROM still mapped? 
*/ pt_entry_t rom_pte, saved_pte[1]; /* XXX */ static pt_entry_t *rom_lev1map __P((void)); -extern struct pcb* curpcb; extern pt_entry_t* Lev1map; static void prom_cache_sync __P((void)); diff --git a/sys/alpha/alpha/support.s b/sys/alpha/alpha/support.s index 2e5ff39c4426..2a87327ea470 100644 --- a/sys/alpha/alpha/support.s +++ b/sys/alpha/alpha/support.s @@ -71,7 +71,7 @@ beq t1, fusufault lda t0, fusufault /* trap faults */ - ldq t2, curproc + ldq t2, GD_CURPROC(globalp) ldq t2, P_ADDR(t2) stq t0, U_PCB_ONFAULT(t2) @@ -91,7 +91,7 @@ beq t1, fusufault lda t0, fusufault /* trap faults */ - ldq t2, curproc + ldq t2, GD_CURPROC(globalp) ldq t2, P_ADDR(t2) stq t0, U_PCB_ONFAULT(t2) @@ -116,7 +116,7 @@ beq t1, fusufault lda t0, fusufault /* trap faults */ - ldq t2, curproc + ldq t2, GD_CURPROC(globalp) ldq t2, P_ADDR(t2) stq t0, U_PCB_ONFAULT(t2) @@ -135,7 +135,7 @@ beq t1, fusufault lda t0, fusufault /* trap faults */ - ldq t2, curproc + ldq t2, GD_CURPROC(globalp) ldq t2, P_ADDR(t2) stq t0, U_PCB_ONFAULT(t2) @@ -153,7 +153,7 @@ END(suibyte) LEAF(fusufault, 0) - ldq t0, curproc + ldq t0, GD_CURPROC(globalp) ldq t0, P_ADDR(t0) stq zero, U_PCB_ONFAULT(t0) ldiq v0, -1 @@ -221,13 +221,13 @@ NESTED(copyinstr, 4, 16, ra, 0, 0) beq t1, copyerr /* if it's not, error out. */ lda v0, copyerr /* set up fault handler. */ .set noat - ldq at_reg, curproc + ldq at_reg, GD_CURPROC(globalp) ldq at_reg, P_ADDR(at_reg) stq v0, U_PCB_ONFAULT(at_reg) .set at CALL(copystr) /* do the copy. */ .set noat - ldq at_reg, curproc /* kill the fault handler. */ + ldq at_reg, GD_CURPROC(globalp) /* kill the fault handler. */ ldq at_reg, P_ADDR(at_reg) stq zero, U_PCB_ONFAULT(at_reg) .set at @@ -245,13 +245,13 @@ NESTED(copyoutstr, 4, 16, ra, 0, 0) beq t1, copyerr /* if it's not, error out. */ lda v0, copyerr /* set up fault handler. */ .set noat - ldq at_reg, curproc + ldq at_reg, GD_CURPROC(globalp) ldq at_reg, P_ADDR(at_reg) stq v0, U_PCB_ONFAULT(at_reg) .set at CALL(copystr) /* do the copy. */ .set noat - ldq at_reg, curproc /* kill the fault handler. */ + ldq at_reg, GD_CURPROC(globalp) /* kill the fault handler. */ ldq at_reg, P_ADDR(at_reg) stq zero, U_PCB_ONFAULT(at_reg) .set at @@ -423,13 +423,13 @@ bcopy_da_finish: insql t4,a1,t4 addq a1,a2,a4 ldq_u t6,0(a1) - ldq_u t7,-1(a4) + ldq_u t8,-1(a4) bic t6,t4,t6 - bic t7,t5,t7 + bic t8,t5,t8 and t2,t4,t2 and t3,t5,t3 or t2,t6,t2 - or t3,t7,t3 + or t3,t8,t3 stq_u t3,-1(a4) stq_u t2,0(a1) RET @@ -513,13 +513,13 @@ NESTED(copyin, 3, 16, ra, 0, 0) beq t1, copyerr /* if it's not, error out. */ lda v0, copyerr /* set up fault handler. */ .set noat - ldq at_reg, curproc + ldq at_reg, GD_CURPROC(globalp) ldq at_reg, P_ADDR(at_reg) stq v0, U_PCB_ONFAULT(at_reg) .set at CALL(bcopy) /* do the copy. */ .set noat - ldq at_reg, curproc /* kill the fault handler. */ + ldq at_reg, GD_CURPROC(globalp) /* kill the fault handler. */ ldq at_reg, P_ADDR(at_reg) stq zero, U_PCB_ONFAULT(at_reg) .set at @@ -538,13 +538,13 @@ NESTED(copyout, 3, 16, ra, 0, 0) beq t1, copyerr /* if it's not, error out. */ lda v0, copyerr /* set up fault handler. */ .set noat - ldq at_reg, curproc + ldq at_reg, GD_CURPROC(globalp) ldq at_reg, P_ADDR(at_reg) stq v0, U_PCB_ONFAULT(at_reg) .set at CALL(bcopy) /* do the copy. */ .set noat - ldq at_reg, curproc /* kill the fault handler. */ + ldq at_reg, GD_CURPROC(globalp) /* kill the fault handler. 
*/ ldq at_reg, P_ADDR(at_reg) stq zero, U_PCB_ONFAULT(at_reg) .set at @@ -555,7 +555,7 @@ NESTED(copyout, 3, 16, ra, 0, 0) END(copyout) LEAF(copyerr, 0) - ldq t0, curproc + ldq t0, GD_CURPROC(globalp) ldq t0, P_ADDR(t0) stq zero, U_PCB_ONFAULT(t0) /* reset fault handler. */ ldq ra, (16-8)(sp) /* restore ra. */ diff --git a/sys/alpha/alpha/swtch.s b/sys/alpha/alpha/swtch.s index ee191ebd6264..f457a34754e0 100644 --- a/sys/alpha/alpha/swtch.s +++ b/sys/alpha/alpha/swtch.s @@ -28,7 +28,9 @@ * rights to redistribute these changes. */ +#define _LOCORE #include +#include #include "assym.s" /**************************************************************************/ @@ -39,7 +41,7 @@ */ #define SWITCH_CONTEXT \ /* Make a note of the context we're running on. */ \ - stq a0, curpcb; \ + stq a0, GD_CURPCB(globalp); \ \ /* Swap in the new context. */ \ call_pal PAL_OSF1_swpctx @@ -85,27 +87,6 @@ Lsavectx1: LDGP(pv) IMPORT(want_resched, 4) IMPORT(Lev1map, 8) -/* - * When no processes are on the runq, cpu_switch branches to idle - * to wait for something to come ready. - * Note: this is really a part of cpu_switch() but defined here for kernel - * profiling. - */ -LEAF(idle, 0) - br pv, Lidle1 -Lidle1: LDGP(pv) - stq zero, switchtime /* zero switchtime.tv_sec */ - stq zero, curproc /* curproc <- NULL for stats */ - mov zero, a0 /* enable all interrupts */ - call_pal PAL_OSF1_swpipl -Lidle2: - CALL(procrunnable) - beq v0, Lidle2 - ldiq a0, ALPHA_PSL_IPL_HIGH /* disable all interrupts */ - call_pal PAL_OSF1_swpipl - jmp zero, sw1 /* jump back into the fray */ - END(idle) - /* * cpu_switch() * Find the highest priority process and resume it. @@ -113,7 +94,10 @@ Lidle2: LEAF(cpu_switch, 1) LDGP(pv) /* do an inline savectx(), to save old context */ + ldq a0, GD_CURPROC(globalp) ldq a1, P_ADDR(a0) + ldl t0, sched_lock+MTX_RECURSE /* save sched_lock state */ + stl t0, U_PCB_SCHEDNEST(a1) /* NOTE: ksp is stored by the swpctx */ stq s0, U_PCB_CONTEXT+(0 * 8)(a1) /* store s0 - s6 */ stq s1, U_PCB_CONTEXT+(1 * 8)(a1) @@ -129,16 +113,12 @@ LEAF(cpu_switch, 1) mov a0, s0 /* save old curproc */ mov a1, s1 /* save old U-area */ - CALL(procrunnable) /* anything to run? */ - beq v0, idle /* and if none, go idle */ - ldiq a0, ALPHA_PSL_IPL_HIGH /* disable all interrupts */ call_pal PAL_OSF1_swpipl sw1: br pv, Lcs1 Lcs1: LDGP(pv) - CALL(chooseproc) - beq v0, idle + CALL(chooseproc) /* can't return NULL */ mov v0, s2 ldq s3, P_MD_PCBPADDR(s2) /* save new pcbpaddr */ @@ -194,7 +174,7 @@ Lcs7: * because we might have re-entered cpu_switch() from idle(), * in which case curproc would be NULL. */ - stq s2, curproc /* curproc = p */ + stq s2, GD_CURPROC(globalp) /* curproc = p */ stl zero, want_resched /* we've rescheduled */ /* @@ -212,6 +192,10 @@ Lcs7: ldq s5, U_PCB_CONTEXT+(5 * 8)(t0) ldq s6, U_PCB_CONTEXT+(6 * 8)(t0) ldq ra, U_PCB_CONTEXT+(7 * 8)(t0) /* restore ra */ + ldl t1, U_PCB_SCHEDNEST(t0) + stl t1, sched_lock+MTX_RECURSE /* restore lock */ + ldq t1, GD_CURPROC(globalp) + stq t1, sched_lock+MTX_LOCK ldq a0, U_PCB_CONTEXT+(8 * 8)(t0) /* restore ipl */ and a0, ALPHA_PSL_IPL_MASK, a0 call_pal PAL_OSF1_swpipl @@ -231,6 +215,7 @@ Lcs7: * pointer to the executing process's proc structure. */ LEAF(switch_trampoline, 0) + MTX_EXIT(sched_lock) mov s0, pv mov s1, ra mov s2, a0 @@ -266,7 +251,7 @@ Lchkast: and s1, ALPHA_PSL_USERMODE, t0 /* are we returning to user? */ beq t0, Lrestoreregs /* no: just return */ - ldl t2, astpending /* AST pending? */ + ldl t2, GD_ASTPENDING(globalp) /* AST pending? 
*/ beq t2, Lrestoreregs /* no: return */ /* We've got an AST. Handle it. */ @@ -277,7 +262,7 @@ Lchkast: Lrestoreregs: /* set the hae register if this process has specified a value */ - ldq t0, curproc + ldq t0, GD_CURPROC(globalp) beq t0, Lnohae ldq t1, P_MD_FLAGS(t0) and t1, MDP_HAEUSED diff --git a/sys/alpha/alpha/synch_machdep.c b/sys/alpha/alpha/synch_machdep.c new file mode 100644 index 000000000000..a3077e9db57d --- /dev/null +++ b/sys/alpha/alpha/synch_machdep.c @@ -0,0 +1,529 @@ +/*- + * Copyright (c) 1997, 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +#define MTX_STRS /* define common strings */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* All mutii in system (used for debug/panic) */ +mtx_t all_mtx = { MTX_UNOWNED, 0, 0, "All muti queue head", + TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), + { NULL, NULL }, &all_mtx, &all_mtx +#ifdef SMP_DEBUG + , NULL, { NULL, NULL }, NULL, 0 +#endif +}; + +int mtx_cur_cnt; +int mtx_max_cnt; + +extern void _mtx_enter_giant_def(void); +extern void _mtx_exit_giant_def(void); + +static void propagate_priority(struct proc *) __unused; + +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) +#define mtx_owner(m) (mtx_unowned(m) ? NULL \ + : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) + +#define RETIP(x) *(((u_int64_t *)(&x)) - 1) +#define SET_PRIO(p, pri) (p)->p_priority = (pri) + +/* + * XXX Temporary, for use from assembly language + */ + +void +_mtx_enter_giant_def(void) +{ + + mtx_enter(&Giant, MTX_DEF); +} + +void +_mtx_exit_giant_def(void) +{ + + mtx_exit(&Giant, MTX_DEF); +} + +static void +propagate_priority(struct proc *p) +{ + int pri = p->p_priority; + mtx_t *m = p->p_blocked; + + for (;;) { + struct proc *p1; + + p = mtx_owner(m); + + if (p == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of process that + * next axcquires the mutex. 
+ */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + MPASS(p->p_magic == P_MAGIC); + if (p->p_priority <= pri) + return; + /* + * If lock holder is actually running just bump priority. + */ + if (TAILQ_NEXT(p, p_procq) == NULL) { + SET_PRIO(p, pri); + return; + } + /* + * If on run queue move to new run queue, and + * quit. Otherwise pick up mutex p is blocked on + */ + if ((m = p->p_blocked) == NULL) { + remrunqueue(p); + SET_PRIO(p, pri); + setrunqueue(p); + return; + } + /* + * Check if the proc needs to be moved up on + * the blocked chain + */ + if ((p1 = TAILQ_PREV(p, rq, p_procq)) == NULL || + p1->p_priority <= pri) + continue; + + /* + * Remove proc from blocked chain + */ + TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); + TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { + MPASS(p1->p_magic == P_MAGIC); + if (p1->p_priority > pri) + break; + } + if (p1) + TAILQ_INSERT_BEFORE(p1, p, p_procq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); + CTR4(KTR_LOCK, + "propagate priority: p 0x%x moved before 0x%x on [0x%x] %s", + p, p1, m, m->mtx_description); + } +} + +void +mtx_enter_hard(mtx_t *m, int type, int ipl) +{ + struct proc *p = CURPROC; + + switch (type) { + case MTX_DEF: + if ((m->mtx_lock & MTX_FLAGMASK) == (u_int64_t)p) { + m->mtx_recurse++; + atomic_set_64(&m->mtx_lock, MTX_RECURSE); + CTR1(KTR_LOCK, "mtx_enter: 0x%x recurse", m); + return; + } + CTR3(KTR_LOCK, "mtx_enter: 0x%x contested (lock=%x) [0x%x]", + m, m->mtx_lock, RETIP(m)); + while (!atomic_cmpset_64(&m->mtx_lock, MTX_UNOWNED, + (u_int64_t)p)) { + int v; + struct timeval tv; + struct proc *p1; + + mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY); + /* + * check if the lock has been released while + * waiting for the schedlock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_exit(&sched_lock, MTX_SPIN); + continue; + } + /* + * The mutex was marked contested on release. This + * means that there are processes blocked on it. + */ + if (v == MTX_CONTESTED) { + p1 = TAILQ_FIRST(&m->mtx_blocked); + m->mtx_lock = (u_int64_t)p | MTX_CONTESTED; + if (p1->p_priority < p->p_priority) { + SET_PRIO(p, p1->p_priority); + } + mtx_exit(&sched_lock, MTX_SPIN); + return; + } + /* + * If the mutex isn't already contested and + * a failure occurs setting the contested bit the + * mutex was either release or the + * state of the RECURSION bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_64(&m->mtx_lock, v, + v | MTX_CONTESTED)) { + mtx_exit(&sched_lock, MTX_SPIN); + continue; + } + + /* We definitely have to sleep for this lock */ + mtx_assert(m, MA_NOTOWNED); + + printf("m->mtx_lock=%lx\n", m->mtx_lock); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM + * context must clean up before going to sleep. 
+ */ + if (p->p_flag & (P_ITHD | P_SITHD)) { + ithd_t *it = (ithd_t *)p; + + if (it->it_interrupted) { + CTR2(KTR_LOCK, + "mtx_enter: 0x%x interrupted 0x%x", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* Put us on the list of procs blocked on this mutex */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + p1 = (struct proc *)(m->mtx_lock & + MTX_FLAGMASK); + LIST_INSERT_HEAD(&p1->p_contested, m, + mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); + } else { + TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) + if (p1->p_priority > p->p_priority) + break; + if (p1) + TAILQ_INSERT_BEFORE(p1, p, p_procq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, + p_procq); + } + + p->p_blocked = m; /* Who we're blocked on */ +#ifdef notyet + propagate_priority(p); +#endif + CTR3(KTR_LOCK, "mtx_enter: p 0x%x blocked on [0x%x] %s", + p, m, m->mtx_description); + /* + * cloaned from mi_switch + */ + microtime(&tv); + p->p_runtime += (tv.tv_usec - + PCPU_GET(switchtime.tv_usec)) + + (tv.tv_sec - + PCPU_GET(switchtime.tv_sec)) * + (int64_t)1000000; + PCPU_SET(switchtime.tv_usec, tv.tv_usec); + PCPU_SET(switchtime.tv_sec, tv.tv_sec); + cpu_switch(); + if (PCPU_GET(switchtime.tv_sec) == 0) + microtime(&GLOBALP->gd_switchtime); + PCPU_SET(switchticks, ticks); + CTR3(KTR_LOCK, + "mtx_enter: p 0x%x free from blocked on [0x%x] %s", + p, m, m->mtx_description); + mtx_exit(&sched_lock, MTX_SPIN); + } + alpha_mb(); + return; + case MTX_SPIN: + case MTX_SPIN | MTX_FIRST: + case MTX_SPIN | MTX_TOPHALF: + { + int i = 0; + + if (m->mtx_lock == (u_int64_t)p) { + m->mtx_recurse++; + return; + } + CTR1(KTR_LOCK, "mtx_enter: 0x%x spinning", m); + for (;;) { + if (atomic_cmpset_64(&m->mtx_lock, MTX_UNOWNED, + (u_int64_t)p)) { + alpha_mb(); + break; + } + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 1000000) + continue; + if (i++ < 6000000) + DELAY (1); + else + panic("spin lock > 5 seconds"); + } + } + +#ifdef SMP_DEBUG + if (type != MTX_SPIN) + m->mtx_saveipl = 0xbeefface; + else +#endif + m->mtx_saveipl = ipl; + CTR1(KTR_LOCK, "mtx_enter: 0x%x spin done", m); + return; + } + } +} + +void +mtx_exit_hard(mtx_t *m, int type) +{ + struct proc *p, *p1; + mtx_t *m1; + int pri; + + switch (type) { + case MTX_DEF: + case MTX_DEF | MTX_NOSWITCH: + if (m->mtx_recurse != 0) { + if (--(m->mtx_recurse) == 0) + atomic_clear_64(&m->mtx_lock, MTX_RECURSE); + CTR1(KTR_LOCK, "mtx_exit: 0x%x unrecurse", m); + return; + } + mtx_enter(&sched_lock, MTX_SPIN); + CTR1(KTR_LOCK, "mtx_exit: 0x%x contested", m); + p = CURPROC; + p1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(p->p_magic == P_MAGIC); + MPASS(p1->p_magic == P_MAGIC); + TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + atomic_cmpset_64(&m->mtx_lock, m->mtx_lock, + MTX_UNOWNED); + CTR1(KTR_LOCK, "mtx_exit: 0x%x not held", m); + } else + m->mtx_lock = MTX_CONTESTED; + pri = MAXPRI; + LIST_FOREACH(m1, &p->p_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + if (cp < pri) + pri = cp; + } + if (pri > p->p_nativepri) + pri = p->p_nativepri; + SET_PRIO(p, pri); + CTR2(KTR_LOCK, "mtx_exit: 0x%x contested setrunqueue 0x%x", + m, p1); + p1->p_blocked = NULL; + setrunqueue(p1); + if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { +#ifdef notyet + if (p->p_flag & (P_ITHD | P_SITHD)) { + ithd_t *it = (ithd_t *)p; + + if (it->it_interrupted) { + CTR2(KTR_LOCK, + "mtx_exit: 0x%x interruped 0x%x", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + 
setrunqueue(p); + CTR2(KTR_LOCK, "mtx_exit: 0x%x switching out lock=0x%x", + m, m->mtx_lock); + cpu_switch(); + CTR2(KTR_LOCK, "mtx_exit: 0x%x resuming lock=0x%x", + m, m->mtx_lock); + } + mtx_exit(&sched_lock, MTX_SPIN); + return; + case MTX_SPIN: + case MTX_SPIN | MTX_FIRST: + if (m->mtx_recurse != 0) { + m->mtx_recurse--; + return; + } + alpha_mb(); + if (atomic_cmpset_64(&m->mtx_lock, CURTHD, MTX_UNOWNED)) { + MPASS(m->mtx_saveipl != 0xbeefface); + alpha_pal_swpipl(m->mtx_saveipl); + return; + } + panic("unsucuessful release of spin lock"); + case MTX_SPIN | MTX_TOPHALF: + if (m->mtx_recurse != 0) { + m->mtx_recurse--; + return; + } + alpha_mb(); + if (atomic_cmpset_64(&m->mtx_lock, CURTHD, MTX_UNOWNED)) + return; + panic("unsucuessful release of spin lock"); + default: + panic("mtx_exit_hard: unsupported type 0x%x\n", type); + } +} + +#define MV_DESTROY 0 /* validate before destory */ +#define MV_INIT 1 /* validate before init */ + +#ifdef SMP_DEBUG + +int mtx_validate __P((mtx_t *, int)); + +int +mtx_validate(mtx_t *m, int when) +{ + mtx_t *mp; + int i; + int retval = 0; + + if (m == &all_mtx || cold) + return 0; + + mtx_enter(&all_mtx, MTX_DEF); + ASS(kernacc((caddr_t)all_mtx.mtx_next, 4, 1) == 1); + ASS(all_mtx.mtx_next->mtx_prev == &all_mtx); + for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { + if (kernacc((caddr_t)mp->mtx_next, 4, 1) != 1) { + panic("mtx_validate: mp=%p mp->mtx_next=%p", + mp, mp->mtx_next); + } + i++; + if (i > mtx_cur_cnt) { + panic("mtx_validate: too many in chain, known=%d\n", + mtx_cur_cnt); + } + } + ASS(i == mtx_cur_cnt); + switch (when) { + case MV_DESTROY: + for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) + if (mp == m) + break; + ASS(mp == m); + break; + case MV_INIT: + for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) + if (mp == m) { + /* + * Not good. 
This mutex already exits + */ + retval = 1; +#if 1 + printf("re-initing existing mutex %s\n", + m->mtx_description); + ASS(m->mtx_lock == MTX_UNOWNED); + retval = 1; +#else + panic("re-initing existing mutex %s", + m->mtx_description); +#endif + } + } + mtx_exit(&all_mtx, MTX_DEF); + return (retval); +} +#endif + +void +mtx_init(mtx_t *m, char *t, int flag) +{ + + CTR2(KTR_LOCK, "mtx_init 0x%x (%s)", m, t); +#ifdef SMP_DEBUG + if (mtx_validate(m, MV_INIT)) /* diagnostic and error correction */ + return; +#endif + bzero((void *)m, sizeof *m); + TAILQ_INIT(&m->mtx_blocked); + m->mtx_description = t; + m->mtx_lock = MTX_UNOWNED; + /* Put on all mutex queue */ + mtx_enter(&all_mtx, MTX_DEF); + m->mtx_next = &all_mtx; + m->mtx_prev = all_mtx.mtx_prev; + m->mtx_prev->mtx_next = m; + all_mtx.mtx_prev = m; + if (++mtx_cur_cnt > mtx_max_cnt) + mtx_max_cnt = mtx_cur_cnt; + mtx_exit(&all_mtx, MTX_DEF); + witness_init(m, flag); +} + +void +mtx_destroy(mtx_t *m) +{ + + CTR2(KTR_LOCK, "mtx_destroy 0x%x (%s)", m, m->mtx_description); +#ifdef SMP_DEBUG + if (m->mtx_next == NULL) + panic("mtx_destroy: %p (%s) already destroyed", + m, m->mtx_description); + + if (!mtx_owned(m)) { + ASS(m->mtx_lock == MTX_UNOWNED); + } else { + ASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0); + } + mtx_validate(m, MV_DESTROY); /* diagnostic */ +#endif + +#ifdef WITNESS + if (m->mtx_witness) + witness_destroy(m); +#endif /* WITNESS */ + + /* Remove from the all mutex queue */ + mtx_enter(&all_mtx, MTX_DEF); + m->mtx_next->mtx_prev = m->mtx_prev; + m->mtx_prev->mtx_next = m->mtx_next; +#ifdef SMP_DEBUG + m->mtx_next = m->mtx_prev = NULL; +#endif + mtx_cur_cnt--; + mtx_exit(&all_mtx, MTX_DEF); +} diff --git a/sys/alpha/alpha/trap.c b/sys/alpha/alpha/trap.c index a21d532dcf22..072c5f3fdbc5 100644 --- a/sys/alpha/alpha/trap.c +++ b/sys/alpha/alpha/trap.c @@ -35,8 +35,9 @@ #include #include +#include +#include #include -#include #include #include #include @@ -58,6 +59,8 @@ #include #include #include +#include +#include #ifdef KTRACE #include @@ -69,8 +72,6 @@ #endif u_int32_t want_resched; -u_int32_t astpending; -struct proc *fpcurproc; /* current user of the FPU */ void userret __P((struct proc *, u_int64_t, u_quad_t)); @@ -201,6 +202,11 @@ trap(a0, a1, a2, entry, framep) u_quad_t sticks; int user; + /* + * Find our per-cpu globals. + */ + globalp = (struct globaldata *) alpha_pal_rdval(); + cnt.v_trap++; p = curproc; ucode = 0; @@ -233,9 +239,12 @@ trap(a0, a1, a2, entry, framep) * and per-process unaligned-access-handling flags). */ if (user) { - if ((i = unaligned_fixup(a0, a1, a2, p)) == 0) + mtx_enter(&Giant, MTX_DEF); + if ((i = unaligned_fixup(a0, a1, a2, p)) == 0) { + mtx_exit(&Giant, MTX_DEF); goto out; - + } + mtx_exit(&Giant, MTX_DEF); ucode = a0; /* VA */ break; } @@ -259,9 +268,13 @@ trap(a0, a1, a2, entry, framep) * is not requested or if the completion fails. */ if (user) { + mtx_enter(&Giant, MTX_DEF); if (a0 & EXCSUM_SWC) - if (fp_software_completion(a1, p)) + if (fp_software_completion(a1, p)) { + mtx_exit(&Giant, MTX_DEF); goto out; + } + mtx_exit(&Giant, MTX_DEF); i = SIGFPE; ucode = a0; /* exception summary */ break; @@ -364,6 +377,7 @@ trap(a0, a1, a2, entry, framep) vm_prot_t ftype = 0; int rv; + mtx_enter(&Giant, MTX_DEF); /* * If it was caused by fuswintr or suswintr, * just punt. 
Note that we check the faulting @@ -379,6 +393,7 @@ trap(a0, a1, a2, entry, framep) framep->tf_regs[FRAME_PC] = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; + mtx_exit(&Giant, MTX_DEF); goto out; } @@ -489,9 +504,11 @@ trap(a0, a1, a2, entry, framep) rv = KERN_INVALID_ADDRESS; } if (rv == KERN_SUCCESS) { + mtx_exit(&Giant, MTX_DEF); goto out; } + mtx_exit(&Giant, MTX_DEF); if (!user) { /* Check for copyin/copyout fault */ if (p != NULL && @@ -573,6 +590,12 @@ syscall(code, framep) u_int64_t args[10]; /* XXX */ u_int hidden = 0, nargs; + /* + * Find our per-cpu globals. + */ + globalp = (struct globaldata *) alpha_pal_rdval(); + mtx_enter(&Giant, MTX_DEF); + framep->tf_regs[FRAME_TRAPARG_A0] = 0; framep->tf_regs[FRAME_TRAPARG_A1] = 0; framep->tf_regs[FRAME_TRAPARG_A2] = 0; @@ -693,6 +716,7 @@ syscall(code, framep) * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); + mtx_exit(&Giant, MTX_DEF); } /* @@ -712,6 +736,7 @@ child_return(p) if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, SYS_fork, 0, 0); #endif + mtx_exit(&Giant, MTX_DEF); } /* @@ -725,6 +750,8 @@ ast(framep) register struct proc *p; u_quad_t sticks; + mtx_enter(&Giant, MTX_DEF); + p = curproc; sticks = p->p_sticks; p->p_md.md_tf = framep; @@ -734,7 +761,7 @@ ast(framep) cnt.v_soft++; - astpending = 0; + PCPU_SET(astpending, 0); if (p->p_flag & P_OWEUPC) { p->p_flag &= ~P_OWEUPC; addupc_task(p, p->p_stats->p_prof.pr_addr, @@ -742,6 +769,8 @@ ast(framep) } userret(p, framep->tf_regs[FRAME_PC], sticks); + + mtx_exit(&Giant, MTX_DEF); } /* diff --git a/sys/alpha/alpha/vm_machdep.c b/sys/alpha/alpha/vm_machdep.c index 8baea02b8494..3831d67658c6 100644 --- a/sys/alpha/alpha/vm_machdep.c +++ b/sys/alpha/alpha/vm_machdep.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -246,8 +247,10 @@ cpu_exit(p) alpha_fpstate_drop(p); (void) splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -358,7 +361,7 @@ vunmapbuf(bp) } /* - * Force reset the processor by invalidating the entire address space! + * Reset back to firmware. */ void cpu_reset() @@ -416,7 +419,7 @@ vm_page_zero_idle() return(0); #ifdef SMP - if (try_mplock()) { + if (KLOCK_ENTER(M_TRY)) { #endif s = splvm(); m = vm_page_list_find(PQ_FREE, free_rover, FALSE); @@ -447,7 +450,7 @@ vm_page_zero_idle() free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); #ifdef SMP - rel_mplock(); + KLOCK_EXIT; #endif return (1); #ifdef SMP diff --git a/sys/alpha/include/asm.h b/sys/alpha/include/asm.h index b185295c60c7..d46eb972868a 100644 --- a/sys/alpha/include/asm.h +++ b/sys/alpha/include/asm.h @@ -90,6 +90,11 @@ #define sp $30 /* (S) stack pointer */ #define zero $31 /* wired zero */ +/* In the kernel, we use t7 to point at the per-cpu globals. */ +#ifdef _KERNEL +#define globalp $8 +#endif + /* Floating point registers (XXXX VERIFY THIS) */ #define fv0 $f0 /* (T) return value (real) */ #define fv1 $f1 /* (T) return value (imaginary)*/ @@ -266,7 +271,6 @@ _name_:; \ .loc 1 __LINE__; \ bsr ra,exception_save_regs /* jmp/CALL trashes pv/t12 */ - /* * LEAF * Declare a global leaf function. 
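[Editor's note: illustrative sketch, not part of the patch. The hunks above (trap.c, vm_machdep.c) and the headers added below (mutex.h, globals.h, globaldata.h) establish the conventions this change uses throughout: Giant is entered as a default (sleep) mutex around code that still assumes the single-threaded kernel, sched_lock is entered as a spin mutex for short scheduler sections, and per-cpu state is reached through the globaldata pointer (register $8/globalp in assembler, PCPU_GET()/PCPU_SET() from C). A minimal, hypothetical consumer of these interfaces might look like the following; the function name and the include lines are assumptions for illustration, not code from this commit.]

	#include <machine/mutex.h>	/* mtx_enter/mtx_exit, Giant, sched_lock (assumed install path of the new header) */
	#include <machine/globals.h>	/* PCPU_GET/PCPU_SET (assumed install path of the new header) */

	static void
	example_consumer(void)
	{
		/* Per-cpu state is reached via the globaldata pointer. */
		printf("example_consumer: on cpu %u\n", PCPU_GET(cpuno));

		/* Default (sleep) mutex around code that still expects Giant. */
		mtx_enter(&Giant, MTX_DEF);
		/* ... existing code that assumes the single kernel lock ... */
		mtx_exit(&Giant, MTX_DEF);

		/* Spin mutex for a short scheduler-related critical section. */
		mtx_enter(&sched_lock, MTX_SPIN);
		/* ... brief work; interrupts stay blocked on this cpu ... */
		mtx_exit(&sched_lock, MTX_SPIN);
	}
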
diff --git a/sys/alpha/include/cpu.h b/sys/alpha/include/cpu.h index c9d783b8b0ed..99eb79eef668 100644 --- a/sys/alpha/include/cpu.h +++ b/sys/alpha/include/cpu.h @@ -65,7 +65,7 @@ struct clockframe { #define CLKF_BASEPRI(framep) \ (((framep)->cf_tf.tf_regs[FRAME_PS] & ALPHA_PSL_IPL_MASK) == 0) #define CLKF_PC(framep) ((framep)->cf_tf.tf_regs[FRAME_PC]) -#define CLKF_INTR(framep) (intr_nesting_level >= 2) +#define CLKF_INTR(framep) (PCPU_GET(intr_nesting_level) >= 2) /* * Preempt the current process if in interrupt from user mode, @@ -89,9 +89,10 @@ struct clockframe { */ #define signotify(p) aston() -#define aston() (astpending = 1) +#define aston() PCPU_SET(astpending, 1) #ifdef _KERNEL +extern u_int astpending; extern u_int32_t intr_nesting_level; /* bookeeping only; counts sw intrs */ extern u_int32_t want_resched; /* resched() was called */ #endif @@ -132,7 +133,6 @@ struct reg; struct rpb; struct trapframe; -extern struct proc *fpcurproc; extern struct rpb *hwrpb; extern volatile int mc_expected, mc_received; diff --git a/sys/alpha/include/cpufunc.h b/sys/alpha/include/cpufunc.h index e7d37f0fb146..cabfe0fa88a8 100644 --- a/sys/alpha/include/cpufunc.h +++ b/sys/alpha/include/cpufunc.h @@ -33,6 +33,7 @@ #include #include +#include #ifdef __GNUC__ @@ -44,6 +45,33 @@ breakpoint(void) #endif +/* + * Bogus interrupt manipulation + */ +static __inline void +disable_intr(void) +{ + alpha_pal_swpipl(ALPHA_PSL_IPL_HIGH); +} + +static __inline void +enable_intr(void) +{ + alpha_pal_swpipl(ALPHA_PSL_IPL_0); +} + +static __inline u_int +save_intr(void) +{ + return alpha_pal_rdps() & ALPHA_PSL_IPL_MASK; +} + +static __inline void +restore_intr(u_int ipl) +{ + alpha_pal_swpipl(ipl); +} + #endif /* _KERNEL */ #endif /* !_MACHINE_CPUFUNC_H_ */ diff --git a/sys/alpha/include/globaldata.h b/sys/alpha/include/globaldata.h new file mode 100644 index 000000000000..b246bb1fb707 --- /dev/null +++ b/sys/alpha/include/globaldata.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#ifdef _KERNEL + +#include + +/* + * This structure maps out the global data that needs to be kept on a + * per-cpu basis. 
genassym uses this to generate offsets for the assembler + * code, which also provides external symbols so that C can get at them as + * though they were really globals. This structure is pointed to by + * the per-cpu system value (see alpha_pal_rdval() and alpha_pal_wrval()). + * Inside the kernel, the globally reserved register t7 is used to + * point at the globaldata structure. + */ +struct globaldata { + struct alpha_pcb gd_idlepcb; /* pcb for idling */ + struct proc *gd_curproc; /* current process */ + struct proc *gd_idleproc; /* idle process */ + struct proc *gd_fpcurproc; /* fp state owner */ + struct pcb *gd_curpcb; /* current pcb */ + struct timeval gd_switchtime; + int gd_switchticks; + u_int gd_cpuno; /* this cpu number */ + u_int gd_other_cpus; /* all other cpus */ + int gd_inside_intr; + u_int64_t gd_idlepcbphys; /* pa of gd_idlepcb */ + u_int64_t gd_pending_ipis; /* pending IPI events */ + u_int32_t gd_next_asn; /* next ASN to allocate */ + u_int32_t gd_current_asngen; /* ASN rollover check */ + u_int32_t gd_intr_nesting_level; /* interrupt recursion */ + + u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; +#ifdef KTR_PERCPU + volatile int gd_ktr_idx; /* Index into trace table */ + char *gd_ktr_buf; + char gd_ktr_buf_data[0]; +#endif +}; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + +void globaldata_init(struct globaldata *pcpu, int cpuno, size_t sz); +struct globaldata *globaldata_find(int cpuno); + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALDATA_H_ */ diff --git a/sys/alpha/include/globals.h b/sys/alpha/include/globals.h new file mode 100644 index 000000000000..303efdfe9f6a --- /dev/null +++ b/sys/alpha/include/globals.h @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALS_H_ +#define _MACHINE_GLOBALS_H_ + +#ifdef _KERNEL + +register struct globaldata *globalp __asm__("$8"); + +#if 1 +#define GLOBALP globalp +#else +#define GLOBALP ((struct globaldata *) alpha_pal_rdval()) +#endif + +#define PCPU_GET(name) (GLOBALP->gd_##name) +#define PCPU_SET(name,value) (GLOBALP->gd_##name = (value)) + +/* + * The following set of macros works for UP kernel as well, but for maximum + * performance we allow the global variables to be accessed directly. On the + * other hand, kernel modules should always use these macros to maintain + * portability between UP and SMP kernels. + */ +#define CURPROC PCPU_GET(curproc) +#define curproc PCPU_GET(curproc) +#define idleproc PCPU_GET(idleproc) +#define curpcb PCPU_GET(curpcb) +#define fpcurproc PCPU_GET(fpcurproc) +#define switchtime PCPU_GET(switchtime) +#define switchticks PCPU_GET(switchticks) +#define cpuid PCPU_GET(cpuno) +#define prevproc PCPU_GET(curproc) /* XXX - until ithreads */ + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALS_H_ */ diff --git a/sys/alpha/include/ipl.h b/sys/alpha/include/ipl.h index ea93fbb39d12..2e9b3cc7b7d9 100644 --- a/sys/alpha/include/ipl.h +++ b/sys/alpha/include/ipl.h @@ -127,4 +127,19 @@ extern void schedsoftclock(void); extern unsigned cpl; /* current priority level mask */ #endif +/* + * Interprocessor interrupts for SMP. + */ +#define IPI_INVLTLB 0x0001 +#define IPI_RENDEZVOUS 0x0002 +#define IPI_AST 0x0004 +#define IPI_CHECKSTATE 0x0008 +#define IPI_STOP 0x0010 + +void smp_ipi_selected(u_int32_t cpus, u_int64_t ipi); +void smp_ipi_all(u_int64_t ipi); +void smp_ipi_all_but_self(u_int64_t ipi); +void smp_ipi_self(u_int64_t ipi); +void smp_handle_ipi(struct trapframe *frame); + #endif /* !_MACHINE_MD_VAR_H_ */ diff --git a/sys/alpha/include/lock.h b/sys/alpha/include/lock.h index c2ae0fab148f..1066d461ff66 100644 --- a/sys/alpha/include/lock.h +++ b/sys/alpha/include/lock.h @@ -35,10 +35,40 @@ * It is an error to hold one of these locks while a process is sleeping. */ struct simplelock { - volatile int lock_data; + volatile u_int lock_data; }; +/* functions in mp_machdep.c */ +void s_lock_init __P((struct simplelock *)); +void s_lock __P((struct simplelock *)); +int s_lock_try __P((struct simplelock *)); +void ss_lock __P((struct simplelock *)); +void ss_unlock __P((struct simplelock *)); +void s_lock_np __P((struct simplelock *)); +void s_unlock_np __P((struct simplelock *)); + +/* inline simplelock functions */ +static __inline void +s_unlock(struct simplelock *lkp) +{ + alpha_mb(); + lkp->lock_data = 0; +} + +#if !defined(SIMPLELOCK_DEBUG) && NCPUS > 1 +/* + * This set of defines turns on the real functions in i386/isa/apic_ipl.s. + */ +#define simple_lock_init(alp) s_lock_init(alp) +#define simple_lock(alp) s_lock(alp) +#define simple_lock_try(alp) s_lock_try(alp) +#define simple_unlock(alp) s_unlock(alp) + +#endif /* !SIMPLELOCK_DEBUG && NCPUS > 1 */ + #define COM_LOCK() #define COM_UNLOCK() +#define COM_DISABLE_INTR() COM_LOCK() +#define COM_ENABLE_INTR() COM_UNLOCK() #endif /* !_MACHINE_LOCK_H_ */ diff --git a/sys/alpha/include/mutex.h b/sys/alpha/include/mutex.h new file mode 100644 index 000000000000..ac13b8cbde0e --- /dev/null +++ b/sys/alpha/include/mutex.h @@ -0,0 +1,563 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $ + * $FreeBSD$ + */ + + +#ifndef _MACHINE_MUTEX_H_ +#define _MACHINE_MUTEX_H_ + +#ifndef LOCORE + +#include +#include +#include +#include + +/* + * Mutex flags + * + * Types + */ +#define MTX_DEF 0x1 /* Default (spin/sleep) */ +#define MTX_SPIN 0x2 /* Spin only lock */ + +/* Options */ +#define MTX_RLIKELY 0x4 /* (opt) Recursion likely */ +#define MTX_NORECURSE 0x8 /* No recursion possible */ +#define MTX_NOSPIN 0x10 /* Don't spin before sleeping */ +#define MTX_NOSWITCH 0x20 /* Do not switch on release */ +#define MTX_FIRST 0x40 /* First spin lock holder */ +#define MTX_TOPHALF 0x80 /* Interrupts not disabled on spin */ + +/* options that should be passed on to mtx_enter_hard, mtx_exit_hard */ +#define MTX_HARDOPTS (MTX_DEF | MTX_SPIN | MTX_FIRST | MTX_TOPHALF | MTX_NOSWITCH) + +/* Flags/value used in mtx_lock */ +#define MTX_RECURSE 0x01 /* (non-spin) lock held recursively */ +#define MTX_CONTESTED 0x02 /* (non-spin) lock contested */ +#define MTX_FLAGMASK ~(MTX_RECURSE | MTX_CONTESTED) +#define MTX_UNOWNED 0x8 /* Cookie for free mutex */ + +struct proc; /* XXX */ + +/* + * Sleep/spin mutex + */ +struct mtx { + volatile u_int64_t mtx_lock; /* lock owner/gate/flags */ + volatile u_int32_t mtx_recurse; /* number of recursive holds */ + u_int32_t mtx_saveipl; /* saved ipl (for spin locks) */ + char *mtx_description; + TAILQ_HEAD(, proc) mtx_blocked; + LIST_ENTRY(mtx) mtx_contested; + struct mtx *mtx_next; /* all locks in system */ + struct mtx *mtx_prev; +#ifdef SMP_DEBUG + /* If you add anything here, adjust the mtxf_t definition below */ + struct witness *mtx_witness; + LIST_ENTRY(mtx) mtx_held; + char *mtx_file; + int mtx_line; +#endif /* SMP_DEBUG */ +}; + +typedef struct mtx mtx_t; + +/* + * Filler for structs which need to remain the same size + * whether or not SMP_DEBUG is turned on. 
+ */ +typedef struct mtxf { +#ifdef SMP_DEBUG + char mtxf_data[0]; +#else + char mtxf_data[4*sizeof(void *) + sizeof(int)]; +#endif +} mtxf_t; + +#define mp_fixme(string) + +#ifdef _KERNEL +/* Misc */ +#define CURTHD ((u_int64_t)CURPROC) /* Current thread ID */ + +/* Prototypes */ +void mtx_init(mtx_t *m, char *description, int flag); +void mtx_enter_hard(mtx_t *, int type, int ipl); +void mtx_exit_hard(mtx_t *, int type); +void mtx_destroy(mtx_t *m); + +/* Global locks */ +extern mtx_t sched_lock; +extern mtx_t Giant; + +/* + * Used to replace return with an exit Giant and return. + */ + +#define EGAR(a) \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return (a); \ +} while (0) + +#define VEGAR \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return; \ +} while (0) + +#define DROP_GIANT() \ +do { \ + int _giantcnt; \ + WITNESS_SAVE_DECL(Giant); \ + \ + WITNESS_SAVE(&Giant, Giant); \ + for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++) \ + mtx_exit(&Giant, MTX_DEF) + +#define PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant); \ +} while (0) + +#define PARTIAL_PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant) + + +/* + * Debugging + */ +#ifndef SMP_DEBUG +#define mtx_assert(m, what) +#else /* SMP_DEBUG */ + +#define MA_OWNED 1 +#define MA_NOTOWNED 2 +#define mtx_assert(m, what) { \ + switch ((what)) { \ + case MA_OWNED: \ + ASS(mtx_owned((m))); \ + break; \ + case MA_NOTOWNED: \ + ASS(!mtx_owned((m))); \ + break; \ + default: \ + panic("unknown mtx_assert at %s:%d", __FILE__, __LINE__); \ + } \ +} + +#ifdef INVARIANTS +#define ASS(ex) MPASS(ex) +#define MPASS(ex) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + #ex, __FILE__, __LINE__) +#define MPASS2(ex, what) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + what, __FILE__, __LINE__) + +#ifdef MTX_STRS +char STR_IEN[] = "fl & 0x200"; +char STR_IDIS[] = "!(fl & 0x200)"; +#else /* MTX_STRS */ +extern char STR_IEN[]; +extern char STR_IDIS[]; +#endif /* MTX_STRS */ +#define ASS_IEN MPASS2((alpha_pal_rdps & ALPHA_PSL_IPL_MASK) + == ALPHA_PSL_IPL_HIGH, STR_IEN) +#define ASS_IDIS MPASS2((alpha_pal_rdps & ALPHA_PSL_IPL_MASK) + != ALPHA_PSL_IPL_HIGH, STR_IDIS) +#endif /* INVARIANTS */ + +#endif /* SMP_DEBUG */ + +#if !defined(SMP_DEBUG) || !defined(INVARIANTS) +#define ASS(ex) +#define MPASS(ex) +#define MPASS2(ex, where) +#define ASS_IEN +#define ASS_IDIS +#endif /* !defined(SMP_DEBUG) || !defined(INVARIANTS) */ + +#ifdef WITNESS +#ifndef SMP_DEBUG +#error WITNESS requires SMP_DEBUG +#endif /* SMP_DEBUG */ +#define WITNESS_ENTER(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_enter((m), (f), __FILE__, __LINE__) +#define WITNESS_EXIT(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_exit((m), (f), __FILE__, __LINE__) + +#define WITNESS_SLEEP(check, m) witness_sleep(check, (m), __FILE__, __LINE__) +#define WITNESS_SAVE_DECL(n) \ + char * __CONCAT(n, __wf); \ + int __CONCAT(n, __wl) + +#define WITNESS_SAVE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_save(m, &__CONCAT(n, __wf), &__CONCAT(n, __wl)); \ +} while (0) + +#define WITNESS_RESTORE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_restore(m, __CONCAT(n, __wf), __CONCAT(n, __wl)); \ +} while (0) + +void witness_init(mtx_t *, int flag); +void witness_destroy(mtx_t *); +void witness_enter(mtx_t *, int, char *, int); +void witness_try_enter(mtx_t *, int, char *, int); +void 
witness_exit(mtx_t *, int, char *, int); +void witness_display(void(*)(const char *fmt, ...)); +void witness_list(struct proc *); +int witness_sleep(int, mtx_t *, char *, int); +void witness_save(mtx_t *, char **, int *); +void witness_restore(mtx_t *, char *, int); +#else /* WITNESS */ +#define WITNESS_ENTER(m, flag) +#define WITNESS_EXIT(m, flag) +#define WITNESS_SLEEP(check, m) +#define WITNESS_SAVE_DECL(n) +#define WITNESS_SAVE(m, n) +#define WITNESS_RESTORE(m, n) + +/* + * flag++ is slezoid way of shutting up unused parameter warning + * in mtx_init() + */ +#define witness_init(m, flag) flag++ +#define witness_destroy(m) +#define witness_enter(m, flag, f, l) +#define witness_try_enter(m, flag, f, l ) +#define witness_exit(m, flag, f, l) +#endif /* WITNESS */ + +/* + * Assembly macros (for internal use only) + *-------------------------------------------------------------------------- + */ + +/* + * Get a sleep lock, deal with recursion inline + */ + +#define _V(x) __STRING(x) + +#define _getlock_sleep(mp, tid, type) do { \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) { \ + if (((mp)->mtx_lock & MTX_FLAGMASK) != (tid)) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, 0); \ + else { \ + if (((mp)->mtx_lock & MTX_RECURSE) == 0) \ + atomic_set_64(&(mp)->mtx_lock, MTX_RECURSE); \ + (mp)->mtx_recurse++; \ + } \ + } else { \ + alpha_mb(); \ + } \ +} while (0) + +/* + * Get a spin lock, handle recusion inline (as the less common case) + */ + +#define _getlock_spin_block(mp, tid, type) do { \ + u_int _ipl = alpha_pal_rdps() & ALPHA_PSL_IPL_MASK; \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, _ipl); \ + else { \ + alpha_mb(); \ + (mp)->mtx_saveipl = _ipl; \ + } \ +} while (0) + +/* + * Get a lock without any recursion handling. Calls the hard enter + * function if we can't get it inline. + */ + +#define _getlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard((mp), (type) & MTX_HARDOPTS, 0); \ + else \ + alpha_mb(); \ +} while (0) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is + * handled in the hard function. + */ + +#define _exitlock_norecurse(mp, tid, type) do { \ + alpha_mb(); \ + if (atomic_cmpset_64(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ +} while (0) + +/* + * Release a sleep lock when its likely we recursed (the code to + * deal with simple recursion is inline). 
+ */ + +#define _exitlock(mp, tid, type) do { \ + alpha_mb(); \ + if (atomic_cmpset_64(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) {\ + if (((mp)->mtx_lock & MTX_RECURSE) && \ + (--(mp)->mtx_recurse == 0)) \ + atomic_clear_64(&(mp)->mtx_lock, MTX_RECURSE); \ + else \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ + } \ +} while (0) + +/* + * Release a spin lock (with possible recursion) + */ + +#define _exitlock_spin(mp) do { \ + int _ipl = (mp)->mtx_saveipl; \ + alpha_mb(); \ + if ((mp)->mtx_recurse == 0 || (--(mp)->mtx_recurse) == 0) \ + atomic_cmpset_64(&(mp)->mtx_lock, (mp)->mtx_lock, \ + MTX_UNOWNED); \ + alpha_pal_swpipl(_ipl); \ +} while (0) + +/* + * Externally visible mutex functions + *------------------------------------------------------------------------ + */ + +/* + * Return non-zero if a mutex is already owned by the current thread + */ +#define mtx_owned(m) (((m)->mtx_lock & MTX_FLAGMASK) == CURTHD) + +/* Common strings */ +#ifdef MTX_STRS +char STR_mtx_enter_fmt[] = "GOT %s [%p] at %s:%d r=%d"; +char STR_mtx_bad_type[] = "((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0"; +char STR_mtx_exit_fmt[] = "REL %s [%p] at %s:%d r=%d"; +char STR_mtx_owned[] = "mtx_owned(_mpp)"; +char STR_mtx_recurse[] = "_mpp->mtx_recurse == 0"; +char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%p] at %s:%d result=%d"; +#else /* MTX_STRS */ +extern char STR_mtx_enter_fmt[]; +extern char STR_mtx_bad_type[]; +extern char STR_mtx_exit_fmt[]; +extern char STR_mtx_owned[]; +extern char STR_mtx_recurse[]; +extern char STR_mtx_try_enter_fmt[]; +#endif /* MTX_STRS */ + +/* + * Get lock 'm', the macro handles the easy (and most common cases) and + * leaves the slow stuff to the mtx_enter_hard() function. + * + * Note: since type is usually a constant much of this code is optimized out + */ +#define mtx_enter(mtxp, type) do { \ + mtx_t * _mpp = mtxp; \ + \ + /* bits only valid on mtx_exit() */ \ + MPASS2(((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0, STR_mtx_bad_type); \ + \ + do { \ + if ((type) & MTX_SPIN) { \ + /* \ + * Easy cases of spin locks: \ + * \ + * 1) We already own the lock and will simply \ + * recurse on it (if RLIKELY) \ + * \ + * 2) The lock is free, we just get it \ + */ \ + if ((type) & MTX_RLIKELY) { \ + /* \ + * Check for recursion, if we already \ + * have this lock we just bump the \ + * recursion count. \ + */ \ + if (_mpp->mtx_lock == CURTHD) { \ + _mpp->mtx_recurse++; \ + break; /* Done */ \ + } \ + } \ + \ + if (((type) & MTX_TOPHALF) == 0) \ + /* \ + * If an interrupt thread uses this \ + * we must block interrupts here. 
\ + */ \ + _getlock_spin_block(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _getlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } else { \ + /* Sleep locks */ \ + if ((type) & MTX_RLIKELY) \ + _getlock_sleep(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _getlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } \ + } while (0); \ + WITNESS_ENTER(_mpp, type); \ + CTR5(KTR_LOCK, STR_mtx_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + (_mpp)->mtx_recurse); \ +} while (0) + +/* + * Attempt to get MTX_DEF lock, return non-zero if lock acquired + * + * XXX DOES NOT HANDLE RECURSION + */ +#ifdef SMP_DEBUG +#define mtx_try_enter(mtxp, type) ({ \ + mtx_t *const _mpp = mtxp; \ + int _rval; \ + \ + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD);\ + if (_rval && (_mpp)->mtx_witness != NULL) { \ + ASS((_mpp)->mtx_recurse == 0); \ + witness_try_enter(_mpp, type, __FILE__, __LINE__); \ + } \ + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + _rval); \ + _rval; \ +}) + +#else /* SMP_DEBUG */ + +#define mtx_try_enter(mtxp, type) ({ \ + mtx_t *const _mpp = mtxp; \ + int _rval; \ + \ + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD);\ + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + _rval); \ + _rval; \ +}) + +#endif /* SMP_DEBUG */ + +#if 0 +#define mtx_legal2block() ({ \ + register int _l2b; \ + __asm __volatile ( \ +" pushfl;" \ +" popl %%eax;" \ +" andl $0x200, %%eax;" \ + : "=a" (_l2b) \ + : \ + : "cc"); \ + _l2b; \ +}) +#endif + +#define mtx_legal2block() (read_eflags() & 0x200) + +/* + * Release lock m + */ +#define mtx_exit(mtxp, type) do { \ + mtx_t *const _mpp = mtxp; \ + \ + MPASS2(mtx_owned(_mpp), STR_mtx_owned); \ + WITNESS_EXIT(_mpp, type); \ + CTR5(KTR_LOCK, STR_mtx_exit_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + (_mpp)->mtx_recurse); \ + if ((type) & MTX_SPIN) { \ + if ((type) & MTX_NORECURSE) { \ + MPASS2(_mpp->mtx_recurse == 0, STR_mtx_recurse); \ + atomic_cmpset_64(&_mpp->mtx_lock, _mpp->mtx_lock, \ + MTX_UNOWNED); \ + if (((type) & MTX_TOPHALF) == 0) { \ + splx(_mpp->mtx_saveipl); \ + } \ + } else \ + if ((type) & MTX_TOPHALF) \ + _exitlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _exitlock_spin(_mpp); \ + } else { \ + /* Handle sleep locks */ \ + if ((type) & MTX_RLIKELY) \ + _exitlock(_mpp, CURTHD, (type) & MTX_HARDOPTS); \ + else \ + _exitlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } \ +} while (0) +#endif /* _KERNEL */ + +#else /* !LOCORE */ + +/* + * Simple assembly macros to get and release non-recursive spin locks + */ +#define MTX_ENTER(lck) \ + call_pal PAL_OSF1_rdps; \ + and v0, ALPHA_PSL_IPL_MASK, v0; \ +1: ldq_l a0, lck+MTX_LOCK; \ + cmpeq a0, MTX_UNOWNED, a1; \ + beq a1, 1b; \ + ldq a0, PC_CURPROC(globalp); \ + stq_c a0, lck+MTX_LOCK; \ + beq a0, 1b; \ + mb; \ + stl v0, lck+MTX_SAVEIPL; \ + ldq a0, ALPHA_PSL_IPL_HIGH; \ + call_pal PSL_OSF1_swpipl + +#define MTX_EXIT(lck) \ + mb; \ + ldiq a0, MTX_UNOWNED; \ + stq a0, lck+MTX_LOCK; \ + ldl a0, lck+MTX_SAVEIPL; \ + call_pal PAL_OSF1_swpipl + +#endif /* !LOCORE */ + +#endif /* __MACHINE_MUTEX_H */ diff --git a/sys/alpha/include/param.h b/sys/alpha/include/param.h index 80dce22b47b8..742a3f793b8b 100644 --- a/sys/alpha/include/param.h +++ b/sys/alpha/include/param.h @@ -70,7 +70,11 @@ #define OBJFORMAT_NAMES "elf" #define OBJFORMAT_DEFAULT "elf" +#ifdef SMP +#define NCPUS 
32 +#else #define NCPUS 1 +#endif /* * Round p (pointer or byte index) up to a correctly-aligned value for all diff --git a/sys/alpha/include/pcb.h b/sys/alpha/include/pcb.h index 3caa144f8e4f..3bf25860e085 100644 --- a/sys/alpha/include/pcb.h +++ b/sys/alpha/include/pcb.h @@ -30,7 +30,7 @@ #include #include - +#include #include /* @@ -53,6 +53,7 @@ struct pcb { u_int64_t pcb_fp_control; /* IEEE control word [SW] */ unsigned long pcb_onfault; /* for copy faults [SW] */ unsigned long pcb_accessaddr; /* for [fs]uswintr [SW] */ + u_int32_t pcb_schednest; /* state of sched_lock [SW] */ }; /* @@ -64,3 +65,9 @@ struct md_coredump { struct trapframe md_tf; struct fpreg md_fpstate; }; + +#ifdef _KERNEL +#ifndef curpcb +extern struct pcb *curpcb; /* our current running pcb */ +#endif +#endif diff --git a/sys/alpha/include/pcpu.h b/sys/alpha/include/pcpu.h new file mode 100644 index 000000000000..b246bb1fb707 --- /dev/null +++ b/sys/alpha/include/pcpu.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#ifdef _KERNEL + +#include + +/* + * This structure maps out the global data that needs to be kept on a + * per-cpu basis. genassym uses this to generate offsets for the assembler + * code, which also provides external symbols so that C can get at them as + * though they were really globals. This structure is pointed to by + * the per-cpu system value (see alpha_pal_rdval() and alpha_pal_wrval()). + * Inside the kernel, the globally reserved register t7 is used to + * point at the globaldata structure. 
+ */ +struct globaldata { + struct alpha_pcb gd_idlepcb; /* pcb for idling */ + struct proc *gd_curproc; /* current process */ + struct proc *gd_idleproc; /* idle process */ + struct proc *gd_fpcurproc; /* fp state owner */ + struct pcb *gd_curpcb; /* current pcb */ + struct timeval gd_switchtime; + int gd_switchticks; + u_int gd_cpuno; /* this cpu number */ + u_int gd_other_cpus; /* all other cpus */ + int gd_inside_intr; + u_int64_t gd_idlepcbphys; /* pa of gd_idlepcb */ + u_int64_t gd_pending_ipis; /* pending IPI events */ + u_int32_t gd_next_asn; /* next ASN to allocate */ + u_int32_t gd_current_asngen; /* ASN rollover check */ + u_int32_t gd_intr_nesting_level; /* interrupt recursion */ + + u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; +#ifdef KTR_PERCPU + volatile int gd_ktr_idx; /* Index into trace table */ + char *gd_ktr_buf; + char gd_ktr_buf_data[0]; +#endif +}; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + +void globaldata_init(struct globaldata *pcpu, int cpuno, size_t sz); +struct globaldata *globaldata_find(int cpuno); + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALDATA_H_ */ diff --git a/sys/alpha/include/pmap.h b/sys/alpha/include/pmap.h index 134c9a2d09fd..de59b66feae5 100644 --- a/sys/alpha/include/pmap.h +++ b/sys/alpha/include/pmap.h @@ -174,9 +174,11 @@ struct pmap { TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ int pm_count; /* reference count */ int pm_flags; /* pmap flags */ - int pm_active; /* active flag */ - int pm_asn; /* address space number */ - u_int pm_asngen; /* generation number of pm_asn */ + u_int32_t pm_active; /* active cpus */ + struct { + u_int32_t asn:8; /* address space number */ + u_int32_t gen:24; /* generation number */ + } pm_asn[NCPUS]; struct pmap_statistics pm_stats; /* pmap statistics */ struct vm_page *pm_ptphint; /* pmap ptp hint */ }; diff --git a/sys/alpha/include/proc.h b/sys/alpha/include/proc.h index 502b607955e4..d003816ab0b1 100644 --- a/sys/alpha/include/proc.h +++ b/sys/alpha/include/proc.h @@ -28,6 +28,12 @@ * rights to redistribute these changes. */ +#ifndef _MACHINE_PROC_H_ +#define _MACHINE_PROC_H_ + +#include +#include + /* * Machine-dependent part of the proc struct for the Alpha. */ @@ -55,3 +61,5 @@ struct mdproc { #define MDP_UAC_SIGBUS 0x0040 /* Deliver SIGBUS upon unaligned access */ #define MDP_UAC_MASK (MDP_UAC_NOPRINT | MDP_UAC_NOFIX | MDP_UAC_SIGBUS) + +#endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/alpha/include/rpb.h b/sys/alpha/include/rpb.h index 1f2f884ec6b4..0be0775563da 100644 --- a/sys/alpha/include/rpb.h +++ b/sys/alpha/include/rpb.h @@ -219,7 +219,8 @@ struct rpb { * PCS: Per-CPU information. 
*/ struct pcs { - u_int8_t pcs_hwpcb[128]; /* 0: PAL dependent */ + + u_int64_t pcs_hwpcb[16]; /* 0: PAL dependent */ #define PCS_BIP 0x000001 /* boot in progress */ #define PCS_RC 0x000002 /* restart possible */ @@ -238,12 +239,12 @@ struct pcs { #define PCS_HALT_WARM_BOOT 0x030000 #define PCS_HALT_STAY_HALTED 0x040000 #define PCS_mbz 0xffffffffff000000 /* 24:63 -- must be zero */ - u_int64_t pcs_flags; /* 80: */ + u_int64_t pcs_flags; /* 128: */ - u_int64_t pcs_pal_memsize; /* 88: PAL memory size */ - u_int64_t pcs_pal_scrsize; /* 90: PAL scratch size */ - vm_offset_t pcs_pal_memaddr; /* 98: PAL memory addr */ - vm_offset_t pcs_pal_scraddr; /* A0: PAL scratch addr */ + u_int64_t pcs_pal_memsize; /* 136: PAL memory size */ + u_int64_t pcs_pal_scrsize; /* 144: PAL scratch size */ + vm_offset_t pcs_pal_memaddr; /* 152: PAL memory addr */ + vm_offset_t pcs_pal_scraddr; /* 160: PAL scratch addr */ struct { u_int64_t minorrev : 8, /* alphabetic char 'a' - 'z' */ @@ -261,14 +262,14 @@ struct pcs { sbz1 : 8, compatibility : 16, /* Compatibility revision */ proc_cnt : 16; /* Processor count */ - } pcs_pal_rev; /* A8: */ + } pcs_pal_rev; /* 168: */ #define pcs_minorrev pcs_pal_rev.minorrev #define pcs_majorrev pcs_pal_rev.majorrev #define pcs_pal_type pcs_pal_rev.pal_type #define pcs_compatibility pcs_pal_rev.compatibility #define pcs_proc_cnt pcs_pal_rev.proc_cnt - u_int64_t pcs_proc_type; /* B0: processor type */ + u_int64_t pcs_proc_type; /* 176: processor type */ #define PCS_PROC_MAJOR 0x00000000ffffffff #define PCS_PROC_MAJORSHIFT 0 @@ -288,23 +289,23 @@ struct pcs { /* Minor number interpretation is processor specific. See cpu.c. */ - u_int64_t pcs_proc_var; /* B8: processor variation. */ + u_int64_t pcs_proc_var; /* 184: processor variation. */ #define PCS_VAR_VAXFP 0x0000000000000001 /* VAX FP support */ #define PCS_VAR_IEEEFP 0x0000000000000002 /* IEEE FP support */ #define PCS_VAR_PE 0x0000000000000004 /* Primary Eligible */ #define PCS_VAR_RESERVED 0xfffffffffffffff8 /* Reserved */ - char pcs_proc_revision[8]; /* C0: only first 4 valid */ - char pcs_proc_sn[16]; /* C8: only first 10 valid */ - vm_offset_t pcs_machcheck; /* D8: mach chk phys addr. */ - u_int64_t pcs_machcheck_len; /* E0: length in bytes */ - vm_offset_t pcs_halt_pcbb; /* E8: phys addr of halt PCB */ - vm_offset_t pcs_halt_pc; /* F0: halt PC */ - u_int64_t pcs_halt_ps; /* F8: halt PS */ - u_int64_t pcs_halt_r25; /* 100: halt argument list */ - u_int64_t pcs_halt_r26; /* 108: halt return addr list */ - u_int64_t pcs_halt_r27; /* 110: halt procedure value */ + char pcs_proc_revision[8]; /* 192: only first 4 valid */ + char pcs_proc_sn[16]; /* 200: only first 10 valid */ + vm_offset_t pcs_machcheck; /* 216: mach chk phys addr. 
*/ + u_int64_t pcs_machcheck_len; /* 224: length in bytes */ + vm_offset_t pcs_halt_pcbb; /* 232: pa of halt PCB */ + vm_offset_t pcs_halt_pc; /* 240: halt PC */ + u_int64_t pcs_halt_ps; /* 248: halt PS */ + u_int64_t pcs_halt_r25; /* 256: halt argument list */ + u_int64_t pcs_halt_r26; /* 264: halt ra list */ + u_int64_t pcs_halt_r27; /* 272: halt procedure value */ #define PCS_HALT_RESERVED 0 #define PCS_HALT_POWERUP 1 @@ -315,17 +316,22 @@ struct pcs { #define PCS_HALT_DOUBLE_ERROR_ABORT 6 #define PCS_HALT_SCBB 7 #define PCS_HALT_PTBR 8 /* 9-FF: reserved */ - u_int64_t pcs_halt_reason; /* 118: */ + u_int64_t pcs_halt_reason; /* 280: */ - u_int64_t pcs_reserved_soft; /* 120: preserved software */ - u_int64_t pcs_buffer[21]; /* 128: console buffers */ + u_int64_t pcs_reserved_soft; /* 288: preserved software */ + struct { + u_int32_t rxlen; + u_int32_t txlen; + char rxbuf[80]; + char txbuf[80]; + } pcs_buffer; /* 296: console buffers */ #define PALvar_reserved 0 #define PALvar_OpenVMS 1 #define PALvar_OSF1 2 - u_int64_t pcs_palrevisions[16]; /* 1D0: PALcode revisions */ + u_int64_t pcs_palrevisions[16]; /* 464: PALcode revisions */ - u_int64_t pcs_reserved_arch[6]; /* 250: reserved arch */ + u_int64_t pcs_reserved_arch[6]; /* 592: reserved arch */ }; /* diff --git a/sys/alpha/include/smp.h b/sys/alpha/include/smp.h index 48d6737759a5..00aec6a955d4 100644 --- a/sys/alpha/include/smp.h +++ b/sys/alpha/include/smp.h @@ -1,10 +1,57 @@ /* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * * $FreeBSD$ + * */ + #ifndef _MACHINE_SMP_H_ #define _MACHINE_SMP_H_ -#define get_mplock() { } -#define rel_mplock() { } +#ifdef _KERNEL -#endif +#include +#include +#include + +#ifndef LOCORE + +#define BETTER_CLOCK /* unconditional on alpha */ + +/* global data in mp_machdep.c */ +extern volatile u_int checkstate_probed_cpus; +extern volatile u_int checkstate_need_ast; +extern volatile u_int resched_cpus; +extern void (*cpustop_restartfunc) __P((void)); + +extern int smp_active; +extern int mp_ncpus; +extern u_int all_cpus; +extern u_int started_cpus; +extern u_int stopped_cpus; + +/* functions in mp_machdep.c */ +void mp_start(void); +void mp_announce(void); +void smp_invltlb(void); +void forward_statclock(int pscnt); +void forward_hardclock(int pscnt); +void forward_signal(struct proc *); +void forward_roundrobin(void); +int stop_cpus(u_int); +int restart_cpus(u_int); +void smp_rendezvous_action(void); +void smp_rendezvous(void (*)(void *), + void (*)(void *), + void (*)(void *), + void *arg); +void smp_init_secondary(void); + +#endif /* !LOCORE */ +#endif /* _KERNEL */ +#endif /* _MACHINE_SMP_H_ */ diff --git a/sys/amd64/amd64/amd64-gdbstub.c b/sys/amd64/amd64/amd64-gdbstub.c index 986b8d4daa1f..b442a377c44f 100644 --- a/sys/amd64/amd64/amd64-gdbstub.c +++ b/sys/amd64/amd64/amd64-gdbstub.c @@ -188,7 +188,8 @@ getpacket (char *buffer) unsigned char ch; int s; - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* wait around for the start character, ignore all other characters */ @@ -239,7 +240,7 @@ getpacket (char *buffer) } } while (checksum != xmitcsum); - splx (s); + write_eflags(s); } /* send the packet in buffer. 
*/ @@ -253,7 +254,8 @@ putpacket (char *buffer) int s; /* $#. */ - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* @@ -285,7 +287,7 @@ putpacket (char *buffer) putDebugChar (hexchars[checksum & 0xf]); } while ((getDebugChar () & 0x7f) != '+'); - splx (s); + write_eflags(s); } static char remcomInBuffer[BUFMAX]; diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 2a7559df7f97..54bf00366c81 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -17,7 +17,7 @@ /* - * Macros for interrupt interrupt entry, call to handler, and exit. + * Macros for interrupt entry, call to handler, and exit. */ #define FAST_INTR(irq_num, vec_name) \ @@ -121,7 +121,7 @@ IDTVEC(vec_name) ; \ /* - * Test to see if the source is currntly masked, clear if so. + * Test to see if the source is currently masked, clear if so. */ #define UNMASK_IRQ(irq_num) \ IMASK_LOCK ; /* into critical reg */ \ @@ -200,7 +200,16 @@ log_intr_event: #else #define APIC_ITRACE(name, irq_num, id) #endif - + +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -216,87 +225,24 @@ IDTVEC(vec_name) ; \ maybe_extra_ipending ; \ ; \ APIC_ITRACE(apic_itrace_enter, irq_num, APIC_ITRACE_ENTER) ; \ - lock ; /* MP-safe */ \ - btsl $(irq_num), iactive ; /* lazy masking */ \ - jc 1f ; /* already active */ \ ; \ MASK_LEVEL_IRQ(irq_num) ; \ EOI_IRQ(irq_num) ; \ 0: ; \ - APIC_ITRACE(apic_itrace_tryisrlock, irq_num, APIC_ITRACE_TRYISRLOCK) ;\ - MP_TRYLOCK ; /* XXX this is going away... */ \ - testl %eax, %eax ; /* did we get it? */ \ - jz 3f ; /* no */ \ -; \ - APIC_ITRACE(apic_itrace_gotisrlock, irq_num, APIC_ITRACE_GOTISRLOCK) ;\ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 2f ; /* this INT masked */ \ -; \ incb _intr_nesting_level ; \ ; \ /* entry point used by doreti_unpend for HWIs. 
*/ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX avoid dbl cnt */ \ - lock ; incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4, %eax ; \ - lock ; incl (%eax) ; \ -; \ - movl _cpl, %eax ; \ - pushl %eax ; \ - orl _intr_mask + (irq_num) * 4, %eax ; \ - movl %eax, _cpl ; \ - lock ; \ - andl $~IRQ_BIT(irq_num), _ipending ; \ -; \ - pushl _intr_unit + (irq_num) * 4 ; \ + pushl $irq_num; /* pass the IRQ */ \ APIC_ITRACE(apic_itrace_enter2, irq_num, APIC_ITRACE_ENTER2) ; \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ APIC_ITRACE(apic_itrace_leave, irq_num, APIC_ITRACE_LEAVE) ; \ ; \ - lock ; andl $~IRQ_BIT(irq_num), iactive ; \ - UNMASK_IRQ(irq_num) ; \ - APIC_ITRACE(apic_itrace_unmask, irq_num, APIC_ITRACE_UNMASK) ; \ - sti ; /* doreti repeats cli/sti */ \ MEXITCOUNT ; \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -1: ; /* active */ \ - APIC_ITRACE(apic_itrace_active, irq_num, APIC_ITRACE_ACTIVE) ; \ - MASK_IRQ(irq_num) ; \ - EOI_IRQ(irq_num) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - lock ; \ - btsl $(irq_num), iactive ; /* still active */ \ - jnc 0b ; /* retry */ \ - POP_FRAME ; \ - iret ; /* XXX: iactive bit might be 0 now */ \ - ALIGN_TEXT ; \ -2: ; /* masked by cpl, leave iactive set */ \ - APIC_ITRACE(apic_itrace_masked, irq_num, APIC_ITRACE_MASKED) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - MP_RELLOCK ; \ - POP_FRAME ; \ - iret ; \ - ALIGN_TEXT ; \ -3: ; /* other cpu has isr lock */ \ - APIC_ITRACE(apic_itrace_noisrlock, irq_num, APIC_ITRACE_NOISRLOCK) ;\ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 4f ; /* this INT masked */ \ - call forward_irq ; /* forward irq to lock holder */ \ - POP_FRAME ; /* and return */ \ - iret ; \ - ALIGN_TEXT ; \ -4: ; /* blocked */ \ - APIC_ITRACE(apic_itrace_masked2, irq_num, APIC_ITRACE_MASKED2) ;\ - POP_FRAME ; /* and return */ \ - iret + jmp doreti_next /* * Handle "spurious INTerrupts". @@ -434,20 +380,10 @@ _Xcpuast: FAKE_MCOUNT(13*4(%esp)) - /* - * Giant locks do not come cheap. - * A lot of cycles are going to be wasted here. - */ - call _get_mplock - - movl _cpl, %eax - pushl %eax orl $AST_PENDING, _astpending /* XXX */ incb _intr_nesting_level sti - pushl $0 - movl _cpuid, %eax lock btrl %eax, _checkstate_pending_ast @@ -461,7 +397,7 @@ _Xcpuast: lock incl CNAME(cpuast_cnt) MEXITCOUNT - jmp _doreti + jmp doreti_next 1: /* We are already in the process of delivering an ast for this CPU */ POP_FRAME @@ -487,40 +423,24 @@ _Xforward_irq: FAKE_MCOUNT(13*4(%esp)) - MP_TRYLOCK - testl %eax,%eax /* Did we get the lock ? 
*/ - jz 1f /* No */ - lock incl CNAME(forward_irq_hitcnt) cmpb $4, _intr_nesting_level - jae 2f + jae 1f - movl _cpl, %eax - pushl %eax incb _intr_nesting_level sti - pushl $0 - MEXITCOUNT - jmp _doreti /* Handle forwarded interrupt */ + jmp doreti_next /* Handle forwarded interrupt */ 1: - lock - incl CNAME(forward_irq_misscnt) - call forward_irq /* Oops, we've lost the isr lock */ - MEXITCOUNT - POP_FRAME - iret -2: lock incl CNAME(forward_irq_toodeepcnt) -3: - MP_RELLOCK MEXITCOUNT POP_FRAME iret +#if 0 /* * */ @@ -532,9 +452,11 @@ forward_irq: cmpl $0, CNAME(forward_irq_enabled) jz 4f +/* XXX - this is broken now, because mp_lock doesn't exist movl _mp_lock,%eax cmpl $FREE_LOCK,%eax jne 1f + */ movl $0, %eax /* Pick CPU #0 if noone has lock */ 1: shrl $24,%eax @@ -559,6 +481,7 @@ forward_irq: jnz 3b 4: ret +#endif /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, @@ -654,6 +577,7 @@ MCOUNT_LABEL(bintr) FAST_INTR(22,fastintr22) FAST_INTR(23,fastintr23) #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, CLKINTR_PENDING) INTR(1,intr1,) INTR(2,intr2,) @@ -728,15 +652,11 @@ _ihandlers: .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - +#if 0 /* active flag for lazy masking */ iactive: .long 0 +#endif #ifdef COUNT_XINVLTLB_HITS .globl _xhits diff --git a/sys/amd64/amd64/autoconf.c b/sys/amd64/amd64/autoconf.c index b209065027d6..4edda4bdcab5 100644 --- a/sys/amd64/amd64/autoconf.c +++ b/sys/amd64/amd64/autoconf.c @@ -163,14 +163,6 @@ configure(dummy) * XXX this is slightly misplaced. */ spl0(); - - /* - * Allow lowering of the ipl to the lowest kernel level if we - * panic (or call tsleep() before clearing `cold'). No level is - * completely safe (since a panic may occur in a critical region - * at splhigh()), but we want at least bio interrupts to work. - */ - safepri = cpl; } static void diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index c895fefa8c15..db56a1b40af6 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -73,189 +73,6 @@ _tlb_flush_count: .long 0 .text -/* - * When no processes are on the runq, cpu_switch() branches to _idle - * to wait for something to come ready. - */ - ALIGN_TEXT - .type _idle,@function -_idle: - xorl %ebp,%ebp - movl %ebp,_switchtime - -#ifdef SMP - - /* when called, we have the mplock, intr disabled */ - /* use our idleproc's "context" */ - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: - /* Keep space for nonexisting return addr, or profiling bombs */ - movl $gd_idlestack_top-4, %ecx - addl %fs:0, %ecx - movl %ecx, %esp - - /* update common_tss.tss_esp0 pointer */ - movl %ecx, _common_tss + TSS_ESP0 - - movl _cpuid, %esi - btrl %esi, _private_tss - jae 1f - - movl $gd_common_tssd, %edi - addl %fs:0, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). 
Locking should - * be left to cpu_switch(). - * - * NOTE: spl*() may only be called while we hold the MP lock (which - * we do). - */ - call _spl0 - - cli - - /* - * _REALLY_ free the lock, no matter how deep the prior nesting. - * We will recover the nesting on the way out when we have a new - * proc to load. - * - * XXX: we had damn well better be sure we had it before doing this! - */ - movl $FREE_LOCK, %eax - movl %eax, _mp_lock - - /* do NOT have lock, intrs disabled */ - .globl idle_loop -idle_loop: - - cmpl $0,_smp_active - jne 1f - cmpl $0,_cpuid - je 1f - jmp 2f - -1: - call _procrunnable - testl %eax,%eax - jnz 3f - - /* - * Handle page-zeroing in the idle loop. Called with interrupts - * disabled and the MP lock released. Inside vm_page_zero_idle - * we enable interrupts and grab the mplock as required. - */ - cmpl $0,_do_page_zero_idle - je 2f - - call _vm_page_zero_idle /* internal locking */ - testl %eax, %eax - jnz idle_loop -2: - - /* enable intrs for a halt */ - movl $0, lapic_tpr /* 1st candidate for an INT */ - call *_hlt_vector /* wait for interrupt */ - cli - jmp idle_loop - - /* - * Note that interrupts must be enabled while obtaining the MP lock - * in order to be able to take IPI's while blocked. - */ -3: - movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ - sti - call _get_mplock - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _rel_mplock - jmp idle_loop - -#else /* !SMP */ - - movl $HIDENAME(tmpstk),%esp -#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) -#if defined(SWTCH_OPTIM_STATS) - incl _swtch_optim_stats -#endif - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: -#endif - - /* update common_tss.tss_esp0 pointer */ - movl %esp, _common_tss + TSS_ESP0 - - movl $0, %esi - btrl %esi, _private_tss - jae 1f - - movl $_common_tssd, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - */ - call _spl0 - - ALIGN_TEXT -idle_loop: - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _vm_page_zero_idle - testl %eax, %eax - jnz idle_loop - call *_hlt_vector /* wait for interrupt */ - jmp idle_loop - -#endif /* SMP */ - -CROSSJUMPTARGET(_idle) - ENTRY(default_halt) sti #ifndef SMP @@ -263,6 +80,12 @@ ENTRY(default_halt) #endif ret +/* + * cpu_throw() + */ +ENTRY(cpu_throw) + jmp sw1 + /* * cpu_switch() */ @@ -270,10 +93,11 @@ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl _curproc,%ecx + movl %ecx,_prevproc /* if no process to save, don't bother */ testl %ecx,%ecx - je sw1 + jz sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ @@ -299,7 +123,7 @@ ENTRY(cpu_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) - /* test if debug regisers should be saved */ + /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ @@ -319,15 +143,12 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: + /* save sched_lock recursion count */ + movl _sched_lock+MTX_RECURSE,%eax + movl %eax,PCB_SCHEDNEST(%edx) + #ifdef SMP - movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ -#ifdef DIAGNOSTIC - cmpl $FREE_LOCK, %eax /* is it free? */ - je badsw4 /* yes, bad medicine! */ -#endif /* DIAGNOSTIC */ - andl $COUNT_FIELD, %eax /* clear CPU portion */ - movl %eax, PCB_MPNEST(%edx) /* store it */ #endif /* SMP */ #if NNPX > 0 @@ -341,25 +162,33 @@ ENTRY(cpu_switch) 1: #endif /* NNPX > 0 */ - movl $0,_curproc /* out of process */ - - /* save is done, now choose a new process or idle */ + /* save is done, now choose a new process */ sw1: - cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid - CROSSJUMP(je, _idle, jne) /* wind down */ + je 1f + + movl _idleproc, %eax + jmp sw1b 1: #endif + /* + * Choose a new process to schedule. chooseproc() returns idleproc + * if it cannot find another process to run. + */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ - testl %eax,%eax - CROSSJUMP(je, _idle, jne) /* if no proc, idle */ + +#ifdef DIAGNOSTIC + testl %eax,%eax /* no process? */ + jz badsw3 /* no, panic */ +#endif +sw1b: movl %eax,%ecx xorl %eax,%eax @@ -456,9 +285,6 @@ sw1a: movl %ecx, _curproc /* into next process */ #ifdef SMP - movl _cpu_lockid, %eax - orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ - movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ @@ -500,7 +326,22 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti + /* + * restore sched_lock recursion count and transfer ownership to + * new process + */ + movl PCB_SCHEDNEST(%edx),%eax + movl %eax,_sched_lock+MTX_RECURSE + + movl _curproc,%eax + movl %eax,_sched_lock+MTX_LOCK + +#ifdef DIAGNOSTIC + pushfl + popl %ecx + testl $0x200, %ecx /* interrupts enabled? 
*/ + jnz badsw6 /* that way madness lies */ +#endif ret CROSSJUMPTARGET(sw1a) @@ -517,15 +358,27 @@ badsw2: call _panic sw0_2: .asciz "cpu_switch: not SRUN" -#endif -#if defined(SMP) && defined(DIAGNOSTIC) -badsw4: - pushl $sw0_4 +badsw3: + pushl $sw0_3 call _panic -sw0_4: .asciz "cpu_switch: do not have lock" -#endif /* SMP && DIAGNOSTIC */ +sw0_3: .asciz "cpu_switch: chooseproc returned NULL" + +#endif + +#ifdef DIAGNOSTIC +badsw5: + pushl $sw0_5 + call _panic + +sw0_5: .asciz "cpu_switch: interrupts enabled (again)" +badsw6: + pushl $sw0_6 + call _panic + +sw0_6: .asciz "cpu_switch: interrupts enabled" +#endif /* * savectx(pcb) diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index acb8b40f2810..9e77114a1385 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #ifdef SMP @@ -175,20 +176,12 @@ IDTVEC(fpu) mov %ax,%fs FAKE_MCOUNT(13*4(%esp)) -#ifdef SMP MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%eax - pushl %eax /* save original cpl */ pushl $0 /* dummy unit to finish intr frame */ -#else /* SMP */ - movl _cpl,%eax - pushl %eax - pushl $0 /* dummy unit to finish intr frame */ - incl _cnt+V_TRAP -#endif /* SMP */ + call __mtx_enter_giant_def call _npx_intr + call __mtx_exit_giant_def incb _intr_nesting_level MEXITCOUNT @@ -205,9 +198,6 @@ IDTVEC(align) * gate (TGT), else disabled if this was an interrupt gate (IGT). * Note that int0x80_syscall is a trap gate. Only page faults * use an interrupt gate. - * - * Note that all calls to MP_LOCK must occur with interrupts enabled - * in order to be able to take IPI's while waiting for the lock. */ SUPERALIGN_TEXT @@ -227,16 +217,12 @@ alltraps_with_regs_pushed: FAKE_MCOUNT(13*4(%esp)) calltrap: FAKE_MCOUNT(_btrap) /* init "from" _btrap -> calltrap */ - MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%ebx /* keep orig. cpl here during trap() */ call _trap /* * Return via _doreti to handle ASTs. Have to change trap frame * to interrupt frame. */ - pushl %ebx /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ incb _intr_nesting_level MEXITCOUNT @@ -274,16 +260,11 @@ IDTVEC(syscall) movl %eax,TF_EFLAGS(%esp) movl $7,TF_ERR(%esp) /* sizeof "lcall 7,0" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti @@ -312,21 +293,18 @@ IDTVEC(int0x80_syscall) mov %ax,%fs movl $2,TF_ERR(%esp) /* sizeof "int 0x80" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti ENTRY(fork_trampoline) + MTX_EXIT(_sched_lock, %ecx) + sti call _spl0 #ifdef SMP @@ -355,7 +333,6 @@ ENTRY(fork_trampoline) /* * Return via _doreti to handle ASTs. 
*/ - pushl $0 /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ movb $1,_intr_nesting_level MEXITCOUNT diff --git a/sys/amd64/amd64/exception.s b/sys/amd64/amd64/exception.s index acb8b40f2810..9e77114a1385 100644 --- a/sys/amd64/amd64/exception.s +++ b/sys/amd64/amd64/exception.s @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #ifdef SMP @@ -175,20 +176,12 @@ IDTVEC(fpu) mov %ax,%fs FAKE_MCOUNT(13*4(%esp)) -#ifdef SMP MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%eax - pushl %eax /* save original cpl */ pushl $0 /* dummy unit to finish intr frame */ -#else /* SMP */ - movl _cpl,%eax - pushl %eax - pushl $0 /* dummy unit to finish intr frame */ - incl _cnt+V_TRAP -#endif /* SMP */ + call __mtx_enter_giant_def call _npx_intr + call __mtx_exit_giant_def incb _intr_nesting_level MEXITCOUNT @@ -205,9 +198,6 @@ IDTVEC(align) * gate (TGT), else disabled if this was an interrupt gate (IGT). * Note that int0x80_syscall is a trap gate. Only page faults * use an interrupt gate. - * - * Note that all calls to MP_LOCK must occur with interrupts enabled - * in order to be able to take IPI's while waiting for the lock. */ SUPERALIGN_TEXT @@ -227,16 +217,12 @@ alltraps_with_regs_pushed: FAKE_MCOUNT(13*4(%esp)) calltrap: FAKE_MCOUNT(_btrap) /* init "from" _btrap -> calltrap */ - MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%ebx /* keep orig. cpl here during trap() */ call _trap /* * Return via _doreti to handle ASTs. Have to change trap frame * to interrupt frame. */ - pushl %ebx /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ incb _intr_nesting_level MEXITCOUNT @@ -274,16 +260,11 @@ IDTVEC(syscall) movl %eax,TF_EFLAGS(%esp) movl $7,TF_ERR(%esp) /* sizeof "lcall 7,0" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti @@ -312,21 +293,18 @@ IDTVEC(int0x80_syscall) mov %ax,%fs movl $2,TF_ERR(%esp) /* sizeof "int 0x80" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti ENTRY(fork_trampoline) + MTX_EXIT(_sched_lock, %ecx) + sti call _spl0 #ifdef SMP @@ -355,7 +333,6 @@ ENTRY(fork_trampoline) /* * Return via _doreti to handle ASTs. */ - pushl $0 /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ movb $1,_intr_nesting_level MEXITCOUNT diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index 637853e25264..8610e35f1f11 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -245,6 +245,12 @@ npx_probe(dev) setidt(16, probetrap, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(npx_intrno, probeintr, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); npx_idt_probeintr = idt[npx_intrno]; + + /* + * XXX This looks highly bogus, but it appears that npc_probe1 + * needs interrupts enabled. Does this make any difference + * here? 
+ */ enable_intr(); result = npx_probe1(dev); disable_intr(); @@ -797,7 +803,7 @@ npxdna() /* * Record new context early in case frstor causes an IRQ13. */ - npxproc = curproc; + PCPU_SET(npxproc, CURPROC); curpcb->pcb_savefpu.sv_ex_sw = 0; /* * The following frstor may cause an IRQ13 when the state being @@ -834,16 +840,18 @@ npxsave(addr) fnsave(addr); /* fnop(); */ start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); #else /* SMP */ + int intrstate; u_char icu1_mask; u_char icu2_mask; u_char old_icu1_mask; u_char old_icu2_mask; struct gate_descriptor save_idt_npxintr; + intrstate = save_intr(); disable_intr(); old_icu1_mask = inb(IO_ICU1 + 1); old_icu2_mask = inb(IO_ICU2 + 1); @@ -851,12 +859,12 @@ npxsave(addr) outb(IO_ICU1 + 1, old_icu1_mask & ~(IRQ_SLAVE | npx0_imask)); outb(IO_ICU2 + 1, old_icu2_mask & ~(npx0_imask >> 8)); idt[npx_intrno] = npx_idt_probeintr; - enable_intr(); + write_eflags(intrstate); stop_emulating(); fnsave(addr); fnop(); start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); disable_intr(); icu1_mask = inb(IO_ICU1 + 1); /* masks may have changed */ icu2_mask = inb(IO_ICU2 + 1); @@ -866,7 +874,7 @@ npxsave(addr) (icu2_mask & ~(npx0_imask >> 8)) | (old_icu2_mask & (npx0_imask >> 8))); idt[npx_intrno] = save_idt_npxintr; - enable_intr(); /* back to usual state */ + restore_intr(intrstate); /* back to previous state */ #endif /* SMP */ } diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index 60accd19ba8e..78c607591875 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -51,6 +51,10 @@ #include #include #include +/* XXX */ +#ifdef KTR_PERCPU +#include +#endif #include #include #include @@ -73,6 +77,7 @@ #include #include #include +#include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -127,9 +132,7 @@ ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); -#ifdef SMP -ASSYM(PCB_MPNEST, offsetof(struct pcb, pcb_mpnest)); -#endif +ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); @@ -170,7 +173,9 @@ ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); +ASSYM(GD_PREVPROC, offsetof(struct globaldata, gd_prevproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); +ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); @@ -178,11 +183,21 @@ ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); ASSYM(GD_ASTPENDING, offsetof(struct globaldata, gd_astpending)); +ASSYM(GD_INTR_NESTING_LEVEL, offsetof(struct globaldata, gd_intr_nesting_level)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif +ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); + +/* XXX */ +#ifdef KTR_PERCPU +ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); +ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); 
+ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); +#endif + #ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); ASSYM(GD_CPU_LOCKID, offsetof(struct globaldata, gd_cpu_lockid)); @@ -211,3 +226,9 @@ ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSE, offsetof(struct mtx, mtx_recurse)); +ASSYM(MTX_SAVEFL, offsetof(struct mtx, mtx_savefl)); + +ASSYM(MTX_UNOWNED, MTX_UNOWNED); diff --git a/sys/amd64/amd64/identcpu.c b/sys/amd64/amd64/identcpu.c index 0e11e2b8eadf..71ecd63de85a 100644 --- a/sys/amd64/amd64/identcpu.c +++ b/sys/amd64/amd64/identcpu.c @@ -42,6 +42,7 @@ #include "opt_cpu.h" #include +#include #include #include #include @@ -53,6 +54,8 @@ #include #include +#include +#include #include #define IDENTBLUE_CYRIX486 0 diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index be86c65cb279..b9395bfc7f85 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -607,12 +607,14 @@ void enable_K5_wt_alloc(void) { u_int64_t msr; + int intrstate; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. */ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { + intrstate = save_intr(); disable_intr(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & !(0x10)); @@ -645,7 +647,7 @@ enable_K5_wt_alloc(void) msr=rdmsr(0x83); wrmsr(0x83, msr|0x10); /* enable write allocate */ - enable_intr(); + restore_intr(intrstate); } } @@ -708,7 +710,6 @@ enable_K6_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } void @@ -770,7 +771,6 @@ enable_K6_2_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } #endif /* I585_CPU && CPU_WT_ALLOC */ diff --git a/sys/amd64/amd64/legacy.c b/sys/amd64/amd64/legacy.c index 8a3077058718..5b6cdbc85618 100644 --- a/sys/amd64/amd64/legacy.c +++ b/sys/amd64/amd64/legacy.c @@ -68,7 +68,10 @@ #else #include #endif +#include +#include #include +#include static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. 
*/ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? */ diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index bddd7d5be868..fa95fb0d6b53 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -862,9 +862,6 @@ map_read_write: movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ movl $1, %ecx /* one private pt coming right up */ fillkpt(R(SMPptpa), $PG_RW) - -/* Initialize mp lock to allow early traps */ - movl $1, R(_mp_lock) #endif /* SMP */ /* install a pde for temporary double map of bottom of VA */ diff --git a/sys/amd64/amd64/locore.s b/sys/amd64/amd64/locore.s index bddd7d5be868..fa95fb0d6b53 100644 --- a/sys/amd64/amd64/locore.s +++ b/sys/amd64/amd64/locore.s @@ -862,9 +862,6 @@ map_read_write: movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ movl $1, %ecx /* one private pt coming right up */ fillkpt(R(SMPptpa), $PG_RW) - -/* Initialize mp lock to allow early traps */ - movl $1, R(_mp_lock) #endif /* SMP */ /* install a pde for temporary double map of bottom of VA */ diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 6edecf04db54..875c9d5a7a8a 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -98,10 +99,12 @@ #include #include #include +#include #include /* pcb.h included via sys/user.h */ +#include +#include #ifdef SMP #include -#include #endif #ifdef PERFMON #include @@ -110,6 +113,7 @@ #ifdef OLD_BUS_ARCH #include #endif +#include #include #include #include @@ -247,6 +251,11 @@ vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; static struct trapframe proc0_tf; +struct cpuhead cpuhead; + +mtx_t sched_lock; +mtx_t Giant; + #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void @@ -431,6 +440,11 @@ cpu_startup(dummy) bufinit(); vm_pager_bufferinit(); + SLIST_INIT(&cpuhead); + SLIST_INSERT_HEAD(&cpuhead, GLOBALDATA, gd_allcpu); + + mtx_init(&sched_lock, "sched lock", MTX_SPIN); + #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! @@ -1817,11 +1831,6 @@ init386(first) #endif int off; - /* - * Prevent lowering of the ipl if we call tsleep() early. - */ - safepri = cpl; - proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; @@ -1871,6 +1880,10 @@ init386(first) r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); + /* setup curproc so that mutexes work */ + PCPU_SET(curproc, &proc0); + PCPU_SET(prevproc, &proc0); + /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we @@ -1953,7 +1966,7 @@ init386(first) /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; - common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; + common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; @@ -1974,6 +1987,12 @@ init386(first) dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + /* + * We grab Giant during the vm86bios routines, so we need to ensure + * that it is up and running before we use vm86. + */ + mtx_init(&Giant, "Giant", MTX_DEF); + vm86_initialize(); getmemsize(first); @@ -2009,9 +2028,7 @@ init386(first) /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; -#ifdef SMP - proc0.p_addr->u_pcb.pcb_mpnest = 1; -#endif + proc0.p_addr->u_pcb.pcb_schednest = 0; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S index d3602d29a2f4..9ede02c24342 100644 --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -114,43 +114,9 @@ mp_begin: /* now running relocated at KERNBASE */ CHECKPOINT(0x39, 6) - /* wait till we can get into the kernel */ - call _boot_get_mplock - - /* Now, let's prepare for some REAL WORK :-) */ + /* Now, let's prepare for some REAL WORK :-) This doesn't return. */ call _ap_init - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,-4(%esp) -2: - cmpl $0, CNAME(smp_started) /* Wait for last AP to be ready */ - jz 2b - call _get_mplock - - /* let her rip! (loads new stack) */ - jmp _cpu_switch - -NON_GPROF_ENTRY(wait_ap) - pushl %ebp - movl %esp, %ebp - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,0(%esp) - movl %eax, 8(%ebp) -1: - cmpl $0, CNAME(smp_started) - jnz 2f - decl %eax - cmpl $0, %eax - jge 1b -2: - call _get_mplock - movl %ebp, %esp - popl %ebp - ret - - /* * This is the embedded trampoline or bootstrap that is * copied into 'real-mode' low memory, it is where the diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/amd64/amd64/nexus.c b/sys/amd64/amd64/nexus.c index 8a3077058718..5b6cdbc85618 100644 --- a/sys/amd64/amd64/nexus.c +++ b/sys/amd64/amd64/nexus.c @@ -68,7 +68,10 @@ #else #include #endif +#include +#include #include +#include static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. */ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? */ diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index edae2929fb87..7ce9120d243f 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -668,7 +668,7 @@ pmap_pte_quick(pmap, va) * (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V; cpu_invlpg(prv_PADDR1); } - return prv_PADDR1 + ((unsigned) index & (NPTEPG - 1)); + return (unsigned *)(prv_PADDR1 + (index & (NPTEPG - 1))); #else if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; diff --git a/sys/amd64/amd64/swtch.s b/sys/amd64/amd64/swtch.s index c895fefa8c15..db56a1b40af6 100644 --- a/sys/amd64/amd64/swtch.s +++ b/sys/amd64/amd64/swtch.s @@ -73,189 +73,6 @@ _tlb_flush_count: .long 0 .text -/* - * When no processes are on the runq, cpu_switch() branches to _idle - * to wait for something to come ready. 
- */ - ALIGN_TEXT - .type _idle,@function -_idle: - xorl %ebp,%ebp - movl %ebp,_switchtime - -#ifdef SMP - - /* when called, we have the mplock, intr disabled */ - /* use our idleproc's "context" */ - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: - /* Keep space for nonexisting return addr, or profiling bombs */ - movl $gd_idlestack_top-4, %ecx - addl %fs:0, %ecx - movl %ecx, %esp - - /* update common_tss.tss_esp0 pointer */ - movl %ecx, _common_tss + TSS_ESP0 - - movl _cpuid, %esi - btrl %esi, _private_tss - jae 1f - - movl $gd_common_tssd, %edi - addl %fs:0, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - * - * NOTE: spl*() may only be called while we hold the MP lock (which - * we do). - */ - call _spl0 - - cli - - /* - * _REALLY_ free the lock, no matter how deep the prior nesting. - * We will recover the nesting on the way out when we have a new - * proc to load. - * - * XXX: we had damn well better be sure we had it before doing this! - */ - movl $FREE_LOCK, %eax - movl %eax, _mp_lock - - /* do NOT have lock, intrs disabled */ - .globl idle_loop -idle_loop: - - cmpl $0,_smp_active - jne 1f - cmpl $0,_cpuid - je 1f - jmp 2f - -1: - call _procrunnable - testl %eax,%eax - jnz 3f - - /* - * Handle page-zeroing in the idle loop. Called with interrupts - * disabled and the MP lock released. Inside vm_page_zero_idle - * we enable interrupts and grab the mplock as required. - */ - cmpl $0,_do_page_zero_idle - je 2f - - call _vm_page_zero_idle /* internal locking */ - testl %eax, %eax - jnz idle_loop -2: - - /* enable intrs for a halt */ - movl $0, lapic_tpr /* 1st candidate for an INT */ - call *_hlt_vector /* wait for interrupt */ - cli - jmp idle_loop - - /* - * Note that interrupts must be enabled while obtaining the MP lock - * in order to be able to take IPI's while blocked. - */ -3: - movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ - sti - call _get_mplock - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _rel_mplock - jmp idle_loop - -#else /* !SMP */ - - movl $HIDENAME(tmpstk),%esp -#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) -#if defined(SWTCH_OPTIM_STATS) - incl _swtch_optim_stats -#endif - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: -#endif - - /* update common_tss.tss_esp0 pointer */ - movl %esp, _common_tss + TSS_ESP0 - - movl $0, %esi - btrl %esi, _private_tss - jae 1f - - movl $_common_tssd, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). 
- */ - call _spl0 - - ALIGN_TEXT -idle_loop: - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _vm_page_zero_idle - testl %eax, %eax - jnz idle_loop - call *_hlt_vector /* wait for interrupt */ - jmp idle_loop - -#endif /* SMP */ - -CROSSJUMPTARGET(_idle) - ENTRY(default_halt) sti #ifndef SMP @@ -263,6 +80,12 @@ ENTRY(default_halt) #endif ret +/* + * cpu_throw() + */ +ENTRY(cpu_throw) + jmp sw1 + /* * cpu_switch() */ @@ -270,10 +93,11 @@ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx + movl %ecx,_prevproc /* if no process to save, don't bother */ testl %ecx,%ecx - je sw1 + jz sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ @@ -299,7 +123,7 @@ ENTRY(cpu_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) - /* test if debug regisers should be saved */ + /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ @@ -319,15 +143,12 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: + /* save sched_lock recursion count */ + movl _sched_lock+MTX_RECURSE,%eax + movl %eax,PCB_SCHEDNEST(%edx) + #ifdef SMP - movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ -#ifdef DIAGNOSTIC - cmpl $FREE_LOCK, %eax /* is it free? */ - je badsw4 /* yes, bad medicine! */ -#endif /* DIAGNOSTIC */ - andl $COUNT_FIELD, %eax /* clear CPU portion */ - movl %eax, PCB_MPNEST(%edx) /* store it */ #endif /* SMP */ #if NNPX > 0 @@ -341,25 +162,33 @@ ENTRY(cpu_switch) 1: #endif /* NNPX > 0 */ - movl $0,_curproc /* out of process */ - - /* save is done, now choose a new process or idle */ + /* save is done, now choose a new process */ sw1: - cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid - CROSSJUMP(je, _idle, jne) /* wind down */ + je 1f + + movl _idleproc, %eax + jmp sw1b 1: #endif + /* + * Choose a new process to schedule. chooseproc() returns idleproc + * if it cannot find another process to run. + */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ - testl %eax,%eax - CROSSJUMP(je, _idle, jne) /* if no proc, idle */ + +#ifdef DIAGNOSTIC + testl %eax,%eax /* no process? */ + jz badsw3 /* no, panic */ +#endif +sw1b: movl %eax,%ecx xorl %eax,%eax @@ -456,9 +285,6 @@ sw1a: movl %ecx, _curproc /* into next process */ #ifdef SMP - movl _cpu_lockid, %eax - orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ - movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ @@ -500,7 +326,22 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti + /* + * restore sched_lock recursion count and transfer ownership to + * new process + */ + movl PCB_SCHEDNEST(%edx),%eax + movl %eax,_sched_lock+MTX_RECURSE + + movl _curproc,%eax + movl %eax,_sched_lock+MTX_LOCK + +#ifdef DIAGNOSTIC + pushfl + popl %ecx + testl $0x200, %ecx /* interrupts enabled? 
*/ + jnz badsw6 /* that way madness lies */ +#endif ret CROSSJUMPTARGET(sw1a) @@ -517,15 +358,27 @@ badsw2: call _panic sw0_2: .asciz "cpu_switch: not SRUN" -#endif -#if defined(SMP) && defined(DIAGNOSTIC) -badsw4: - pushl $sw0_4 +badsw3: + pushl $sw0_3 call _panic -sw0_4: .asciz "cpu_switch: do not have lock" -#endif /* SMP && DIAGNOSTIC */ +sw0_3: .asciz "cpu_switch: chooseproc returned NULL" + +#endif + +#ifdef DIAGNOSTIC +badsw5: + pushl $sw0_5 + call _panic + +sw0_5: .asciz "cpu_switch: interrupts enabled (again)" +badsw6: + pushl $sw0_6 + call _panic + +sw0_6: .asciz "cpu_switch: interrupts enabled" +#endif /* * savectx(pcb) diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 51de1ac9e650..f32dfaeeddc0 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -49,10 +49,12 @@ #include "opt_trap.h" #include +#include #include #include #include #include +#include #include #include #include @@ -76,12 +78,14 @@ #include #include #include +#include #include #ifdef SMP #include #endif #include +#include #include #ifdef POWERFAIL_NMI @@ -96,11 +100,14 @@ #include "isa.h" #include "npx.h" +#include + int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall2 __P((struct trapframe frame)); +extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); @@ -142,7 +149,7 @@ static char *trap_msg[] = { }; static __inline int userret __P((struct proc *p, struct trapframe *frame, - u_quad_t oticks, int have_mplock)); + u_quad_t oticks, int have_giant)); #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; @@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); static __inline int -userret(p, frame, oticks, have_mplock) +userret(p, frame, oticks, have_giant) struct proc *p; struct trapframe *frame; u_quad_t oticks; - int have_mplock; + int have_giant; { int sig, s; while ((sig = CURSIG(p)) != 0) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } postsig(sig); } @@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock) * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; - } s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); - while ((sig = CURSIG(p)) != 0) + while ((sig = CURSIG(p)) != 0) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } postsig(sig); + } } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } addupc_task(p, frame->tf_eip, (u_int)(p->p_sticks - oticks) * psratio); } curpriority = p->p_priority; - return(have_mplock); + return(have_giant); } /* @@ -226,13 +236,20 @@ trap(frame) u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif - if (!(frame.tf_eflags & PSL_I)) { + atomic_add_int(&cnt.v_trap, 1); + + if ((frame.tf_eflags & PSL_I) == 0) { /* - * Buggy application or kernel code has disabled interrupts - * and then trapped. 
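
userret() now threads a have_giant flag through its callers instead of have_mplock, taking Giant lazily the first time signal delivery, profiling, or a forced reschedule actually needs it. The recurring three-line pattern reads as the following helper (the name is illustrative, not part of the patch):

static int
giant_if_needed(int have_giant)
{
	if (have_giant == 0) {
		mtx_enter(&Giant, MTX_DEF);	/* first consumer pays */
		have_giant = 1;
	}
	return (have_giant);
}

A call site then does have_giant = giant_if_needed(have_giant); before the unsafe work, and a single mtx_exit(&Giant, MTX_DEF) at the end releases the lock only if it was ever taken.
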
Enabling interrupts now is wrong, but - * it is better than running with interrupts disabled until - * they are accidentally enabled later. + * Buggy application or kernel code has disabled + * interrupts and then trapped. Enabling interrupts + * now is wrong, but it is better than running with + * interrupts disabled until they are accidentally + * enabled later. XXX Consider whether is this still + * correct. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) @@ -252,54 +269,27 @@ trap(frame) eva = 0; if (frame.tf_trapno == T_PAGEFLT) { /* - * For some Cyrix CPUs, %cr2 is clobbered by interrupts. - * This problem is worked around by using an interrupt - * gate for the pagefault handler. We are finally ready - * to read %cr2 and then must reenable interrupts. - * - * XXX this should be in the switch statement, but the - * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the - * flow of control too much for this to be obviously - * correct. + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. */ eva = rcr2(); enable_intr(); - } + } + + mtx_enter(&Giant, MTX_DEF); #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif + type = frame.tf_trapno; code = frame.tf_err; - if (in_vm86call) { - if (frame.tf_eflags & PSL_VM && - (type == T_PROTFLT || type == T_STKFLT)) { - i = vm86_emulate((struct vm86frame *)&frame); - if (i != 0) - /* - * returns to original process - */ - vm86_trap((struct vm86frame *)&frame); - return; - } - switch (type) { - /* - * these traps want either a process context, or - * assume a normal userspace trap. - */ - case T_PROTFLT: - case T_SEGNPFLT: - trap_fatal(&frame, eva); - return; - case T_TRCTRAP: - type = T_BPTFLT; /* kernel breakpoint */ - /* FALL THROUGH */ - } - goto kernel_trap; /* normal kernel trap handling */ - } - - if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + if ((ISPL(frame.tf_cs) == SEL_UPL) || + ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ sticks = p->p_sticks; @@ -322,16 +312,6 @@ trap(frame) i = SIGFPE; break; - case T_ASTFLT: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if (p->p_flag & P_OWEUPC) { - p->p_flag &= ~P_OWEUPC; - addupc_task(p, p->p_stats->p_prof.pr_addr, - p->p_stats->p_prof.pr_ticks); - } - goto out; - /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle @@ -342,7 +322,7 @@ trap(frame) if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) - goto out; + goto user; break; } /* FALL THROUGH */ @@ -357,14 +337,20 @@ trap(frame) case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); - if (i == -1) - return; #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if (i == -2) + if (i == -2) { + /* + * f00f hack workaround has triggered, treat + * as illegal instruction not page fault. 
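
With Giant taken near the top of trap(), the handler's many early return statements become goto out (or goto user when end-of-trap user work is still pending) so the mutex is released exactly once on every path. The shape, reduced to a compilable sketch with an illustrative flag standing in for the real case analysis:

static void
giant_bracket(int user_mode_fault)
{
	mtx_enter(&Giant, MTX_DEF);

	if (!user_mode_fault)
		goto out;		/* kernel fault handled; was "return" */

	/* "user:" work: signal delivery, rescheduling, profiling */
out:
	mtx_exit(&Giant, MTX_DEF);	/* the single release point */
}
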
+ */ + frame.tf_trapno = T_PRIVINFLT; goto restart; + } #endif - if (i == 0) + if (i == -1) goto out; + if (i == 0) + goto user; ucode = T_PAGEFLT; break; @@ -377,7 +363,15 @@ trap(frame) #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI - goto handle_powerfail; +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -391,7 +385,7 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; @@ -410,9 +404,9 @@ trap(frame) case T_DNA: #if NNPX > 0 - /* if a transparent fault (due to context switch "late") */ + /* transparent fault (due to context switch "late") */ if (npxdna()) - return; + goto out; #endif if (!pmath_emulate) { i = SIGFPE; @@ -422,7 +416,7 @@ trap(frame) i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) - return; + goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } @@ -435,13 +429,12 @@ trap(frame) break; } } else { -kernel_trap: /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); - return; + goto out; case T_DNA: #if NNPX > 0 @@ -451,31 +444,35 @@ trap(frame) * registered such use. */ if (npxdna()) - return; + goto out; #endif break; - case T_PROTFLT: /* general protection fault */ - case T_SEGNPFLT: /* segment not present fault */ /* - * Invalid segment selectors and out of bounds - * %eip's and %esp's can be set up in user mode. - * This causes a fault in kernel mode when the - * kernel tries to return to user mode. We want - * to get this fault so that we can fix the - * problem here and not have to check all the - * selectors and pointers when the user changes - * them. + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. */ -#define MAYBE_DORETI_FAULT(where, whereto) \ - do { \ - if (frame.tf_eip == (int)where) { \ - frame.tf_eip = (int)whereto; \ - return; \ - } \ - } while (0) + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (in_vm86call) + break; + + if (intr_nesting_level != 0) + break; - if (intr_nesting_level == 0) { /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the @@ -488,20 +485,38 @@ trap(frame) if (frame.tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; psignal(p, SIGBUS); - return; + goto out; + } + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; } - MAYBE_DORETI_FAULT(doreti_iret, - doreti_iret_fault); - MAYBE_DORETI_FAULT(doreti_popl_ds, - doreti_popl_ds_fault); - MAYBE_DORETI_FAULT(doreti_popl_es, - doreti_popl_es_fault); - MAYBE_DORETI_FAULT(doreti_popl_fs, - doreti_popl_fs_fault); + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; - return; - } + goto out; } break; @@ -517,7 +532,7 @@ trap(frame) */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; - return; + goto out; } break; @@ -529,7 +544,7 @@ trap(frame) * silently until the syscall handler has * saved the flags. */ - return; + goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* @@ -537,7 +552,7 @@ trap(frame) * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; - return; + goto out; } /* * Ignore debug register trace traps due to @@ -549,13 +564,13 @@ trap(frame) * in kernel space because that is useful when * debugging the kernel. */ - if (user_dbreg_trap()) { + if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); - return; + goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) @@ -567,28 +582,19 @@ trap(frame) */ #ifdef DDB if (kdb_trap (type, 0, &frame)) - return; + goto out; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI -#ifndef TIMER_FREQ -# define TIMER_FREQ 1193182 -#endif - handle_powerfail: - { - static unsigned lastalert = 0; - - if(time_second - lastalert > 10) - { + if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; - } - return; } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -602,16 +608,16 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi == 0) - return; + goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame, eva); - return; + goto out; } /* Translate fault for emulators (e.g. Linux) */ @@ -630,8 +636,10 @@ trap(frame) } #endif -out: +user: userret(p, &frame, sticks, 1); +out: + mtx_exit(&Giant, MTX_DEF); } #ifdef notyet @@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva) * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { - frame->tf_trapno = T_PRIVINFLT; + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; - } #endif if (usermode) goto nogo; @@ -869,8 +875,7 @@ trap_fatal(frame, eva) frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? 
"user" : "kernel"); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -917,26 +922,6 @@ trap_fatal(frame, eva) } else { printf("Idle\n"); } - printf("interrupt mask = "); - if ((cpl & net_imask) == net_imask) - printf("net "); - if ((cpl & tty_imask) == tty_imask) - printf("tty "); - if ((cpl & bio_imask) == bio_imask) - printf("bio "); - if ((cpl & cam_imask) == cam_imask) - printf("cam "); - if (cpl == 0) - printf("none"); -#ifdef SMP -/** - * XXX FIXME: - * we probably SHOULD have stopped the other CPUs before now! - * another CPU COULD have been touching cpl at this moment... - */ - printf(" <- SMP: XXX"); -#endif - printf("\n"); #ifdef KDB if (kdb_trap(&psl)) @@ -973,8 +958,7 @@ dblfault_handler() printf("esp = 0x%x\n", common_tss.tss_esp); printf("ebp = 0x%x\n", common_tss.tss_ebp); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -1048,12 +1032,14 @@ syscall2(frame) int error; int narg; int args[8]; - int have_mplock = 0; + int have_giant = 0; u_int code; + atomic_add_int(&cnt.v_syscall, 1); + #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { - get_mplock(); + mtx_enter(&Giant, MTX_DEF); panic("syscall"); /* NOT REACHED */ } @@ -1075,9 +1061,9 @@ syscall2(frame) /* * The prep code is not MP aware. */ - get_mplock(); + mtx_enter(&Giant, MTX_DEF); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. @@ -1114,8 +1100,8 @@ syscall2(frame) */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); @@ -1129,15 +1115,15 @@ syscall2(frame) * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsyscall(p->p_tracep, code, narg, args); } @@ -1192,9 +1178,9 @@ syscall2(frame) * Traced syscall. trapsignal() is not MP aware. 
*/ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); @@ -1203,13 +1189,13 @@ syscall2(frame) /* * Handle reschedule and other end-of-syscall issues */ - have_mplock = userret(p, &frame, sticks, have_mplock); + have_giant = userret(p, &frame, sticks, have_giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } @@ -1225,27 +1211,66 @@ syscall2(frame) /* * Release the MP lock if we had to get it */ - if (have_mplock) - rel_mplock(); + if (have_giant) + mtx_exit(&Giant, MTX_DEF); + + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + +void +ast(frame) + struct trapframe frame; +{ + struct proc *p = CURPROC; + u_quad_t sticks; + + /* + * handle atomicy by looping since interrupts are enabled and the + * MP lock is not held. + */ + sticks = ((volatile struct proc *)p)->p_sticks; + while (sticks != ((volatile struct proc *)p)->p_sticks) + sticks = ((volatile struct proc *)p)->p_sticks; + + astoff(); + atomic_add_int(&cnt.v_soft, 1); + if (p->p_flag & P_OWEUPC) { + mtx_enter(&Giant, MTX_DEF); + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); +} + if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0) + mtx_exit(&Giant, MTX_DEF); } /* * Simplified back end of syscall(), used when returning from fork() - * directly into user mode. MP lock is held on entry and should be - * held on return. + * directly into user mode. Giant is not held on entry, and must not + * be held on return. 
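
The loop at the top of ast() copes with p_sticks being a u_quad_t, which a 32-bit processor cannot read atomically while interrupts and other CPUs are live: it simply re-reads until two consecutive reads agree. The same lock-free idiom, pulled out as a sketch:

static u_quad_t
read_sticks(struct proc *p)
{
	volatile struct proc *vp = (volatile struct proc *)p;
	u_quad_t val;

	do {
		val = vp->p_sticks;		/* two 32-bit loads */
	} while (val != vp->p_sticks);		/* torn read? retry */
	return (val);
}
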
*/ void fork_return(p, frame) struct proc *p; struct trapframe frame; { + int have_giant; + frame.tf_eax = 0; /* Child returns zero */ frame.tf_eflags &= ~PSL_C; /* success */ frame.tf_edx = 1; - userret(p, &frame, 0, 1); + have_giant = userret(p, &frame, 0, mtx_owned(&Giant)); #ifdef KTRACE - if (KTRPOINT(p, KTR_SYSRET)) + if (KTRPOINT(p, KTR_SYSRET)) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } ktrsysret(p->p_tracep, SYS_fork, 0, 0); + } #endif + if (have_giant) + mtx_exit(&Giant, MTX_DEF); } diff --git a/sys/amd64/amd64/tsc.c b/sys/amd64/amd64/tsc.c index 15044abbaa3b..724f3c2817ba 100644 --- a/sys/amd64/amd64/tsc.c +++ b/sys/amd64/amd64/tsc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -93,10 +94,6 @@ #include #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ /* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. 
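
Every place in this file that used to bracket 8254 access with a bare disable_intr()/enable_intr() pair now saves and restores the caller's interrupt state around a CLOCK_LOCK()/CLOCK_UNLOCK() section, so the clock code never re-enables interrupts behind a caller that had them off. The idiom in isolation (assuming the save_intr()/restore_intr() inlines added to cpufunc.h by this change):

static void
program_timer0_sketch(u_int count)
{
	int intrsave;

	intrsave = save_intr();			/* remember EFLAGS.IF */
	disable_intr();
	CLOCK_LOCK();				/* SMP clock spinlock */
	outb(TIMER_CNTR0, count & 0xff);	/* low byte first */
	outb(TIMER_CNTR0, count >> 8);
	CLOCK_UNLOCK();
	restore_intr(intrsave);		/* IF restored, not forced on */
}
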
*/ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. */ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ calibrate_clocks(void) static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. 
+ */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ - - writertc(RTC_STATUSB, rtc_statusb); - -#ifdef APIC_IO if (apic_8254_trial) { - + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. + */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); + + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); + printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. 
*/ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index cfb6ceef44d6..831ab3b168a6 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -57,12 +57,14 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef SMP #include #endif @@ -177,9 +179,8 @@ cpu_fork(p1, p2, flags) * pcb2->pcb_onfault: cloned above (always NULL here?). */ -#ifdef SMP - pcb2->pcb_mpnest = 1; -#endif + pcb2->pcb_schednest = 0; + /* * XXX don't copy the i/o pages. this should probably be fixed. */ @@ -256,8 +257,11 @@ cpu_exit(p) reset_dbregs(); pcb->pcb_flags &= ~PCB_DBREGS; } + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH); + mtx_assert(&Giant, MA_NOTOWNED); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -406,17 +410,10 @@ vunmapbuf(bp) static void cpu_reset_proxy() { - u_int saved_mp_lock; cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) - ; /* Wait for other cpu to disable interupts */ - saved_mp_lock = mp_lock; - mp_lock = 1; - printf("cpu_reset_proxy: Grabbed mp lock for BSP\n"); - cpu_reset_proxy_active = 3; - while (cpu_reset_proxy_active == 3) - ; /* Wait for other cpu to enable interrupts */ + ; /* Wait for other cpu to see that we've started */ stop_cpus((1<= ZIDLE_HI(cnt.v_free_count)) return(0); -#ifdef SMP - if (try_mplock()) { -#endif + if (mtx_try_enter(&Giant, MTX_DEF)) { s = splvm(); - __asm __volatile("sti" : : : "memory"); + intrsave = save_intr(); + enable_intr(); zero_state = 0; m = vm_page_list_find(PQ_FREE, free_rover, FALSE); if (m != NULL && (m->flags & PG_ZERO) == 0) { @@ -595,14 +584,10 @@ vm_page_zero_idle() } free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); - __asm __volatile("cli" : : : "memory"); -#ifdef SMP - rel_mplock(); -#endif + restore_intr(intrsave); + mtx_exit(&Giant, MTX_DEF); return (1); -#ifdef SMP } -#endif /* * We have to enable interrupts for a moment if the try_mplock fails * in order to potentially take an IPI. XXX this should be in diff --git a/sys/amd64/include/cpu.h b/sys/amd64/include/cpu.h index ffabf7f8ed54..18822b87cc5b 100644 --- a/sys/amd64/include/cpu.h +++ b/sys/amd64/include/cpu.h @@ -46,6 +46,7 @@ #include #include #include +#include /* * definitions of cpu-dependent requirements @@ -86,7 +87,9 @@ * added, we will have an atomicy problem. The type of atomicy we need is * a non-locked orl. */ -#define need_resched() do { astpending = AST_RESCHED|AST_PENDING; } while (0) +#define need_resched() do { \ + PCPU_SET(astpending, AST_RESCHED|AST_PENDING); \ +} while (0) #define resched_wanted() (astpending & AST_RESCHED) /* @@ -109,8 +112,9 @@ * it off (asynchronous need_resched() conflicts are not critical). 
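
vm_page_zero_idle() shows the non-blocking side of the new API: the idle loop must never sleep waiting for Giant, so it uses mtx_try_enter() and simply declines to do optional work when the lock is contended. The pattern in isolation (function name illustrative):

static int
do_optional_work(void)
{
	if (mtx_try_enter(&Giant, MTX_DEF) == 0)
		return (0);		/* contended: come back later */

	/* ... zero a free page, update counters ... */

	mtx_exit(&Giant, MTX_DEF);
	return (1);			/* did something useful */
}
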
*/ #define signotify(p) aston() - -#define aston() do { astpending |= AST_PENDING; } while (0) +#define aston() do { \ + PCPU_SET(astpending, astpending | AST_PENDING); \ +} while (0) #define astoff() /* @@ -135,7 +139,9 @@ #ifdef _KERNEL extern char btext[]; extern char etext[]; +#ifndef intr_nesting_level extern u_char intr_nesting_level; +#endif void fork_trampoline __P((void)); void fork_return __P((struct proc *, struct trapframe)); diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h index 9a4052fd41d1..39868df422aa 100644 --- a/sys/amd64/include/cpufunc.h +++ b/sys/amd64/include/cpufunc.h @@ -86,20 +86,29 @@ static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); -#ifdef SMP - MPINTR_LOCK(); -#endif } static __inline void enable_intr(void) { -#ifdef SMP - MPINTR_UNLOCK(); -#endif __asm __volatile("sti"); } +static __inline u_int +save_intr(void) +{ + u_int ef; + + __asm __volatile("pushfl; popl %0" : "=r" (ef)); + return (ef); +} + +static __inline void +restore_intr(u_int ef) +{ + __asm __volatile("pushl %0; popfl" : : "r" (ef) : "memory" ); +} + #define HAVE_INLINE_FFS static __inline int diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/amd64/include/mptable.h +++ b/sys/amd64/include/mptable.h @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
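
The new save_intr()/restore_intr() inlines exist so that interrupt-disabled sections nest: an inner section restores whatever EFLAGS.IF state it found instead of unconditionally executing sti on the way out. A sketch of why that matters, using only these inlines plus disable_intr():

static void
inner(void)
{
	u_int ef = save_intr();

	disable_intr();
	/* ... touch hardware shared with interrupt handlers ... */
	restore_intr(ef);		/* does not force interrupts on */
}

static void
outer(void)
{
	u_int ef = save_intr();

	disable_intr();
	inner();			/* interrupts stay off afterwards */
	/* ... rest of the outer critical section is still protected ... */
	restore_intr(ef);
}
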
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
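
ap_init() now finishes by parking each AP on its per-CPU idleproc and entering the scheduler through cpu_throw() instead of the deleted assembly idle loop, with ap_boot_lock serializing the APs during setup until a SYSINIT releases them. Condensed to its ordering (a sketch assembled from the hunks above, not a literal copy):

static void
ap_init_tail_sketch(void)
{
	s_lock(&ap_boot_lock);		/* one AP initializes at a time */
	/* ... per-CPU TLB, clock and bookkeeping setup ... */
	s_unlock(&ap_boot_lock);	/* wake the next AP */

	while (smp_started == 0)	/* wait for the BSP's go-ahead */
		;			/* nothing */

	PCPU_SET(curproc, idleproc);	/* give mutexes a valid owner */
	PCPU_SET(prevproc, idleproc);

	enable_intr();
	mtx_enter(&sched_lock, MTX_SPIN);
	cpu_throw();			/* pick a process; never returns */
}
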
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/amd64/include/mutex.h b/sys/amd64/include/mutex.h new file mode 100644 index 000000000000..ef0c9638fc18 --- /dev/null +++ b/sys/amd64/include/mutex.h @@ -0,0 +1,786 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $ + * $FreeBSD$ + */ + +#ifndef _MACHINE_MUTEX_H_ +#define _MACHINE_MUTEX_H_ + +#ifndef LOCORE + +#include +#include +#include +#include +#include + +/* + * If kern_mutex.c is being built, compile non-inlined versions of various + * functions so that kernel modules can use them. 
+ */ +#ifndef _KERN_MUTEX_C_ +#define _MTX_INLINE static __inline +#else +#define _MTX_INLINE +#endif + +/* + * Mutex flags + * + * Types + */ +#define MTX_DEF 0x0 /* Default (spin/sleep) */ +#define MTX_SPIN 0x1 /* Spin only lock */ + +/* Options */ +#define MTX_RLIKELY 0x4 /* (opt) Recursion likely */ +#define MTX_NORECURSE 0x8 /* No recursion possible */ +#define MTX_NOSPIN 0x10 /* Don't spin before sleeping */ +#define MTX_NOSWITCH 0x20 /* Do not switch on release */ +#define MTX_FIRST 0x40 /* First spin lock holder */ +#define MTX_TOPHALF 0x80 /* Interrupts not disabled on spin */ + +/* options that should be passed on to mtx_enter_hard, mtx_exit_hard */ +#define MTX_HARDOPTS (MTX_SPIN | MTX_FIRST | MTX_TOPHALF | MTX_NOSWITCH) + +/* Flags/value used in mtx_lock */ +#define MTX_RECURSE 0x01 /* (non-spin) lock held recursively */ +#define MTX_CONTESTED 0x02 /* (non-spin) lock contested */ +#define MTX_FLAGMASK ~(MTX_RECURSE | MTX_CONTESTED) +#define MTX_UNOWNED 0x8 /* Cookie for free mutex */ + +struct proc; /* XXX */ + +/* + * Sleep/spin mutex + */ +struct mtx { + volatile u_int mtx_lock; /* lock owner/gate/flags */ + volatile u_short mtx_recurse; /* number of recursive holds */ + u_short mtx_f1; + u_int mtx_savefl; /* saved flags (for spin locks) */ + char *mtx_description; + TAILQ_HEAD(, proc) mtx_blocked; + LIST_ENTRY(mtx) mtx_contested; + struct mtx *mtx_next; /* all locks in system */ + struct mtx *mtx_prev; +#ifdef SMP_DEBUG + /* If you add anything here, adjust the mtxf_t definition below */ + struct witness *mtx_witness; + LIST_ENTRY(mtx) mtx_held; + char *mtx_file; + int mtx_line; +#endif /* SMP_DEBUG */ +}; + +typedef struct mtx mtx_t; + +/* + * Filler for structs which need to remain the same size + * whether or not SMP_DEBUG is turned on. + */ +typedef struct mtxf { +#ifdef SMP_DEBUG + char mtxf_data[0]; +#else + char mtxf_data[4*sizeof(void *) + sizeof(int)]; +#endif +} mtxf_t; + +#define mp_fixme(string) + +#ifdef _KERNEL +/* Misc */ +#define CURTHD ((u_int)CURPROC) /* Current thread ID */ + +/* Prototypes */ +void mtx_init(mtx_t *m, char *description, int flag); +void mtx_enter_hard(mtx_t *, int type, int flags); +void mtx_exit_hard(mtx_t *, int type); +void mtx_destroy(mtx_t *m); + +#if (defined(KLD_MODULE) || defined(_KERN_MUTEX_C_)) +void mtx_enter(mtx_t *mtxp, int type); +int mtx_try_enter(mtx_t *mtxp, int type); +void mtx_exit(mtx_t *mtxp, int type); +#endif + +/* Global locks */ +extern mtx_t sched_lock; +extern mtx_t Giant; + +/* + * Used to replace return with an exit Giant and return. 
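
With the structure and prototypes above in place, a subsystem protects its own data the same way Giant and sched_lock are used throughout this change: initialize the mutex once, then bracket every access with enter/exit calls of the same type. A minimal usage sketch (foo_lock and foo_count are illustrative names, not part of the patch):

#include <machine/mutex.h>

static mtx_t	foo_lock;
static int	foo_count;

static void
foo_init(void)
{
	mtx_init(&foo_lock, "foo counter", MTX_DEF);
}

static void
foo_bump(void)
{
	mtx_enter(&foo_lock, MTX_DEF);
	foo_count++;
	mtx_exit(&foo_lock, MTX_DEF);	/* same type as the enter */
}
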
+ */ + +#define EGAR(a) \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return (a); \ +} while (0) + +#define VEGAR \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return; \ +} while (0) + +#define DROP_GIANT() \ +do { \ + int _giantcnt; \ + WITNESS_SAVE_DECL(Giant); \ + \ + WITNESS_SAVE(&Giant, Giant); \ + for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++) \ + mtx_exit(&Giant, MTX_DEF) + +#define PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant); \ +} while (0) + +#define PARTIAL_PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant) + + +/* + * Debugging + */ +#ifndef SMP_DEBUG +#define mtx_assert(m, what) +#else /* SMP_DEBUG */ + +#define MA_OWNED 1 +#define MA_NOTOWNED 2 +#define mtx_assert(m, what) { \ + switch ((what)) { \ + case MA_OWNED: \ + ASS(mtx_owned((m))); \ + break; \ + case MA_NOTOWNED: \ + ASS(!mtx_owned((m))); \ + break; \ + default: \ + panic("unknown mtx_assert at %s:%d", __FILE__, __LINE__); \ + } \ +} + +#ifdef INVARIANTS +#define ASS(ex) MPASS(ex) +#define MPASS(ex) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + #ex, __FILE__, __LINE__) +#define MPASS2(ex, what) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + what, __FILE__, __LINE__) + +#ifdef MTX_STRS +char STR_IEN[] = "fl & 0x200"; +char STR_IDIS[] = "!(fl & 0x200)"; +#else /* MTX_STRS */ +extern char STR_IEN[]; +extern char STR_IDIS[]; +#endif /* MTX_STRS */ +#define ASS_IEN MPASS2(read_eflags() & 0x200, STR_IEN) +#define ASS_IDIS MPASS2((read_eflags() & 0x200) == 0, STR_IDIS) +#endif /* INVARIANTS */ + +#endif /* SMP_DEBUG */ + +#if !defined(SMP_DEBUG) || !defined(INVARIANTS) +#define ASS(ex) +#define MPASS(ex) +#define MPASS2(ex, where) +#define ASS_IEN +#define ASS_IDIS +#endif /* !defined(SMP_DEBUG) || !defined(INVARIANTS) */ + +#ifdef WITNESS +#ifndef SMP_DEBUG +#error WITNESS requires SMP_DEBUG +#endif /* SMP_DEBUG */ +#define WITNESS_ENTER(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_enter((m), (f), __FILE__, __LINE__) +#define WITNESS_EXIT(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_exit((m), (f), __FILE__, __LINE__) + +#define WITNESS_SLEEP(check, m) witness_sleep(check, (m), __FILE__, __LINE__) +#define WITNESS_SAVE_DECL(n) \ + char * __CONCAT(n, __wf); \ + int __CONCAT(n, __wl) + +#define WITNESS_SAVE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_save(m, &__CONCAT(n, __wf), &__CONCAT(n, __wl)); \ +} while (0) + +#define WITNESS_RESTORE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_restore(m, __CONCAT(n, __wf), __CONCAT(n, __wl)); \ +} while (0) + +void witness_init(mtx_t *, int flag); +void witness_destroy(mtx_t *); +void witness_enter(mtx_t *, int, char *, int); +void witness_try_enter(mtx_t *, int, char *, int); +void witness_exit(mtx_t *, int, char *, int); +void witness_display(void(*)(const char *fmt, ...)); +void witness_list(struct proc *); +int witness_sleep(int, mtx_t *, char *, int); +void witness_save(mtx_t *, char **, int *); +void witness_restore(mtx_t *, char *, int); +#else /* WITNESS */ +#define WITNESS_ENTER(m, flag) +#define WITNESS_EXIT(m, flag) +#define WITNESS_SLEEP(check, m) +#define WITNESS_SAVE_DECL(n) +#define WITNESS_SAVE(m, n) +#define WITNESS_RESTORE(m, n) + +/* + * flag++ is slezoid way of shutting up unused parameter warning + * in mtx_init() + */ +#define witness_init(m, flag) flag++ +#define witness_destroy(m) +#define witness_enter(m, flag, f, l) 
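
DROP_GIANT() opens a block (note the unbalanced do { in its definition) that PICKUP_GIANT() closes: every recursive hold on Giant is released and the same number is re-acquired afterwards, with WITNESS_SAVE/WITNESS_RESTORE keeping the lock-order checker consistent across the gap. Typical use is around a voluntary sleep, roughly as follows (the wait channel and message are illustrative):

static int
foo_wait(void *chan)
{
	int error;

	DROP_GIANT();			/* shed all recursive holds */
	error = tsleep(chan, PZERO, "foowt", hz);
	PICKUP_GIANT();			/* re-take the same depth */
	return (error);
}
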
+#define witness_try_enter(m, flag, f, l ) +#define witness_exit(m, flag, f, l) +#endif /* WITNESS */ + +/* + * Assembly macros (for internal use only) + *------------------------------------------------------------------------------ + */ + +#define _V(x) __STRING(x) + +#ifndef I386_CPU + +/* + * For 486 and newer processors. + */ + +/* Get a sleep lock, deal with recursion inline. */ +#define _getlock_sleep(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %3,%1;" /* Try */ \ +" jz 1f;" /* Got it */ \ +" andl $" _V(MTX_FLAGMASK) ",%%eax;" /* turn off spec bits */ \ +" cmpl %%eax,%3;" /* already have it? */ \ +" je 2f;" /* yes, recurse */ \ +" pushl %4;" \ +" pushl %5;" \ +" call mtx_enter_hard;" \ +" addl $8,%%esp;" \ +" jmp 1f;" \ +"2: lock; orl $" _V(MTX_RECURSE) ",%1;" \ +" incw %2;" \ +"1:" \ +"# getlock_sleep" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock), /* 1 */ \ + "+m" (mtxp->mtx_recurse) /* 2 */ \ + : "r" (tid), /* 3 (input) */ \ + "gi" (type), /* 4 */ \ + "g" (mtxp) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* Get a spin lock, handle recursion inline (as the less common case) */ +#define _getlock_spin_block(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" pushfl;" \ +" cli;" \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %3,%1;" /* Try */ \ +" jz 2f;" /* got it */ \ +" pushl %4;" \ +" pushl %5;" \ +" call mtx_enter_hard;" /* mtx_enter_hard(mtxp, type, oflags) */ \ +" addl $0xc,%%esp;" \ +" jmp 1f;" \ +"2: popl %2;" /* save flags */ \ +"1:" \ +"# getlock_spin_block" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock), /* 1 */ \ + "=m" (mtxp->mtx_savefl) /* 2 */ \ + : "r" (tid), /* 3 (input) */ \ + "gi" (type), /* 4 */ \ + "g" (mtxp) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Get a lock without any recursion handling. Calls the hard enter function if + * we can't get it inline. + */ +#define _getlock_norecurse(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %2,%1;" /* Try */ \ +" jz 1f;" /* got it */ \ +" pushl %3;" \ +" pushl %4;" \ +" call mtx_enter_hard;" /* mtx_enter_hard(mtxp, type) */ \ +" addl $8,%%esp;" \ +"1:" \ +"# getlock_norecurse" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock) /* 1 */ \ + : "r" (tid), /* 2 (input) */ \ + "gi" (type), /* 3 */ \ + "g" (mtxp) /* 4 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is handled + * in the hard function. + */ +#define _exitlock_norecurse(mtxp, tid, type) ({ \ + int _tid = (int)(tid); \ + \ + __asm __volatile ( \ +" " MPLOCKED "" \ +" cmpxchgl %4,%0;" /* try easy rel */ \ +" jz 1f;" /* released! */ \ +" pushl %2;" \ +" pushl %3;" \ +" call mtx_exit_hard;" \ +" addl $8,%%esp;" \ +"1:" \ +"# exitlock_norecurse" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+a" (_tid) /* 1 */ \ + : "gi" (type), /* 2 (input) */ \ + "g" (mtxp), /* 3 */ \ + "r" (MTX_UNOWNED) /* 4 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a sleep lock when its likely we recursed (the code to + * deal with simple recursion is inline). 
+ */ +#define _exitlock(mtxp, tid, type) ({ \ + int _tid = (int)(tid); \ + \ + __asm __volatile ( \ +" " MPLOCKED "" \ +" cmpxchgl %5,%0;" /* try easy rel */ \ +" jz 1f;" /* released! */ \ +" testl $" _V(MTX_RECURSE) ",%%eax;" /* recursed? */ \ +" jnz 3f;" /* handle recursion */ \ + /* Lock not recursed and contested: do the hard way */ \ +" pushl %3;" \ +" pushl %4;" \ +" call mtx_exit_hard;" /* mtx_exit_hard(mtxp,type) */ \ +" addl $8,%%esp;" \ +" jmp 1f;" \ + /* lock recursed, lower recursion level */ \ +"3: decw %1;" /* one less level */ \ +" jnz 1f;" /* still recursed, done */ \ +" lock; andl $~" _V(MTX_RECURSE) ",%0;" /* turn off recurse flag */ \ +"1:" \ +"# exitlock" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+m" (mtxp->mtx_recurse), /* 1 */ \ + "+a" (_tid) /* 2 */ \ + : "gi" (type), /* 3 (input) */ \ + "g" (mtxp), /* 4 */ \ + "r" (MTX_UNOWNED) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a spin lock (with possible recursion). + * + * We use cmpxchgl to clear lock (instead of simple store) to flush posting + * buffers and make the change visible to other CPU's. + */ +#define _exitlock_spin(mtxp, inten1, inten2) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movw %1,%%ax;" \ +" decw %%ax;" \ +" js 1f;" \ +" movw %%ax,%1;" \ +" jmp 2f;" \ +"1: movl %0,%%eax;" \ +" movl $ " _V(MTX_UNOWNED) ",%%ecx;" \ +" " inten1 ";" \ +" " MPLOCKED "" \ +" cmpxchgl %%ecx,%0;" \ +" " inten2 ";" \ +"2:" \ +"# exitlock_spin" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+m" (mtxp->mtx_recurse), /* 1 */ \ + "=&a" (_res) /* 2 */ \ + : "g" (mtxp->mtx_savefl) /* 3 (used in 'inten') */ \ + : "memory", "ecx" /* used */ ); \ +}) + +#else /* I386_CPU */ + +/* + * For 386 processors only. + */ + +/* Get a sleep lock, deal with recursion inline. */ +#define _getlock_sleep(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) { \ + if (((mp)->mtx_lock & MTX_FLAGMASK) != (tid)) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, 0); \ + else { \ + atomic_set_int(&(mp)->mtx_lock, MTX_RECURSE); \ + (mp)->mtx_recurse++; \ + } \ + } \ +} while (0) + +/* Get a spin lock, handle recursion inline (as the less common case) */ +#define _getlock_spin_block(mp, tid, type) do { \ + u_int _mtx_fl = read_eflags(); \ + disable_intr(); \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, _mtx_fl); \ + else \ + (mp)->mtx_savefl = _mtx_fl; \ +} while (0) + +/* + * Get a lock without any recursion handling. Calls the hard enter function if + * we can't get it inline. + */ +#define _getlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard((mp), (type) & MTX_HARDOPTS, 0); \ +} while (0) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is handled + * in the hard function. + */ +#define _exitlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ +} while (0) + +/* + * Release a sleep lock when its likely we recursed (the code to + * deal with simple recursion is inline). 
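
Both the inline assembly above and the I386_CPU fallbacks below lean on one primitive: an atomic compare-and-set of the lock word. Its assumed contract, written out sequentially (the real implementation is a single locked cmpxchgl, and atomic_cmpset_int provides it in the 386 case):

/* Store `new' and return non-zero only if *p still holds `exp'. */
static int
cmpset_sketch(volatile u_int *p, u_int exp, u_int new)
{
	if (*p != exp)
		return (0);	/* owned, contested, or recursed */
	*p = new;		/* really one atomic instruction */
	return (1);
}

Acquisition is cmpset(&m->mtx_lock, MTX_UNOWNED, tid); release is cmpset(&m->mtx_lock, tid, MTX_UNOWNED); when either fails, the code drops into mtx_enter_hard() or mtx_exit_hard().
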
+ */ +#define _exitlock(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) { \ + if ((mp)->mtx_lock & MTX_RECURSE) { \ + if (--((mp)->mtx_recurse) == 0) \ + atomic_clear_int(&(mp)->mtx_lock, \ + MTX_RECURSE); \ + } else { \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ + } \ + } \ +} while (0) + +/* Release a spin lock (with possible recursion). */ +#define _exitlock_spin(mp, inten1, inten2) do { \ + if ((mp)->mtx_recurse == 0) { \ + atomic_cmpset_int(&(mp)->mtx_lock, (mp)->mtx_lock, \ + MTX_UNOWNED); \ + write_eflags((mp)->mtx_savefl); \ + } else { \ + (mp)->mtx_recurse--; \ + } \ +} while (0) + +#endif /* I386_CPU */ + +/* + * Externally visible mutex functions. + *------------------------------------------------------------------------------ + */ + +/* + * Return non-zero if a mutex is already owned by the current thread. + */ +#define mtx_owned(m) (((m)->mtx_lock & MTX_FLAGMASK) == CURTHD) + +/* Common strings */ +#ifdef MTX_STRS +#ifdef KTR_EXTEND + +/* + * KTR_EXTEND saves file name and line for all entries, so we don't need them + * here. Theoretically we should also change the entries which refer to them + * (from CTR5 to CTR3), but since they're just passed to snprinf as the last + * parameters, it doesn't do any harm to leave them. + */ +char STR_mtx_enter_fmt[] = "GOT %s [%x] r=%d"; +char STR_mtx_exit_fmt[] = "REL %s [%x] r=%d"; +char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%x] result=%d"; +#else +char STR_mtx_enter_fmt[] = "GOT %s [%x] at %s:%d r=%d"; +char STR_mtx_exit_fmt[] = "REL %s [%x] at %s:%d r=%d"; +char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%x] at %s:%d result=%d"; +#endif +char STR_mtx_bad_type[] = "((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0"; +char STR_mtx_owned[] = "mtx_owned(_mpp)"; +char STR_mtx_recurse[] = "_mpp->mtx_recurse == 0"; +#else /* MTX_STRS */ +extern char STR_mtx_enter_fmt[]; +extern char STR_mtx_bad_type[]; +extern char STR_mtx_exit_fmt[]; +extern char STR_mtx_owned[]; +extern char STR_mtx_recurse[]; +extern char STR_mtx_try_enter_fmt[]; +#endif /* MTX_STRS */ + +#ifndef KLD_MODULE +/* + * Get lock 'm', the macro handles the easy (and most common cases) and leaves + * the slow stuff to the mtx_enter_hard() function. + * + * Note: since type is usually a constant much of this code is optimized out. + */ +_MTX_INLINE void +mtx_enter(mtx_t *mtxp, int type) +{ + mtx_t *_mpp = mtxp; + + /* bits only valid on mtx_exit() */ + MPASS2(((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0, + STR_mtx_bad_type); + + do { + if ((type) & MTX_SPIN) { + /* + * Easy cases of spin locks: + * + * 1) We already own the lock and will simply + * recurse on it (if RLIKELY) + * + * 2) The lock is free, we just get it + */ + if ((type) & MTX_RLIKELY) { + /* + * Check for recursion, if we already + * have this lock we just bump the + * recursion count. + */ + if (_mpp->mtx_lock == CURTHD) { + _mpp->mtx_recurse++; + break; /* Done */ + } + } + + if (((type) & MTX_TOPHALF) == 0) { + /* + * If an interrupt thread uses this + * we must block interrupts here. 
+ */ + if ((type) & MTX_FIRST) { + ASS_IEN; + disable_intr(); + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } else { + _getlock_spin_block(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } else + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } else { + /* Sleep locks */ + if ((type) & MTX_RLIKELY) + _getlock_sleep(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + else + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } while (0); + WITNESS_ENTER(_mpp, type); + CTR5(KTR_LOCK, STR_mtx_enter_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, + (_mpp)->mtx_recurse); +} + +/* + * Attempt to get MTX_DEF lock, return non-zero if lock acquired. + * + * XXX DOES NOT HANDLE RECURSION + */ +_MTX_INLINE int +mtx_try_enter(mtx_t *mtxp, int type) +{ + mtx_t *const _mpp = mtxp; + int _rval; + + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD); +#ifdef SMP_DEBUG + if (_rval && (_mpp)->mtx_witness != NULL) { + ASS((_mpp)->mtx_recurse == 0); + witness_try_enter(_mpp, type, __FILE__, __LINE__); + } +#endif + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, _rval); + + return _rval; +} + +#define mtx_legal2block() (read_eflags() & 0x200) + +/* + * Release lock m. + */ +_MTX_INLINE void +mtx_exit(mtx_t *mtxp, int type) +{ + mtx_t *const _mpp = mtxp; + + MPASS2(mtx_owned(_mpp), STR_mtx_owned); + WITNESS_EXIT(_mpp, type); + CTR5(KTR_LOCK, STR_mtx_exit_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, + (_mpp)->mtx_recurse); + if ((type) & MTX_SPIN) { + if ((type) & MTX_NORECURSE) { + MPASS2(_mpp->mtx_recurse == 0, STR_mtx_recurse); + atomic_cmpset_int(&_mpp->mtx_lock, _mpp->mtx_lock, + MTX_UNOWNED); + if (((type) & MTX_TOPHALF) == 0) { + if ((type) & MTX_FIRST) { + ASS_IDIS; + enable_intr(); + } else + write_eflags(_mpp->mtx_savefl); + } + } else { + if ((type) & MTX_TOPHALF) + _exitlock_spin(_mpp,,); + else { + if ((type) & MTX_FIRST) { + ASS_IDIS; + _exitlock_spin(_mpp,, "sti"); + } else { + _exitlock_spin(_mpp, + "pushl %3", "popfl"); + } + } + } + } else { + /* Handle sleep locks */ + if ((type) & MTX_RLIKELY) + _exitlock(_mpp, CURTHD, (type) & MTX_HARDOPTS); + else { + _exitlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } +} + +#endif /* KLD_MODULE */ +#endif /* _KERNEL */ + +#else /* !LOCORE */ + +/* + * Simple assembly macros to get and release non-recursive spin locks + */ + +#if defined(I386_CPU) + +#define MTX_EXIT(lck, reg) \ + movl $ MTX_UNOWNED,lck+MTX_LOCK; + +#else /* I386_CPU */ + +#define MTX_ENTER(reg, lck) \ +9: movl $ MTX_UNOWNED,%eax; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + jnz 9b + +/* Must use locked bus op (cmpxchg) when setting to unowned (barrier) */ +#define MTX_EXIT(lck,reg) \ + movl lck+MTX_LOCK,%eax; \ + movl $ MTX_UNOWNED,reg; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + +#define MTX_ENTER_WITH_RECURSION(reg, lck) \ + movl lck+MTX_LOCK,%eax; \ + cmpl PCPU_CURPROC,%eax; \ + jne 9f; \ + incw lck+MTX_RECURSECNT; \ + jmp 8f; \ +9: movl $ MTX_UNOWNED,%eax; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + jnz 9b; \ +8: + +#define MTX_EXIT_WITH_RECURSION(lck,reg) \ + movw lck+MTX_RECURSECNT,%ax; \ + decw %ax; \ + js 9f; \ + movw %ax,lck+MTX_RECURSECNT; \ + jmp 8f; \ +9: movl lck+MTX_LOCK,%eax; \ + movl $ MTX_UNOWNED,reg; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ +8: + +#endif /* I386_CPU */ +#endif /* !LOCORE */ +#endif /* __MACHINE_MUTEX_H */ diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h index 08beb5a83059..1c7af8505ab1 100644 
--- a/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -72,11 +72,7 @@ struct pcb { #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ caddr_t pcb_onfault; /* copyin/out fault recovery */ -#ifdef SMP - u_long pcb_mpnest; -#else - u_long pcb_mpnest_dontuse; -#endif + int pcb_schednest; int pcb_gs; struct pcb_ext *pcb_ext; /* optional pcb extension */ u_long __pcb_spare[3]; /* adjust to avoid core dump size changes */ diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index 58bd9cfe9416..440da60b4b83 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -26,6 +26,20 @@ * $FreeBSD$ */ +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#include +#include +#include +#include +#include + +/* XXX */ +#ifdef KTR_PERCPU +#include +#endif + /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler @@ -41,11 +55,14 @@ struct globaldata { struct privatespace *gd_prvspace; /* self-reference */ struct proc *gd_curproc; + struct proc *gd_prevproc; struct proc *gd_npxproc; struct pcb *gd_curpcb; + struct proc *gd_idleproc; struct timeval gd_switchtime; struct i386tss gd_common_tss; int gd_switchticks; + int gd_intr_nesting_level; struct segment_descriptor gd_common_tssd; struct segment_descriptor *gd_tss_gdt; #ifdef USER_LDT @@ -67,8 +84,22 @@ struct globaldata { unsigned *gd_prv_PADDR1; #endif u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; + int gd_witness_spin_check; +#ifdef KTR_PERCPU +#ifdef KTR + volatile int gd_ktr_idx; + char *gd_ktr_buf; + char gd_ktr_buf_data[KTR_SIZE]; +#endif +#endif }; +extern struct globaldata globaldata; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + #ifdef SMP /* * This is the upper (0xff800000) address space layout that is per-cpu. @@ -93,3 +124,5 @@ struct privatespace { extern struct privatespace SMP_prvspace[]; #endif + +#endif /* ! _MACHINE_GLOBALDATA_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 69b716ba8579..20d4fa3a8873 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -15,6 +15,9 @@ #ifdef _KERNEL +#ifdef I386_CPU +#error SMP not supported with I386_CPU +#endif #if defined(SMP) && !defined(APIC_IO) # error APIC_IO required for SMP, add "options APIC_IO" to your config file. 
#endif /* SMP && !APIC_IO */ @@ -57,23 +60,6 @@ extern int bootMP_size; /* functions in mpboot.s */ void bootMP __P((void)); -/* global data in mplock.s */ -extern u_int mp_lock; -extern u_int isr_lock; -#ifdef RECURSIVE_MPINTRLOCK -extern u_int mpintr_lock; -#endif /* RECURSIVE_MPINTRLOCK */ - -/* functions in mplock.s */ -void get_mplock __P((void)); -void rel_mplock __P((void)); -int try_mplock __P((void)); -#ifdef RECURSIVE_MPINTRLOCK -void get_mpintrlock __P((void)); -void rel_mpintrlock __P((void)); -int try_mpintrlock __P((void)); -#endif /* RECURSIVE_MPINTRLOCK */ - /* global data in apic_vector.s */ extern volatile u_int stopped_cpus; extern volatile u_int started_cpus; @@ -185,23 +171,7 @@ extern int smp_started; extern volatile int smp_idle_loops; #endif /* !LOCORE */ -#else /* !SMP && !APIC_IO */ - -/* - * Create dummy MP lock empties - */ - -static __inline void -get_mplock(void) -{ -} - -static __inline void -rel_mplock(void) -{ -} - -#endif +#endif /* SMP && !APIC_IO */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/amd64/isa/atpic_vector.S b/sys/amd64/isa/atpic_vector.S index e427351ca205..d2b88bf705a3 100644 --- a/sys/amd64/isa/atpic_vector.S +++ b/sys/amd64/isa/atpic_vector.S @@ -53,9 +53,11 @@ IDTVEC(vec_name) ; \ pushl %ecx ; \ pushl %edx ; \ pushl %ds ; \ + pushl %fs ; \ MAYBE_PUSHL_ES ; \ mov $KDSEL,%ax ; \ mov %ax,%ds ; \ + mov %ax,%fs ; \ MAYBE_MOVW_AX_ES ; \ FAKE_MCOUNT((4+ACTUALLY_PUSHED)*4(%esp)) ; \ pushl _intr_unit + (irq_num) * 4 ; \ @@ -65,18 +67,21 @@ IDTVEC(vec_name) ; \ incl _cnt+V_INTR ; /* book-keeping can wait */ \ movl _intr_countp + (irq_num) * 4,%eax ; \ incl (%eax) ; \ - movl _cpl,%eax ; /* are we unmasking pending HWIs or SWIs? */ \ +/* movl _cpl,%eax ; // are we unmasking pending SWIs? / \ notl %eax ; \ - andl _ipending,%eax ; \ - jne 2f ; /* yes, maybe handle them */ \ + andl _spending,$SWI_MASK ; \ + jne 2f ; // yes, maybe handle them */ \ 1: ; \ MEXITCOUNT ; \ MAYBE_POPL_ES ; \ + popl %fs ; \ popl %ds ; \ popl %edx ; \ popl %ecx ; \ popl %eax ; \ iret ; \ + +#if 0 ; \ ALIGN_TEXT ; \ 2: ; \ @@ -88,6 +93,7 @@ IDTVEC(vec_name) ; \ incb _intr_nesting_level ; /* ... really limit it ... */ \ sti ; /* ... to do this as early as possible */ \ MAYBE_POPL_ES ; /* discard most of thin frame ... */ \ + popl %fs ; \ popl %ecx ; /* ... original %ds ... */ \ popl %edx ; \ xchgl %eax,4(%esp) ; /* orig %eax; save cpl */ \ @@ -101,11 +107,20 @@ IDTVEC(vec_name) ; \ movl (3+8+0)*4(%esp),%ecx ; /* ... %ecx from thin frame ... */ \ movl %ecx,(3+6)*4(%esp) ; /* ... to fat frame ... */ \ movl (3+8+1)*4(%esp),%eax ; /* ... cpl from thin frame */ \ - pushl %eax ; \ subl $4,%esp ; /* junk for unit number */ \ MEXITCOUNT ; \ jmp _doreti +#endif +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, icu, enable_icus, reg, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -116,8 +131,8 @@ IDTVEC(vec_name) ; \ pushl %ds ; /* save our data and extra segments ... */ \ pushl %es ; \ pushl %fs ; \ - mov $KDSEL,%ax ; /* ... and reload with kernel's own ... */ \ - mov %ax,%ds ; /* ... 
early for obsolete reasons */ \ + mov $KDSEL,%ax ; /* load kernel ds, es and fs */ \ + mov %ax,%ds ; \ mov %ax,%es ; \ mov %ax,%fs ; \ maybe_extra_ipending ; \ @@ -126,43 +141,37 @@ IDTVEC(vec_name) ; \ movb %al,_imen + IRQ_BYTE(irq_num) ; \ outb %al,$icu+ICU_IMR_OFFSET ; \ enable_icus ; \ - movl _cpl,%eax ; \ - testb $IRQ_BIT(irq_num),%reg ; \ - jne 2f ; \ - incb _intr_nesting_level ; \ + incb _intr_nesting_level ; /* XXX do we need this? */ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX late to avoid double count */ \ - incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4,%eax ; \ - incl (%eax) ; \ - movl _cpl,%eax ; \ - pushl %eax ; \ - pushl _intr_unit + (irq_num) * 4 ; \ - orl _intr_mask + (irq_num) * 4,%eax ; \ - movl %eax,_cpl ; \ + pushl $irq_num; /* pass the IRQ */ \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; /* must unmask _imen and icu atomically */ \ - movb _imen + IRQ_BYTE(irq_num),%al ; \ - andb $~IRQ_BIT(irq_num),%al ; \ - movb %al,_imen + IRQ_BYTE(irq_num) ; \ - outb %al,$icu+ICU_IMR_OFFSET ; \ - sti ; /* XXX _doreti repeats the cli/sti */ \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ MEXITCOUNT ; \ /* We could usually avoid the following jmp by inlining some of */ \ /* _doreti, but it's probably better to use less cache. */ \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -2: ; \ - /* XXX skip mcounting here to avoid double count */ \ - orb $IRQ_BIT(irq_num),_ipending + IRQ_BYTE(irq_num) ; \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp ; \ - iret + jmp doreti_next /* and catch up inside doreti */ + +/* + * Reenable the interrupt mask after completing an interrupt. Called + * from ithd_loop. There are two separate functions, one for each + * ICU. 
+ */ + .globl setimask0, setimask1 +setimask0: + cli + movb _imen,%al + outb %al,$IO_ICU1 + ICU_IMR_OFFSET + sti + ret + +setimask1: + cli + movb _imen + 1,%al + outb %al,$IO_ICU2 + ICU_IMR_OFFSET + sti + ret MCOUNT_LABEL(bintr) FAST_INTR(0,fastintr0, ENABLE_ICU1) @@ -181,7 +190,9 @@ MCOUNT_LABEL(bintr) FAST_INTR(13,fastintr13, ENABLE_ICU1_AND_2) FAST_INTR(14,fastintr14, ENABLE_ICU1_AND_2) FAST_INTR(15,fastintr15, ENABLE_ICU1_AND_2) + #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, IO_ICU1, ENABLE_ICU1, al, CLKINTR_PENDING) INTR(1,intr1, IO_ICU1, ENABLE_ICU1, al,) INTR(2,intr2, IO_ICU1, ENABLE_ICU1, al,) @@ -198,6 +209,7 @@ MCOUNT_LABEL(bintr) INTR(13,intr13, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(14,intr14, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(15,intr15, IO_ICU2, ENABLE_ICU1_AND_2, ah,) + MCOUNT_LABEL(eintr) .data @@ -211,10 +223,4 @@ _ihandlers: /* addresses of interrupt handlers */ .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - .text diff --git a/sys/amd64/isa/clock.c b/sys/amd64/isa/clock.c index 15044abbaa3b..724f3c2817ba 100644 --- a/sys/amd64/isa/clock.c +++ b/sys/amd64/isa/clock.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -93,10 +94,6 @@ #include #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ /* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); 
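/*
 * Sketch of the locking pattern this change applies throughout clock.c
 * (illustrative aside, not an additional change to the patch): each
 * disable_intr()/enable_intr() or read_eflags()/write_eflags() bracket
 * around 8254 register access becomes
 *
 *	intrsave = save_intr();
 *	disable_intr();
 *	CLOCK_LOCK();
 *	... program or latch the timer ...
 *	CLOCK_UNLOCK();
 *	restore_intr(intrsave);
 *
 * so the previous interrupt state is restored rather than interrupts
 * being unconditionally re-enabled, and the timer registers are also
 * covered by the CLOCK_LOCK()/CLOCK_UNLOCK() pair.
 */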
i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. */ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ calibrate_clocks(void) static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. 
+ */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ - - writertc(RTC_STATUSB, rtc_statusb); - -#ifdef APIC_IO if (apic_8254_trial) { - + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. + */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); + + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); + printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. 
*/ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/amd64/isa/icu_ipl.S b/sys/amd64/isa/icu_ipl.S index 34753583a41e..d178d5c43c45 100644 --- a/sys/amd64/isa/icu_ipl.S +++ b/sys/amd64/isa/icu_ipl.S @@ -54,63 +54,6 @@ _imen: .long HWI_MASK .text SUPERALIGN_TEXT -/* - * Interrupt priority mechanism - * -- soft splXX masks with group mechanism (cpl) - * -- h/w masks for currently active or unused interrupts (imen) - * -- ipending = active interrupts currently masked by cpl - */ - -ENTRY(splz) - /* - * The caller has restored cpl and checked that (ipending & ~cpl) - * is nonzero. We have to repeat the check since if there is an - * interrupt while we're looking, _doreti processing for the - * interrupt will handle all the unmasked pending interrupts - * because we restored early. We're repeating the calculation - * of (ipending & ~cpl) anyway so that the caller doesn't have - * to pass it, so this only costs one "jne". "bsfl %ecx,%ecx" - * is undefined when %ecx is 0 so we can't rely on the secondary - * btrl tests. - */ - movl _cpl,%eax -splz_next: - /* - * We don't need any locking here. (ipending & ~cpl) cannot grow - * while we're looking at it - any interrupt will shrink it to 0. - */ - movl %eax,%ecx - notl %ecx - andl _ipending,%ecx - jne splz_unpend - ret - - ALIGN_TEXT -splz_unpend: - bsfl %ecx,%ecx - btrl %ecx,_ipending - jnc splz_next - cmpl $NHWI,%ecx - jae splz_swi - /* - * We would prefer to call the intr handler directly here but that - * doesn't work for badly behaved handlers that want the interrupt - * frame. Also, there's a problem determining the unit number. - * We should change the interface so that the unit number is not - * determined at config time. - */ - jmp *vec(,%ecx,4) - - ALIGN_TEXT -splz_swi: - pushl %eax - orl imasks(,%ecx,4),%eax - movl %eax,_cpl - call *_ihandlers(,%ecx,4) - popl %eax - movl %eax,_cpl - jmp splz_next - /* * Fake clock interrupt(s) so that they appear to come from our caller instead * of from here, so that system profiling works. diff --git a/sys/amd64/isa/icu_ipl.s b/sys/amd64/isa/icu_ipl.s index 34753583a41e..d178d5c43c45 100644 --- a/sys/amd64/isa/icu_ipl.s +++ b/sys/amd64/isa/icu_ipl.s @@ -54,63 +54,6 @@ _imen: .long HWI_MASK .text SUPERALIGN_TEXT -/* - * Interrupt priority mechanism - * -- soft splXX masks with group mechanism (cpl) - * -- h/w masks for currently active or unused interrupts (imen) - * -- ipending = active interrupts currently masked by cpl - */ - -ENTRY(splz) - /* - * The caller has restored cpl and checked that (ipending & ~cpl) - * is nonzero. We have to repeat the check since if there is an - * interrupt while we're looking, _doreti processing for the - * interrupt will handle all the unmasked pending interrupts - * because we restored early. We're repeating the calculation - * of (ipending & ~cpl) anyway so that the caller doesn't have - * to pass it, so this only costs one "jne". "bsfl %ecx,%ecx" - * is undefined when %ecx is 0 so we can't rely on the secondary - * btrl tests. - */ - movl _cpl,%eax -splz_next: - /* - * We don't need any locking here. (ipending & ~cpl) cannot grow - * while we're looking at it - any interrupt will shrink it to 0. 
- */ - movl %eax,%ecx - notl %ecx - andl _ipending,%ecx - jne splz_unpend - ret - - ALIGN_TEXT -splz_unpend: - bsfl %ecx,%ecx - btrl %ecx,_ipending - jnc splz_next - cmpl $NHWI,%ecx - jae splz_swi - /* - * We would prefer to call the intr handler directly here but that - * doesn't work for badly behaved handlers that want the interrupt - * frame. Also, there's a problem determining the unit number. - * We should change the interface so that the unit number is not - * determined at config time. - */ - jmp *vec(,%ecx,4) - - ALIGN_TEXT -splz_swi: - pushl %eax - orl imasks(,%ecx,4),%eax - movl %eax,_cpl - call *_ihandlers(,%ecx,4) - popl %eax - movl %eax,_cpl - jmp splz_next - /* * Fake clock interrupt(s) so that they appear to come from our caller instead * of from here, so that system profiling works. diff --git a/sys/amd64/isa/icu_vector.S b/sys/amd64/isa/icu_vector.S index e427351ca205..d2b88bf705a3 100644 --- a/sys/amd64/isa/icu_vector.S +++ b/sys/amd64/isa/icu_vector.S @@ -53,9 +53,11 @@ IDTVEC(vec_name) ; \ pushl %ecx ; \ pushl %edx ; \ pushl %ds ; \ + pushl %fs ; \ MAYBE_PUSHL_ES ; \ mov $KDSEL,%ax ; \ mov %ax,%ds ; \ + mov %ax,%fs ; \ MAYBE_MOVW_AX_ES ; \ FAKE_MCOUNT((4+ACTUALLY_PUSHED)*4(%esp)) ; \ pushl _intr_unit + (irq_num) * 4 ; \ @@ -65,18 +67,21 @@ IDTVEC(vec_name) ; \ incl _cnt+V_INTR ; /* book-keeping can wait */ \ movl _intr_countp + (irq_num) * 4,%eax ; \ incl (%eax) ; \ - movl _cpl,%eax ; /* are we unmasking pending HWIs or SWIs? */ \ +/* movl _cpl,%eax ; // are we unmasking pending SWIs? / \ notl %eax ; \ - andl _ipending,%eax ; \ - jne 2f ; /* yes, maybe handle them */ \ + andl _spending,$SWI_MASK ; \ + jne 2f ; // yes, maybe handle them */ \ 1: ; \ MEXITCOUNT ; \ MAYBE_POPL_ES ; \ + popl %fs ; \ popl %ds ; \ popl %edx ; \ popl %ecx ; \ popl %eax ; \ iret ; \ + +#if 0 ; \ ALIGN_TEXT ; \ 2: ; \ @@ -88,6 +93,7 @@ IDTVEC(vec_name) ; \ incb _intr_nesting_level ; /* ... really limit it ... */ \ sti ; /* ... to do this as early as possible */ \ MAYBE_POPL_ES ; /* discard most of thin frame ... */ \ + popl %fs ; \ popl %ecx ; /* ... original %ds ... */ \ popl %edx ; \ xchgl %eax,4(%esp) ; /* orig %eax; save cpl */ \ @@ -101,11 +107,20 @@ IDTVEC(vec_name) ; \ movl (3+8+0)*4(%esp),%ecx ; /* ... %ecx from thin frame ... */ \ movl %ecx,(3+6)*4(%esp) ; /* ... to fat frame ... */ \ movl (3+8+1)*4(%esp),%eax ; /* ... cpl from thin frame */ \ - pushl %eax ; \ subl $4,%esp ; /* junk for unit number */ \ MEXITCOUNT ; \ jmp _doreti +#endif +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, icu, enable_icus, reg, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -116,8 +131,8 @@ IDTVEC(vec_name) ; \ pushl %ds ; /* save our data and extra segments ... */ \ pushl %es ; \ pushl %fs ; \ - mov $KDSEL,%ax ; /* ... and reload with kernel's own ... */ \ - mov %ax,%ds ; /* ... early for obsolete reasons */ \ + mov $KDSEL,%ax ; /* load kernel ds, es and fs */ \ + mov %ax,%ds ; \ mov %ax,%es ; \ mov %ax,%fs ; \ maybe_extra_ipending ; \ @@ -126,43 +141,37 @@ IDTVEC(vec_name) ; \ movb %al,_imen + IRQ_BYTE(irq_num) ; \ outb %al,$icu+ICU_IMR_OFFSET ; \ enable_icus ; \ - movl _cpl,%eax ; \ - testb $IRQ_BIT(irq_num),%reg ; \ - jne 2f ; \ - incb _intr_nesting_level ; \ + incb _intr_nesting_level ; /* XXX do we need this? 
*/ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX late to avoid double count */ \ - incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4,%eax ; \ - incl (%eax) ; \ - movl _cpl,%eax ; \ - pushl %eax ; \ - pushl _intr_unit + (irq_num) * 4 ; \ - orl _intr_mask + (irq_num) * 4,%eax ; \ - movl %eax,_cpl ; \ + pushl $irq_num; /* pass the IRQ */ \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; /* must unmask _imen and icu atomically */ \ - movb _imen + IRQ_BYTE(irq_num),%al ; \ - andb $~IRQ_BIT(irq_num),%al ; \ - movb %al,_imen + IRQ_BYTE(irq_num) ; \ - outb %al,$icu+ICU_IMR_OFFSET ; \ - sti ; /* XXX _doreti repeats the cli/sti */ \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ MEXITCOUNT ; \ /* We could usually avoid the following jmp by inlining some of */ \ /* _doreti, but it's probably better to use less cache. */ \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -2: ; \ - /* XXX skip mcounting here to avoid double count */ \ - orb $IRQ_BIT(irq_num),_ipending + IRQ_BYTE(irq_num) ; \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp ; \ - iret + jmp doreti_next /* and catch up inside doreti */ + +/* + * Reenable the interrupt mask after completing an interrupt. Called + * from ithd_loop. There are two separate functions, one for each + * ICU. + */ + .globl setimask0, setimask1 +setimask0: + cli + movb _imen,%al + outb %al,$IO_ICU1 + ICU_IMR_OFFSET + sti + ret + +setimask1: + cli + movb _imen + 1,%al + outb %al,$IO_ICU2 + ICU_IMR_OFFSET + sti + ret MCOUNT_LABEL(bintr) FAST_INTR(0,fastintr0, ENABLE_ICU1) @@ -181,7 +190,9 @@ MCOUNT_LABEL(bintr) FAST_INTR(13,fastintr13, ENABLE_ICU1_AND_2) FAST_INTR(14,fastintr14, ENABLE_ICU1_AND_2) FAST_INTR(15,fastintr15, ENABLE_ICU1_AND_2) + #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, IO_ICU1, ENABLE_ICU1, al, CLKINTR_PENDING) INTR(1,intr1, IO_ICU1, ENABLE_ICU1, al,) INTR(2,intr2, IO_ICU1, ENABLE_ICU1, al,) @@ -198,6 +209,7 @@ MCOUNT_LABEL(bintr) INTR(13,intr13, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(14,intr14, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(15,intr15, IO_ICU2, ENABLE_ICU1_AND_2, ah,) + MCOUNT_LABEL(eintr) .data @@ -211,10 +223,4 @@ _ihandlers: /* addresses of interrupt handlers */ .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - .text diff --git a/sys/amd64/isa/icu_vector.s b/sys/amd64/isa/icu_vector.s index e427351ca205..d2b88bf705a3 100644 --- a/sys/amd64/isa/icu_vector.s +++ b/sys/amd64/isa/icu_vector.s @@ -53,9 +53,11 @@ IDTVEC(vec_name) ; \ pushl %ecx ; \ pushl %edx ; \ pushl %ds ; \ + pushl %fs ; \ MAYBE_PUSHL_ES ; \ mov $KDSEL,%ax ; \ mov %ax,%ds ; \ + mov %ax,%fs ; \ MAYBE_MOVW_AX_ES ; \ FAKE_MCOUNT((4+ACTUALLY_PUSHED)*4(%esp)) ; \ pushl _intr_unit + (irq_num) * 4 ; \ @@ -65,18 +67,21 @@ IDTVEC(vec_name) ; \ incl _cnt+V_INTR ; /* book-keeping can wait */ \ movl _intr_countp + (irq_num) * 4,%eax ; \ incl (%eax) ; \ - movl _cpl,%eax ; /* are we unmasking pending HWIs or SWIs? */ \ +/* movl _cpl,%eax ; // are we unmasking pending SWIs? 
/ \ notl %eax ; \ - andl _ipending,%eax ; \ - jne 2f ; /* yes, maybe handle them */ \ + andl _spending,$SWI_MASK ; \ + jne 2f ; // yes, maybe handle them */ \ 1: ; \ MEXITCOUNT ; \ MAYBE_POPL_ES ; \ + popl %fs ; \ popl %ds ; \ popl %edx ; \ popl %ecx ; \ popl %eax ; \ iret ; \ + +#if 0 ; \ ALIGN_TEXT ; \ 2: ; \ @@ -88,6 +93,7 @@ IDTVEC(vec_name) ; \ incb _intr_nesting_level ; /* ... really limit it ... */ \ sti ; /* ... to do this as early as possible */ \ MAYBE_POPL_ES ; /* discard most of thin frame ... */ \ + popl %fs ; \ popl %ecx ; /* ... original %ds ... */ \ popl %edx ; \ xchgl %eax,4(%esp) ; /* orig %eax; save cpl */ \ @@ -101,11 +107,20 @@ IDTVEC(vec_name) ; \ movl (3+8+0)*4(%esp),%ecx ; /* ... %ecx from thin frame ... */ \ movl %ecx,(3+6)*4(%esp) ; /* ... to fat frame ... */ \ movl (3+8+1)*4(%esp),%eax ; /* ... cpl from thin frame */ \ - pushl %eax ; \ subl $4,%esp ; /* junk for unit number */ \ MEXITCOUNT ; \ jmp _doreti +#endif +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, icu, enable_icus, reg, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -116,8 +131,8 @@ IDTVEC(vec_name) ; \ pushl %ds ; /* save our data and extra segments ... */ \ pushl %es ; \ pushl %fs ; \ - mov $KDSEL,%ax ; /* ... and reload with kernel's own ... */ \ - mov %ax,%ds ; /* ... early for obsolete reasons */ \ + mov $KDSEL,%ax ; /* load kernel ds, es and fs */ \ + mov %ax,%ds ; \ mov %ax,%es ; \ mov %ax,%fs ; \ maybe_extra_ipending ; \ @@ -126,43 +141,37 @@ IDTVEC(vec_name) ; \ movb %al,_imen + IRQ_BYTE(irq_num) ; \ outb %al,$icu+ICU_IMR_OFFSET ; \ enable_icus ; \ - movl _cpl,%eax ; \ - testb $IRQ_BIT(irq_num),%reg ; \ - jne 2f ; \ - incb _intr_nesting_level ; \ + incb _intr_nesting_level ; /* XXX do we need this? */ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX late to avoid double count */ \ - incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4,%eax ; \ - incl (%eax) ; \ - movl _cpl,%eax ; \ - pushl %eax ; \ - pushl _intr_unit + (irq_num) * 4 ; \ - orl _intr_mask + (irq_num) * 4,%eax ; \ - movl %eax,_cpl ; \ + pushl $irq_num; /* pass the IRQ */ \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; /* must unmask _imen and icu atomically */ \ - movb _imen + IRQ_BYTE(irq_num),%al ; \ - andb $~IRQ_BIT(irq_num),%al ; \ - movb %al,_imen + IRQ_BYTE(irq_num) ; \ - outb %al,$icu+ICU_IMR_OFFSET ; \ - sti ; /* XXX _doreti repeats the cli/sti */ \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ MEXITCOUNT ; \ /* We could usually avoid the following jmp by inlining some of */ \ /* _doreti, but it's probably better to use less cache. */ \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -2: ; \ - /* XXX skip mcounting here to avoid double count */ \ - orb $IRQ_BIT(irq_num),_ipending + IRQ_BYTE(irq_num) ; \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp ; \ - iret + jmp doreti_next /* and catch up inside doreti */ + +/* + * Reenable the interrupt mask after completing an interrupt. Called + * from ithd_loop. There are two separate functions, one for each + * ICU. 
+ */ + .globl setimask0, setimask1 +setimask0: + cli + movb _imen,%al + outb %al,$IO_ICU1 + ICU_IMR_OFFSET + sti + ret + +setimask1: + cli + movb _imen + 1,%al + outb %al,$IO_ICU2 + ICU_IMR_OFFSET + sti + ret MCOUNT_LABEL(bintr) FAST_INTR(0,fastintr0, ENABLE_ICU1) @@ -181,7 +190,9 @@ MCOUNT_LABEL(bintr) FAST_INTR(13,fastintr13, ENABLE_ICU1_AND_2) FAST_INTR(14,fastintr14, ENABLE_ICU1_AND_2) FAST_INTR(15,fastintr15, ENABLE_ICU1_AND_2) + #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, IO_ICU1, ENABLE_ICU1, al, CLKINTR_PENDING) INTR(1,intr1, IO_ICU1, ENABLE_ICU1, al,) INTR(2,intr2, IO_ICU1, ENABLE_ICU1, al,) @@ -198,6 +209,7 @@ MCOUNT_LABEL(bintr) INTR(13,intr13, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(14,intr14, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(15,intr15, IO_ICU2, ENABLE_ICU1_AND_2, ah,) + MCOUNT_LABEL(eintr) .data @@ -211,10 +223,4 @@ _ihandlers: /* addresses of interrupt handlers */ .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - .text diff --git a/sys/amd64/isa/intr_machdep.c b/sys/amd64/isa/intr_machdep.c index 34a8c229bd6b..870760e1ce01 100644 --- a/sys/amd64/isa/intr_machdep.c +++ b/sys/amd64/isa/intr_machdep.c @@ -36,12 +36,6 @@ * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * $FreeBSD$ */ -/* - * This file contains an aggregated module marked: - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * See the notice for details. - */ #include "opt_auto_eoi.h" @@ -51,11 +45,14 @@ #ifndef SMP #include #endif +#include #include #include #include +#include #include #include +#include #include #include #include @@ -91,30 +88,14 @@ #include #endif -/* XXX should be in suitable include files */ -#ifdef PC98 -#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ -#define ICU_SLAVEID 7 -#else -#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ -#define ICU_SLAVEID 2 -#endif - -#ifdef APIC_IO /* - * This is to accommodate "mixed-mode" programming for - * motherboards that don't connect the 8254 to the IO APIC. + * Per-interrupt data. We consider the soft interrupt to be a special + * case, so these arrays have NHWI + NSWI entries, not ICU_LEN. */ -#define AUTO_EOI_1 1 -#endif - -#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) - -u_long *intr_countp[ICU_LEN]; -inthand2_t *intr_handler[ICU_LEN]; -u_int intr_mask[ICU_LEN]; -static u_int* intr_mptr[ICU_LEN]; -void *intr_unit[ICU_LEN]; +u_long *intr_countp[NHWI + NSWI]; /* pointers to interrupt counters */ +inthand2_t *intr_handler[NHWI + NSWI]; /* first level interrupt handler */ +ithd *ithds[NHWI + NSWI]; /* real interrupt handler */ +void *intr_unit[NHWI + NSWI]; static inthand_t *fastintr[ICU_LEN] = { &IDTVEC(fastintr0), &IDTVEC(fastintr1), @@ -292,8 +273,9 @@ isa_nmi(cd) } /* - * Fill in default interrupt table (in case of spuruious interrupt - * during configuration of kernel, setup interrupt control unit + * Create a default interrupt table to avoid problems caused by + * spurious interrupts during configuration of kernel, then setup + * interrupt control unit. */ void isa_defaultirq() @@ -364,16 +346,6 @@ isa_strayintr(vcookiep) { int intr = (void **)vcookiep - &intr_unit[0]; - /* DON'T BOTHER FOR NOW! */ - /* for some reason, we get bursts of intr #7, even if not enabled! 
*/ - /* - * Well the reason you got bursts of intr #7 is because someone - * raised an interrupt line and dropped it before the 8259 could - * prioritize it. This is documented in the intel data book. This - * means you have BAD hardware! I have changed this so that only - * the first 5 get logged, then it quits logging them, and puts - * out a special message. rgrimes 3/25/1993 - */ /* * XXX TODO print a different message for #7 if it is for a * glitch. Glitches can be distinguished from real #7's by @@ -405,36 +377,10 @@ isa_irq_pending() } #endif -int -update_intr_masks(void) -{ - int intr, n=0; - u_int mask,*maskptr; - - for (intr=0; intr < ICU_LEN; intr ++) { -#if defined(APIC_IO) - /* no 8259 SLAVE to ignore */ -#else - if (intr==ICU_SLAVEID) continue; /* ignore 8259 SLAVE output */ -#endif /* APIC_IO */ - maskptr = intr_mptr[intr]; - if (!maskptr) - continue; - *maskptr |= SWI_LOW_MASK | (1 << intr); - mask = *maskptr; - if (mask != intr_mask[intr]) { -#if 0 - printf ("intr_mask[%2d] old=%08x new=%08x ptr=%p.\n", - intr, intr_mask[intr], mask, maskptr); -#endif - intr_mask[intr]=mask; - n++; - } - - } - return (n); -} - +/* + * Update intrnames array with the specified name. This is used by + * vmstat(8) and the like. + */ static void update_intrname(int intr, char *name) { @@ -485,7 +431,7 @@ update_intrname(int intr, char *name) } int -icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) +icu_setup(int intr, inthand2_t *handler, void *arg, int flags) { #ifdef FAST_HI int select; /* the select register is 8 bits */ @@ -493,7 +439,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) u_int32_t value; /* the window register is 32 bits */ #endif /* FAST_HI */ u_long ef; - u_int mask = (maskptr ? *maskptr : 0); #if defined(APIC_IO) if ((u_int)intr >= ICU_LEN) /* no 8259 SLAVE to ignore */ @@ -506,8 +451,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) ef = read_eflags(); disable_intr(); intr_handler[intr] = handler; - intr_mptr[intr] = maskptr; - intr_mask[intr] = mask | SWI_LOW_MASK | (1 << intr); intr_unit[intr] = arg; #ifdef FAST_HI if (flags & INTR_FAST) { @@ -547,11 +490,15 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ INTREN(1 << intr); - MPINTR_UNLOCK(); write_eflags(ef); return (0); } +/* + * Dissociate an interrupt handler from an IRQ and set the handler to + * the stray interrupt handler. The 'handler' parameter is used only + * for consistency checking. + */ int icu_unset(intr, handler) int intr; @@ -567,8 +514,6 @@ icu_unset(intr, handler) disable_intr(); intr_countp[intr] = &intrcnt[1 + intr]; intr_handler[intr] = isa_strayintr; - intr_mptr[intr] = NULL; - intr_mask[intr] = HWI_MASK | SWI_MASK; intr_unit[intr] = &intr_unit[intr]; #ifdef FAST_HI_XXX /* XXX how do I re-create dvp here? */ @@ -581,353 +526,172 @@ icu_unset(intr, handler) setidt(ICU_OFFSET + intr, slowintr[intr], SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ - MPINTR_UNLOCK(); write_eflags(ef); return (0); } -/* The following notice applies beyond this point in the file */ - -/* - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -typedef struct intrec { - intrmask_t mask; - inthand2_t *handler; - void *argument; - struct intrec *next; - char *name; - int intr; - intrmask_t *maskptr; - int flags; -} intrec; - -static intrec *intreclist_head[ICU_LEN]; - -/* - * The interrupt multiplexer calls each of the handlers in turn. The - * ipl is initially quite low. It is raised as necessary for each call - * and lowered after the call. Thus out of order handling is possible - * even for interrupts of the same type. This is probably no more - * harmful than out of order handling in general (not harmful except - * for real time response which we don't support anyway). - */ -static void -intr_mux(void *arg) -{ - intrec *p; - intrmask_t oldspl; - - for (p = arg; p != NULL; p = p->next) { - oldspl = splq(p->mask); - p->handler(p->argument); - splx(oldspl); - } -} - -static intrec* -find_idesc(unsigned *maskptr, int irq) -{ - intrec *p = intreclist_head[irq]; - - while (p && p->maskptr != maskptr) - p = p->next; - - return (p); -} - -static intrec** -find_pred(intrec *idesc, int irq) -{ - intrec **pp = &intreclist_head[irq]; - intrec *p = *pp; - - while (p != idesc) { - if (p == NULL) - return (NULL); - pp = &p->next; - p = *pp; - } - return (pp); -} - -/* - * Both the low level handler and the shared interrupt multiplexer - * block out further interrupts as set in the handlers "mask", while - * the handler is running. In fact *maskptr should be used for this - * purpose, but since this requires one more pointer dereference on - * each interrupt, we rather bother update "mask" whenever *maskptr - * changes. The function "update_masks" should be called **after** - * all manipulation of the linked list of interrupt handlers hung - * off of intrdec_head[irq] is complete, since the chain of handlers - * will both determine the *maskptr values and the instances of mask - * that are fixed. This function should be called with the irq for - * which a new handler has been add blocked, since the masks may not - * yet know about the use of this irq for a device of a certain class. 
- */ - -static void -update_mux_masks(void) -{ - int irq; - for (irq = 0; irq < ICU_LEN; irq++) { - intrec *idesc = intreclist_head[irq]; - while (idesc != NULL) { - if (idesc->maskptr != NULL) { - /* our copy of *maskptr may be stale, refresh */ - idesc->mask = *idesc->maskptr; - } - idesc = idesc->next; - } - } -} - -static void -update_masks(intrmask_t *maskptr, int irq) -{ - intrmask_t mask = 1 << irq; - - if (maskptr == NULL) - return; - - if (find_idesc(maskptr, irq) == NULL) { - /* no reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) == 0) - return; - /* the irq was included in the classes mask, remove it */ - *maskptr &= ~mask; - } else { - /* a reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) != 0) - return; - /* put the irq into the classes mask */ - *maskptr |= mask; - } - /* we need to update all values in the intr_mask[irq] array */ - update_intr_masks(); - /* update mask in chains of the interrupt multiplex handler as well */ - update_mux_masks(); -} - -/* - * Add interrupt handler to linked list hung off of intreclist_head[irq] - * and install shared interrupt multiplex handler, if necessary - */ - -static int -add_intrdesc(intrec *idesc) -{ - int irq = idesc->intr; - - intrec *head = intreclist_head[irq]; - - if (head == NULL) { - /* first handler for this irq, just install it */ - if (icu_setup(irq, idesc->handler, idesc->argument, - idesc->maskptr, idesc->flags) != 0) - return (-1); - - update_intrname(irq, idesc->name); - /* keep reference */ - intreclist_head[irq] = idesc; - } else { - if ((idesc->flags & INTR_EXCL) != 0 - || (head->flags & INTR_EXCL) != 0) { - /* - * can't append new handler, if either list head or - * new handler do not allow interrupts to be shared - */ - if (bootverbose) - printf("\tdevice combination doesn't support " - "shared irq%d\n", irq); - return (-1); - } - if (head->next == NULL) { - /* - * second handler for this irq, replace device driver's - * handler by shared interrupt multiplexer function - */ - icu_unset(irq, head->handler); - if (icu_setup(irq, intr_mux, head, 0, 0) != 0) - return (-1); - if (bootverbose) - printf("\tusing shared irq%d.\n", irq); - update_intrname(irq, "mux"); - } - /* just append to the end of the chain */ - while (head->next != NULL) - head = head->next; - head->next = idesc; - } - update_masks(idesc->maskptr, irq); - return (0); -} - -/* - * Create and activate an interrupt handler descriptor data structure. - * - * The dev_instance pointer is required for resource management, and will - * only be passed through to resource_claim(). - * - * There will be functions that derive a driver and unit name from a - * dev_instance variable, and those functions will be used to maintain the - * interrupt counter label array referenced by systat and vmstat to report - * device interrupt rates (->update_intrlabels). - * - * Add the interrupt handler descriptor data structure created by an - * earlier call of create_intr() to the linked list for its irq and - * adjust the interrupt masks if necessary. - * - * WARNING: This is an internal function and not to be used by device - * drivers. It is subject to change without notice. 
- */ - intrec * inthand_add(const char *name, int irq, inthand2_t handler, void *arg, - intrmask_t *maskptr, int flags) + int pri, int flags) { - intrec *idesc; - int errcode = -1; - intrmask_t oldspl; + ithd *ithd = ithds[irq]; /* descriptor for the IRQ */ + intrec *head; /* chain of handlers for IRQ */ + intrec *idesc; /* descriptor for this handler */ + struct proc *p; /* interrupt thread */ + int errcode = 0; - if (ICU_LEN > 8 * sizeof *maskptr) { - printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", - ICU_LEN, 8 * sizeof *maskptr); - return (NULL); - } - if ((unsigned)irq >= ICU_LEN) { - printf("create_intr: requested irq%d too high, limit is %d\n", - irq, ICU_LEN -1); - return (NULL); - } + if (name == NULL) /* no name? */ + panic ("anonymous interrupt"); + if (ithd == NULL || ithd->it_ih == NULL) { + /* first handler for this irq. */ + if (ithd == NULL) { + ithd = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (ithd == NULL) + return (NULL); + bzero(ithd, sizeof(struct ithd)); + ithd->irq = irq; + ithds[irq] = ithd; + } + /* + * If we have a fast interrupt, we need to set the + * handler address directly. Do that below. For a + * slow interrupt, we don't need to know more details, + * so do it here because it's tidier. + */ + if ((flags & INTR_FAST) == 0) { + /* + * Only create a kernel thread if we don't already + * have one. + */ + if (ithd->it_proc == NULL) { + errcode = kthread_create(ithd_loop, NULL, &p, + RFSTOPPED | RFHIGHPID, "irq%d: %s", irq, + name); + if (errcode) + panic("inthand_add: Can't create " + "interrupt thread"); + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_stat = SWAIT; /* we're idle */ - idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + /* Put in linkages. */ + ithd->it_proc = p; + p->p_ithd = ithd; + } else + snprintf(ithd->it_proc->p_comm, MAXCOMLEN, + "irq%d: %s", irq, name); + p->p_rtprio.prio = pri; + + /* + * The interrupt process must be in place, but + * not necessarily schedulable, before we + * initialize the ICU, since it may cause an + * immediate interrupt. + */ + if (icu_setup(irq, &sched_ithd, arg, flags) != 0) + panic("inthand_add: Can't initialize ICU"); + } + } else if ((flags & INTR_EXCL) != 0 + || (ithd->it_ih->flags & INTR_EXCL) != 0) { + /* + * We can't append the new handler if either + * list ithd or new handler do not allow + * interrupts to be shared. + */ + if (bootverbose) + printf("\tdevice combination %s and %s " + "doesn't support shared irq%d\n", + ithd->it_ih->name, name, irq); + return(NULL); + } else if (flags & INTR_FAST) { + /* We can only have one fast interrupt by itself. 
*/ + if (bootverbose) + printf("\tCan't add fast interrupt %s" + " to normal interrupt %s on irq%d", + name, ithd->it_ih->name, irq); + return (NULL); + } else { /* update p_comm */ + p = ithd->it_proc; + if (strlen(p->p_comm) + strlen(name) < MAXCOMLEN) { + strcat(p->p_comm, " "); + strcat(p->p_comm, name); + } else if (strlen(p->p_comm) == MAXCOMLEN) + p->p_comm[MAXCOMLEN - 1] = '+'; + else + strcat(p->p_comm, "+"); + } + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); if (idesc == NULL) - return NULL; - bzero(idesc, sizeof *idesc); + return (NULL); + bzero(idesc, sizeof (struct intrec)); + + idesc->handler = handler; + idesc->argument = arg; + idesc->flags = flags; + idesc->ithd = ithd; - if (name == NULL) - name = "???"; idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); if (idesc->name == NULL) { free(idesc, M_DEVBUF); - return NULL; + return (NULL); } strcpy(idesc->name, name); - idesc->handler = handler; - idesc->argument = arg; - idesc->maskptr = maskptr; - idesc->intr = irq; - idesc->flags = flags; - - /* block this irq */ - oldspl = splq(1 << irq); - - /* add irq to class selected by maskptr */ - errcode = add_intrdesc(idesc); - splx(oldspl); - - if (errcode != 0) { + /* Slow interrupts got set up above. */ + if ((flags & INTR_FAST) + && (icu_setup(irq, idesc->handler, idesc->argument, + idesc->flags) != 0) ) { if (bootverbose) - printf("\tintr_connect(irq%d) failed, result=%d\n", + printf("\tinthand_add(irq%d) failed, result=%d\n", irq, errcode); free(idesc->name, M_DEVBUF); free(idesc, M_DEVBUF); - idesc = NULL; + return NULL; } - + head = ithd->it_ih; /* look at chain of handlers */ + if (head) { + while (head->next != NULL) + head = head->next; /* find the end */ + head->next = idesc; /* hook it in there */ + } else + ithd->it_ih = idesc; /* put it up front */ + update_intrname(irq, idesc->name); return (idesc); } /* - * Deactivate and remove the interrupt handler descriptor data connected - * created by an earlier call of intr_connect() from the linked list and - * adjust theinterrupt masks if necessary. + * Deactivate and remove linked list the interrupt handler descriptor + * data connected created by an earlier call of inthand_add(), then + * adjust the interrupt masks if necessary. * - * Return the memory held by the interrupt handler descriptor data structure - * to the system. Make sure, the handler is not actively used anymore, before. + * Return the memory held by the interrupt handler descriptor data + * structure to the system. First ensure the handler is not actively + * in use. 
*/ int inthand_remove(intrec *idesc) { - intrec **hook, *head; - int irq; - int errcode = 0; - intrmask_t oldspl; + ithd *ithd; /* descriptor for the IRQ */ + intrec *ih; /* chain of handlers */ if (idesc == NULL) return (-1); + ithd = idesc->ithd; + ih = ithd->it_ih; - irq = idesc->intr; - - /* find pointer that keeps the reference to this interrupt descriptor */ - hook = find_pred(idesc, irq); - if (hook == NULL) + if (ih == idesc) /* first in the chain */ + ithd->it_ih = idesc->next; /* unhook it */ + else { + while ((ih != NULL) + && (ih->next != idesc) ) + ih = ih->next; + if (ih->next != idesc) return (-1); - - /* make copy of original list head, the line after may overwrite it */ - head = intreclist_head[irq]; - - /* unlink: make predecessor point to idesc->next instead of to idesc */ - *hook = idesc->next; - - /* now check whether the element we removed was the list head */ - if (idesc == head) { - - oldspl = splq(1 << irq); - - /* check whether the new list head is the only element on list */ - head = intreclist_head[irq]; - if (head != NULL) { - icu_unset(irq, intr_mux); - if (head->next != NULL) { - /* install the multiplex handler with new list head as argument */ - errcode = icu_setup(irq, intr_mux, head, 0, 0); - if (errcode == 0) - update_intrname(irq, NULL); - } else { - /* install the one remaining handler for this irq */ - errcode = icu_setup(irq, head->handler, - head->argument, - head->maskptr, head->flags); - if (errcode == 0) - update_intrname(irq, head->name); + ih->next = ih->next->next; } - } else { - /* revert to old handler, eg: strayintr */ - icu_unset(irq, idesc->handler); - } - splx(oldspl); - } - update_masks(idesc->maskptr, irq); + + if (ithd->it_ih == NULL) /* no handlers left, */ + icu_unset(ithd->irq, idesc->handler); free(idesc, M_DEVBUF); return (0); } diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h index 5982295b1ab4..87c97a35f5ef 100644 --- a/sys/amd64/isa/intr_machdep.h +++ b/sys/amd64/isa/intr_machdep.h @@ -98,7 +98,6 @@ #define TPR_BLOCK_XCPUSTOP 0xaf /* */ #define TPR_BLOCK_ALL 0xff /* all INTs */ - #ifdef TEST_TEST1 /* put a 'fake' HWI in top of APIC prio 0x3x, 32 + 31 = 63 = 0x3f */ #define XTEST1_OFFSET (ICU_OFFSET + 31) @@ -145,8 +144,9 @@ extern u_long intrcnt[]; /* counts for for each device and stray */ extern char intrnames[]; /* string table containing device names */ extern u_long *intr_countp[]; /* pointers into intrcnt[] */ extern inthand2_t *intr_handler[]; /* C entry points of intr handlers */ -extern u_int intr_mask[]; /* sets of intrs masked during handling of 1 */ +extern ithd *ithds[]; extern void *intr_unit[]; /* cookies to pass to intr handlers */ +extern ithd softinterrupt; /* soft interrupt thread */ inthand_t IDTVEC(fastintr0), IDTVEC(fastintr1), @@ -190,26 +190,60 @@ inthand_t #endif /** TEST_TEST1 */ #endif /* SMP || APIC_IO */ +#ifdef PC98 +#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ +#define ICU_SLAVEID 7 +#else +#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ +#define ICU_SLAVEID 2 +#endif + +#ifdef APIC_IO +/* + * This is to accommodate "mixed-mode" programming for + * motherboards that don't connect the 8254 to the IO APIC. 
+ */ +#define AUTO_EOI_1 1 +#endif + +#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) + void isa_defaultirq __P((void)); int isa_nmi __P((int cd)); int icu_setup __P((int intr, inthand2_t *func, void *arg, - u_int *maskptr, int flags)); + int flags)); int icu_unset __P((int intr, inthand2_t *handler)); -int update_intr_masks __P((void)); intrmask_t splq __P((intrmask_t mask)); -#define INTR_FAST 0x00000001 /* fast interrupt handler */ -#define INTR_EXCL 0x00010000 /* excl. intr, default is shared */ +/* + * Describe a hardware interrupt handler. These structures are + * accessed via the array intreclist, which contains one pointer per + * hardware interrupt. + * + * Multiple interrupt handlers for a specific IRQ can be chained + * together via the 'next' pointer. + */ +typedef struct intrec { + inthand2_t *handler; /* code address of handler */ + void *argument; /* argument to pass to handler */ + enum intr_type flags; /* flag bits (sys/bus.h) */ + char *name; /* name of handler */ + ithd *ithd; /* handler we're connected to */ + struct intrec *next; /* next handler for this irq */ +} intrec; /* * WARNING: These are internal functions and not to be used by device drivers! * They are subject to change without notice. */ struct intrec *inthand_add(const char *name, int irq, inthand2_t handler, - void *arg, intrmask_t *maskptr, int flags); - + void *arg, int pri, int flags); int inthand_remove(struct intrec *idesc); +void sched_ithd(void *); +void ithd_loop(void *); +void start_softintr(void *); +void intr_soft(void *); #endif /* LOCORE */ diff --git a/sys/amd64/isa/ithread.c b/sys/amd64/isa/ithread.c new file mode 100644 index 000000000000..4ceac4229d1c --- /dev/null +++ b/sys/amd64/isa/ithread.c @@ -0,0 +1,353 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From BSDI: intr.c,v 1.6.2.5 1999/07/06 19:16:52 cp Exp + * $FreeBSD$ + */ + +/* Interrupt thread code. 
*/ + +#include "opt_auto_eoi.h" + +#include "isa.h" + +#include +#include /* change this name XXX */ +#ifndef SMP +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(APIC_IO) +#include +#include /** FAST_HI */ +#include +#endif /* APIC_IO */ +#ifdef PC98 +#include +#include +#include +#else +#include +#endif +#include + +#if NISA > 0 +#include +#endif +#include +#include +#ifdef APIC_IO +#include +#endif + +#include "mca.h" +#if NMCA > 0 +#include +#endif + +#include +#include +#include +#include +#if 0 +#include +#endif + +u_long softintrcnt [NSWI]; + +SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) + +/* + * Schedule a heavyweight interrupt process. This function is called + * from the interrupt handlers Xintr. + */ +void +sched_ithd(void *cookie) +{ + int irq = (int) cookie; /* IRQ we're handling */ + ithd *ir = ithds[irq]; /* and the process that does it */ + + /* This used to be in icu_vector.s */ + /* + * We count software interrupts when we process them. The + * code here follows previous practice, but there's an + * argument for counting hardware interrupts when they're + * processed too. + */ + if (irq < NHWI) /* real interrupt, */ + atomic_add_long(intr_countp[irq], 1); /* one more for this IRQ */ + atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ + + CTR3(KTR_INTR, "sched_ithd pid %d(%s) need=%d", + ir->it_proc->p_pid, ir->it_proc->p_comm, ir->it_need); + +#if 0 + /* + * If we are in the debugger, we can't use interrupt threads to + * process interrupts since the threads are scheduled. Instead, + * call the interrupt handlers directly. This should be able to + * go away once we have light-weight interrupt handlers. + */ + if (db_active) { + intrec *ih; /* and our interrupt handler chain */ +#if 0 + membar_unlock(); /* push out "it_need=0" */ +#endif + for (ih = ir->it_ih; ih != NULL; ih = ih->next) { + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_enter(&Giant, MTX_DEF); + ih->handler(ih->argument); + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_exit(&Giant, MTX_DEF); + } + + INTREN (1 << ir->irq); /* reset the mask bit */ + return; + } +#endif + + /* + * Set it_need so that if the thread is already running but close + * to done, it will do another go-round. Then get the sched lock + * and see if the thread is on whichkqs yet. If not, put it on + * there. In any case, kick everyone so that if the new thread + * is higher priority than their current thread, it gets run now. + */ + ir->it_need = 1; + mtx_enter(&sched_lock, MTX_SPIN); + if (ir->it_proc->p_stat == SWAIT) { /* not on run queue */ + CTR1(KTR_INTR, "sched_ithd: setrunqueue %d", + ir->it_proc->p_pid); +/* membar_lock(); */ + ir->it_proc->p_stat = SRUN; + setrunqueue(ir->it_proc); + aston(); + } + else { +if (irq < NHWI && (irq & 7) != 0) + CTR3(KTR_INTR, "sched_ithd %d: it_need %d, state %d", + ir->it_proc->p_pid, + ir->it_need, + ir->it_proc->p_stat ); + } + mtx_exit(&sched_lock, MTX_SPIN); +#if 0 + aston(); /* ??? check priorities first? */ +#else + need_resched(); +#endif +} + +/* + * This is the main code for all interrupt threads. It gets put on + * whichkqs by setrunqueue above. 
+ */ +void +ithd_loop(void *dummy) +{ + ithd *me; /* our thread context */ + intrec *ih; /* and our interrupt handler chain */ + + me = curproc->p_ithd; /* point to myself */ + + /* + * As long as we have interrupts outstanding, go through the + * list of handlers, giving each one a go at it. + */ + for (;;) { + CTR3(KTR_INTR, "ithd_loop pid %d(%s) need=%d", + me->it_proc->p_pid, me->it_proc->p_comm, me->it_need); + while (me->it_need) { + /* + * Service interrupts. If another interrupt + * arrives while we are running, they will set + * it_need to denote that we should make + * another pass. + */ + me->it_need = 0; +#if 0 + membar_unlock(); /* push out "it_need=0" */ +#endif + for (ih = me->it_ih; ih != NULL; ih = ih->next) { + CTR5(KTR_INTR, + "ithd_loop pid %d ih=%p: %p(%p) flg=%x", + me->it_proc->p_pid, (void *)ih, + (void *)ih->handler, ih->argument, + ih->flags); + + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_enter(&Giant, MTX_DEF); + ih->handler(ih->argument); + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_exit(&Giant, MTX_DEF); + } + } + + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_enter(&sched_lock, MTX_SPIN); + if (!me->it_need) { + + INTREN (1 << me->irq); /* reset the mask bit */ + me->it_proc->p_stat = SWAIT; /* we're idle */ +#ifdef APIC_IO + CTR1(KTR_INTR, "ithd_loop pid %d: done", + me->it_proc->p_pid); +#else + CTR2(KTR_INTR, "ithd_loop pid %d: done, imen=%x", + me->it_proc->p_pid, imen); +#endif + mi_switch(); + CTR1(KTR_INTR, "ithd_loop pid %d: resumed", + me->it_proc->p_pid); + } + mtx_exit(&sched_lock, MTX_SPIN); + } +} + +/* + * Start soft interrupt thread. + */ +void +start_softintr(void *dummy) +{ + int error; + struct proc *p; + ithd *softintr; /* descriptor for the "IRQ" */ + intrec *idesc; /* descriptor for this handler */ + char *name = "sintr"; /* name for idesc */ + int i; + + if (ithds[SOFTINTR]) { /* we already have a thread */ + printf("start_softintr: already running"); + return; + } + /* first handler for this irq. */ + softintr = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (softintr == NULL) + panic ("Can't create soft interrupt thread"); + bzero(softintr, sizeof(struct ithd)); + softintr->irq = SOFTINTR; + ithds[SOFTINTR] = softintr; + error = kthread_create(intr_soft, NULL, &p, + RFSTOPPED | RFHIGHPID, "softinterrupt"); + if (error) + panic("start_softintr: kthread_create error %d\n", error); + + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_rtprio.prio = PI_SOFT; /* soft interrupt */ + p->p_stat = SWAIT; /* we're idle */ + + /* Put in linkages. */ + softintr->it_proc = p; + p->p_ithd = softintr; /* reverse link */ + + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); + if (idesc == NULL) + panic ("Can't create soft interrupt thread"); + bzero(idesc, sizeof (struct intrec)); + + idesc->ithd = softintr; + idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); + if (idesc->name == NULL) + panic ("Can't create soft interrupt thread"); + strcpy(idesc->name, name); + for (i = NHWI; i < NHWI + NSWI; i++) + intr_countp[i] = &softintrcnt [i - NHWI]; +} + +/* + * Software interrupt process code. + */ +void +intr_soft(void *dummy) +{ + int i; + ithd *me; /* our thread context */ + + me = curproc->p_ithd; /* point to myself */ + + /* Main loop */ + for (;;) { +#if 0 + CTR3(KTR_INTR, "intr_soft pid %d(%s) need=%d", + me->it_proc->p_pid, me->it_proc->p_comm, + me->it_need); +#endif + + /* + * Service interrupts. 
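sched_ithd() and ithd_loop() above cooperate through it_need and sched_lock. Reduced to its synchronization skeleton (this is a restatement of code already shown above, not additional code), the handshake is:

	/* Producer, in interrupt context (sched_ithd): */
	ir->it_need = 1;			/* request (another) pass */
	mtx_enter(&sched_lock, MTX_SPIN);
	if (ir->it_proc->p_stat == SWAIT) {	/* thread is idle */
		ir->it_proc->p_stat = SRUN;
		setrunqueue(ir->it_proc);
	}
	mtx_exit(&sched_lock, MTX_SPIN);

	/* Consumer, in the interrupt thread (ithd_loop): */
	while (me->it_need) {
		me->it_need = 0;	/* may be set again while we run */
		/* ... walk the handler chain ... */
	}
	mtx_enter(&sched_lock, MTX_SPIN);
	if (!me->it_need) {		/* re-check under the lock */
		me->it_proc->p_stat = SWAIT;
		mi_switch();		/* sleep until scheduled again */
	}
	mtx_exit(&sched_lock, MTX_SPIN);

The re-check of it_need while holding sched_lock closes the window between the last handler pass and going back to SWAIT: an interrupt arriving in that window either sets it_need before the check and forces another iteration, or finds the thread in SWAIT and requeues it.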
If another interrupt arrives + * while we are running, they will set it_need to + * denote that we should make another pass. + */ + me->it_need = 0; + while ((i = ffs(spending))) { + i--; + atomic_add_long(intr_countp[i], 1); + spending &= ~ (1 << i); + mtx_enter(&Giant, MTX_DEF); + (ihandlers[i])(); + mtx_exit(&Giant, MTX_DEF); + } + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_enter(&sched_lock, MTX_SPIN); + if (!me->it_need) { +#if 0 + CTR1(KTR_INTR, "intr_soft pid %d: done", + me->it_proc->p_pid); +#endif + me->it_proc->p_stat = SWAIT; /* we're idle */ + mi_switch(); +#if 0 + CTR1(KTR_INTR, "intr_soft pid %d: resumed", + me->it_proc->p_pid); +#endif + } + mtx_exit(&sched_lock, MTX_SPIN); + } +} diff --git a/sys/amd64/isa/nmi.c b/sys/amd64/isa/nmi.c index 34a8c229bd6b..870760e1ce01 100644 --- a/sys/amd64/isa/nmi.c +++ b/sys/amd64/isa/nmi.c @@ -36,12 +36,6 @@ * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * $FreeBSD$ */ -/* - * This file contains an aggregated module marked: - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * See the notice for details. - */ #include "opt_auto_eoi.h" @@ -51,11 +45,14 @@ #ifndef SMP #include #endif +#include #include #include #include +#include #include #include +#include #include #include #include @@ -91,30 +88,14 @@ #include #endif -/* XXX should be in suitable include files */ -#ifdef PC98 -#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ -#define ICU_SLAVEID 7 -#else -#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ -#define ICU_SLAVEID 2 -#endif - -#ifdef APIC_IO /* - * This is to accommodate "mixed-mode" programming for - * motherboards that don't connect the 8254 to the IO APIC. + * Per-interrupt data. We consider the soft interrupt to be a special + * case, so these arrays have NHWI + NSWI entries, not ICU_LEN. */ -#define AUTO_EOI_1 1 -#endif - -#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) - -u_long *intr_countp[ICU_LEN]; -inthand2_t *intr_handler[ICU_LEN]; -u_int intr_mask[ICU_LEN]; -static u_int* intr_mptr[ICU_LEN]; -void *intr_unit[ICU_LEN]; +u_long *intr_countp[NHWI + NSWI]; /* pointers to interrupt counters */ +inthand2_t *intr_handler[NHWI + NSWI]; /* first level interrupt handler */ +ithd *ithds[NHWI + NSWI]; /* real interrupt handler */ +void *intr_unit[NHWI + NSWI]; static inthand_t *fastintr[ICU_LEN] = { &IDTVEC(fastintr0), &IDTVEC(fastintr1), @@ -292,8 +273,9 @@ isa_nmi(cd) } /* - * Fill in default interrupt table (in case of spuruious interrupt - * during configuration of kernel, setup interrupt control unit + * Create a default interrupt table to avoid problems caused by + * spurious interrupts during configuration of kernel, then setup + * interrupt control unit. */ void isa_defaultirq() @@ -364,16 +346,6 @@ isa_strayintr(vcookiep) { int intr = (void **)vcookiep - &intr_unit[0]; - /* DON'T BOTHER FOR NOW! */ - /* for some reason, we get bursts of intr #7, even if not enabled! */ - /* - * Well the reason you got bursts of intr #7 is because someone - * raised an interrupt line and dropped it before the 8259 could - * prioritize it. This is documented in the intel data book. This - * means you have BAD hardware! I have changed this so that only - * the first 5 get logged, then it quits logging them, and puts - * out a special message. rgrimes 3/25/1993 - */ /* * XXX TODO print a different message for #7 if it is for a * glitch. 
Glitches can be distinguished from real #7's by @@ -405,36 +377,10 @@ isa_irq_pending() } #endif -int -update_intr_masks(void) -{ - int intr, n=0; - u_int mask,*maskptr; - - for (intr=0; intr < ICU_LEN; intr ++) { -#if defined(APIC_IO) - /* no 8259 SLAVE to ignore */ -#else - if (intr==ICU_SLAVEID) continue; /* ignore 8259 SLAVE output */ -#endif /* APIC_IO */ - maskptr = intr_mptr[intr]; - if (!maskptr) - continue; - *maskptr |= SWI_LOW_MASK | (1 << intr); - mask = *maskptr; - if (mask != intr_mask[intr]) { -#if 0 - printf ("intr_mask[%2d] old=%08x new=%08x ptr=%p.\n", - intr, intr_mask[intr], mask, maskptr); -#endif - intr_mask[intr]=mask; - n++; - } - - } - return (n); -} - +/* + * Update intrnames array with the specified name. This is used by + * vmstat(8) and the like. + */ static void update_intrname(int intr, char *name) { @@ -485,7 +431,7 @@ update_intrname(int intr, char *name) } int -icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) +icu_setup(int intr, inthand2_t *handler, void *arg, int flags) { #ifdef FAST_HI int select; /* the select register is 8 bits */ @@ -493,7 +439,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) u_int32_t value; /* the window register is 32 bits */ #endif /* FAST_HI */ u_long ef; - u_int mask = (maskptr ? *maskptr : 0); #if defined(APIC_IO) if ((u_int)intr >= ICU_LEN) /* no 8259 SLAVE to ignore */ @@ -506,8 +451,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) ef = read_eflags(); disable_intr(); intr_handler[intr] = handler; - intr_mptr[intr] = maskptr; - intr_mask[intr] = mask | SWI_LOW_MASK | (1 << intr); intr_unit[intr] = arg; #ifdef FAST_HI if (flags & INTR_FAST) { @@ -547,11 +490,15 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ INTREN(1 << intr); - MPINTR_UNLOCK(); write_eflags(ef); return (0); } +/* + * Dissociate an interrupt handler from an IRQ and set the handler to + * the stray interrupt handler. The 'handler' parameter is used only + * for consistency checking. + */ int icu_unset(intr, handler) int intr; @@ -567,8 +514,6 @@ icu_unset(intr, handler) disable_intr(); intr_countp[intr] = &intrcnt[1 + intr]; intr_handler[intr] = isa_strayintr; - intr_mptr[intr] = NULL; - intr_mask[intr] = HWI_MASK | SWI_MASK; intr_unit[intr] = &intr_unit[intr]; #ifdef FAST_HI_XXX /* XXX how do I re-create dvp here? */ @@ -581,353 +526,172 @@ icu_unset(intr, handler) setidt(ICU_OFFSET + intr, slowintr[intr], SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ - MPINTR_UNLOCK(); write_eflags(ef); return (0); } -/* The following notice applies beyond this point in the file */ - -/* - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -typedef struct intrec { - intrmask_t mask; - inthand2_t *handler; - void *argument; - struct intrec *next; - char *name; - int intr; - intrmask_t *maskptr; - int flags; -} intrec; - -static intrec *intreclist_head[ICU_LEN]; - -/* - * The interrupt multiplexer calls each of the handlers in turn. The - * ipl is initially quite low. It is raised as necessary for each call - * and lowered after the call. Thus out of order handling is possible - * even for interrupts of the same type. This is probably no more - * harmful than out of order handling in general (not harmful except - * for real time response which we don't support anyway). - */ -static void -intr_mux(void *arg) -{ - intrec *p; - intrmask_t oldspl; - - for (p = arg; p != NULL; p = p->next) { - oldspl = splq(p->mask); - p->handler(p->argument); - splx(oldspl); - } -} - -static intrec* -find_idesc(unsigned *maskptr, int irq) -{ - intrec *p = intreclist_head[irq]; - - while (p && p->maskptr != maskptr) - p = p->next; - - return (p); -} - -static intrec** -find_pred(intrec *idesc, int irq) -{ - intrec **pp = &intreclist_head[irq]; - intrec *p = *pp; - - while (p != idesc) { - if (p == NULL) - return (NULL); - pp = &p->next; - p = *pp; - } - return (pp); -} - -/* - * Both the low level handler and the shared interrupt multiplexer - * block out further interrupts as set in the handlers "mask", while - * the handler is running. In fact *maskptr should be used for this - * purpose, but since this requires one more pointer dereference on - * each interrupt, we rather bother update "mask" whenever *maskptr - * changes. The function "update_masks" should be called **after** - * all manipulation of the linked list of interrupt handlers hung - * off of intrdec_head[irq] is complete, since the chain of handlers - * will both determine the *maskptr values and the instances of mask - * that are fixed. This function should be called with the irq for - * which a new handler has been add blocked, since the masks may not - * yet know about the use of this irq for a device of a certain class. 
- */ - -static void -update_mux_masks(void) -{ - int irq; - for (irq = 0; irq < ICU_LEN; irq++) { - intrec *idesc = intreclist_head[irq]; - while (idesc != NULL) { - if (idesc->maskptr != NULL) { - /* our copy of *maskptr may be stale, refresh */ - idesc->mask = *idesc->maskptr; - } - idesc = idesc->next; - } - } -} - -static void -update_masks(intrmask_t *maskptr, int irq) -{ - intrmask_t mask = 1 << irq; - - if (maskptr == NULL) - return; - - if (find_idesc(maskptr, irq) == NULL) { - /* no reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) == 0) - return; - /* the irq was included in the classes mask, remove it */ - *maskptr &= ~mask; - } else { - /* a reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) != 0) - return; - /* put the irq into the classes mask */ - *maskptr |= mask; - } - /* we need to update all values in the intr_mask[irq] array */ - update_intr_masks(); - /* update mask in chains of the interrupt multiplex handler as well */ - update_mux_masks(); -} - -/* - * Add interrupt handler to linked list hung off of intreclist_head[irq] - * and install shared interrupt multiplex handler, if necessary - */ - -static int -add_intrdesc(intrec *idesc) -{ - int irq = idesc->intr; - - intrec *head = intreclist_head[irq]; - - if (head == NULL) { - /* first handler for this irq, just install it */ - if (icu_setup(irq, idesc->handler, idesc->argument, - idesc->maskptr, idesc->flags) != 0) - return (-1); - - update_intrname(irq, idesc->name); - /* keep reference */ - intreclist_head[irq] = idesc; - } else { - if ((idesc->flags & INTR_EXCL) != 0 - || (head->flags & INTR_EXCL) != 0) { - /* - * can't append new handler, if either list head or - * new handler do not allow interrupts to be shared - */ - if (bootverbose) - printf("\tdevice combination doesn't support " - "shared irq%d\n", irq); - return (-1); - } - if (head->next == NULL) { - /* - * second handler for this irq, replace device driver's - * handler by shared interrupt multiplexer function - */ - icu_unset(irq, head->handler); - if (icu_setup(irq, intr_mux, head, 0, 0) != 0) - return (-1); - if (bootverbose) - printf("\tusing shared irq%d.\n", irq); - update_intrname(irq, "mux"); - } - /* just append to the end of the chain */ - while (head->next != NULL) - head = head->next; - head->next = idesc; - } - update_masks(idesc->maskptr, irq); - return (0); -} - -/* - * Create and activate an interrupt handler descriptor data structure. - * - * The dev_instance pointer is required for resource management, and will - * only be passed through to resource_claim(). - * - * There will be functions that derive a driver and unit name from a - * dev_instance variable, and those functions will be used to maintain the - * interrupt counter label array referenced by systat and vmstat to report - * device interrupt rates (->update_intrlabels). - * - * Add the interrupt handler descriptor data structure created by an - * earlier call of create_intr() to the linked list for its irq and - * adjust the interrupt masks if necessary. - * - * WARNING: This is an internal function and not to be used by device - * drivers. It is subject to change without notice. 
- */ - intrec * inthand_add(const char *name, int irq, inthand2_t handler, void *arg, - intrmask_t *maskptr, int flags) + int pri, int flags) { - intrec *idesc; - int errcode = -1; - intrmask_t oldspl; + ithd *ithd = ithds[irq]; /* descriptor for the IRQ */ + intrec *head; /* chain of handlers for IRQ */ + intrec *idesc; /* descriptor for this handler */ + struct proc *p; /* interrupt thread */ + int errcode = 0; - if (ICU_LEN > 8 * sizeof *maskptr) { - printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", - ICU_LEN, 8 * sizeof *maskptr); - return (NULL); - } - if ((unsigned)irq >= ICU_LEN) { - printf("create_intr: requested irq%d too high, limit is %d\n", - irq, ICU_LEN -1); - return (NULL); - } + if (name == NULL) /* no name? */ + panic ("anonymous interrupt"); + if (ithd == NULL || ithd->it_ih == NULL) { + /* first handler for this irq. */ + if (ithd == NULL) { + ithd = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (ithd == NULL) + return (NULL); + bzero(ithd, sizeof(struct ithd)); + ithd->irq = irq; + ithds[irq] = ithd; + } + /* + * If we have a fast interrupt, we need to set the + * handler address directly. Do that below. For a + * slow interrupt, we don't need to know more details, + * so do it here because it's tidier. + */ + if ((flags & INTR_FAST) == 0) { + /* + * Only create a kernel thread if we don't already + * have one. + */ + if (ithd->it_proc == NULL) { + errcode = kthread_create(ithd_loop, NULL, &p, + RFSTOPPED | RFHIGHPID, "irq%d: %s", irq, + name); + if (errcode) + panic("inthand_add: Can't create " + "interrupt thread"); + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_stat = SWAIT; /* we're idle */ - idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + /* Put in linkages. */ + ithd->it_proc = p; + p->p_ithd = ithd; + } else + snprintf(ithd->it_proc->p_comm, MAXCOMLEN, + "irq%d: %s", irq, name); + p->p_rtprio.prio = pri; + + /* + * The interrupt process must be in place, but + * not necessarily schedulable, before we + * initialize the ICU, since it may cause an + * immediate interrupt. + */ + if (icu_setup(irq, &sched_ithd, arg, flags) != 0) + panic("inthand_add: Can't initialize ICU"); + } + } else if ((flags & INTR_EXCL) != 0 + || (ithd->it_ih->flags & INTR_EXCL) != 0) { + /* + * We can't append the new handler if either + * list ithd or new handler do not allow + * interrupts to be shared. + */ + if (bootverbose) + printf("\tdevice combination %s and %s " + "doesn't support shared irq%d\n", + ithd->it_ih->name, name, irq); + return(NULL); + } else if (flags & INTR_FAST) { + /* We can only have one fast interrupt by itself. 
*/ + if (bootverbose) + printf("\tCan't add fast interrupt %s" + " to normal interrupt %s on irq%d", + name, ithd->it_ih->name, irq); + return (NULL); + } else { /* update p_comm */ + p = ithd->it_proc; + if (strlen(p->p_comm) + strlen(name) < MAXCOMLEN) { + strcat(p->p_comm, " "); + strcat(p->p_comm, name); + } else if (strlen(p->p_comm) == MAXCOMLEN) + p->p_comm[MAXCOMLEN - 1] = '+'; + else + strcat(p->p_comm, "+"); + } + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); if (idesc == NULL) - return NULL; - bzero(idesc, sizeof *idesc); + return (NULL); + bzero(idesc, sizeof (struct intrec)); + + idesc->handler = handler; + idesc->argument = arg; + idesc->flags = flags; + idesc->ithd = ithd; - if (name == NULL) - name = "???"; idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); if (idesc->name == NULL) { free(idesc, M_DEVBUF); - return NULL; + return (NULL); } strcpy(idesc->name, name); - idesc->handler = handler; - idesc->argument = arg; - idesc->maskptr = maskptr; - idesc->intr = irq; - idesc->flags = flags; - - /* block this irq */ - oldspl = splq(1 << irq); - - /* add irq to class selected by maskptr */ - errcode = add_intrdesc(idesc); - splx(oldspl); - - if (errcode != 0) { + /* Slow interrupts got set up above. */ + if ((flags & INTR_FAST) + && (icu_setup(irq, idesc->handler, idesc->argument, + idesc->flags) != 0) ) { if (bootverbose) - printf("\tintr_connect(irq%d) failed, result=%d\n", + printf("\tinthand_add(irq%d) failed, result=%d\n", irq, errcode); free(idesc->name, M_DEVBUF); free(idesc, M_DEVBUF); - idesc = NULL; + return NULL; } - + head = ithd->it_ih; /* look at chain of handlers */ + if (head) { + while (head->next != NULL) + head = head->next; /* find the end */ + head->next = idesc; /* hook it in there */ + } else + ithd->it_ih = idesc; /* put it up front */ + update_intrname(irq, idesc->name); return (idesc); } /* - * Deactivate and remove the interrupt handler descriptor data connected - * created by an earlier call of intr_connect() from the linked list and - * adjust theinterrupt masks if necessary. + * Deactivate and remove linked list the interrupt handler descriptor + * data connected created by an earlier call of inthand_add(), then + * adjust the interrupt masks if necessary. * - * Return the memory held by the interrupt handler descriptor data structure - * to the system. Make sure, the handler is not actively used anymore, before. + * Return the memory held by the interrupt handler descriptor data + * structure to the system. First ensure the handler is not actively + * in use. 
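Condensed, the rewritten inthand_add() above makes three decisions before hooking the new intrec onto the chain. The outline below only paraphrases the function shown above and introduces nothing new:

	if (ithd == NULL || ithd->it_ih == NULL) {
		/* First handler for this IRQ: allocate the ithd and, for a
		 * non-fast handler, create the "irq%d: %s" kernel thread and
		 * point the ICU at sched_ithd() via icu_setup(). */
	} else if ((flags & INTR_EXCL) || (ithd->it_ih->flags & INTR_EXCL)) {
		return (NULL);	/* one side refuses to share the IRQ */
	} else if (flags & INTR_FAST) {
		return (NULL);	/* a fast handler cannot join a chain */
	} else {
		/* Shared slow handler: append this name to p_comm. */
	}
	/* Then allocate and fill in the intrec, append it to ithd->it_ih,
	 * and for INTR_FAST handlers point the ICU directly at the handler
	 * rather than at sched_ithd(). */

One consequence worth noting: no kernel thread is created for an IRQ that only has an INTR_FAST handler; only the slow path sets up the "irq%d" kthread and routes the ICU through sched_ithd().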
*/ int inthand_remove(intrec *idesc) { - intrec **hook, *head; - int irq; - int errcode = 0; - intrmask_t oldspl; + ithd *ithd; /* descriptor for the IRQ */ + intrec *ih; /* chain of handlers */ if (idesc == NULL) return (-1); + ithd = idesc->ithd; + ih = ithd->it_ih; - irq = idesc->intr; - - /* find pointer that keeps the reference to this interrupt descriptor */ - hook = find_pred(idesc, irq); - if (hook == NULL) + if (ih == idesc) /* first in the chain */ + ithd->it_ih = idesc->next; /* unhook it */ + else { + while ((ih != NULL) + && (ih->next != idesc) ) + ih = ih->next; + if (ih->next != idesc) return (-1); - - /* make copy of original list head, the line after may overwrite it */ - head = intreclist_head[irq]; - - /* unlink: make predecessor point to idesc->next instead of to idesc */ - *hook = idesc->next; - - /* now check whether the element we removed was the list head */ - if (idesc == head) { - - oldspl = splq(1 << irq); - - /* check whether the new list head is the only element on list */ - head = intreclist_head[irq]; - if (head != NULL) { - icu_unset(irq, intr_mux); - if (head->next != NULL) { - /* install the multiplex handler with new list head as argument */ - errcode = icu_setup(irq, intr_mux, head, 0, 0); - if (errcode == 0) - update_intrname(irq, NULL); - } else { - /* install the one remaining handler for this irq */ - errcode = icu_setup(irq, head->handler, - head->argument, - head->maskptr, head->flags); - if (errcode == 0) - update_intrname(irq, head->name); + ih->next = ih->next->next; } - } else { - /* revert to old handler, eg: strayintr */ - icu_unset(irq, idesc->handler); - } - splx(oldspl); - } - update_masks(idesc->maskptr, irq); + + if (ithd->it_ih == NULL) /* no handlers left, */ + icu_unset(ithd->irq, idesc->handler); free(idesc, M_DEVBUF); return (0); } diff --git a/sys/amd64/isa/npx.c b/sys/amd64/isa/npx.c index 637853e25264..8610e35f1f11 100644 --- a/sys/amd64/isa/npx.c +++ b/sys/amd64/isa/npx.c @@ -245,6 +245,12 @@ npx_probe(dev) setidt(16, probetrap, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(npx_intrno, probeintr, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); npx_idt_probeintr = idt[npx_intrno]; + + /* + * XXX This looks highly bogus, but it appears that npc_probe1 + * needs interrupts enabled. Does this make any difference + * here? + */ enable_intr(); result = npx_probe1(dev); disable_intr(); @@ -797,7 +803,7 @@ npxdna() /* * Record new context early in case frstor causes an IRQ13. 
*/ - npxproc = curproc; + PCPU_SET(npxproc, CURPROC); curpcb->pcb_savefpu.sv_ex_sw = 0; /* * The following frstor may cause an IRQ13 when the state being @@ -834,16 +840,18 @@ npxsave(addr) fnsave(addr); /* fnop(); */ start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); #else /* SMP */ + int intrstate; u_char icu1_mask; u_char icu2_mask; u_char old_icu1_mask; u_char old_icu2_mask; struct gate_descriptor save_idt_npxintr; + intrstate = save_intr(); disable_intr(); old_icu1_mask = inb(IO_ICU1 + 1); old_icu2_mask = inb(IO_ICU2 + 1); @@ -851,12 +859,12 @@ npxsave(addr) outb(IO_ICU1 + 1, old_icu1_mask & ~(IRQ_SLAVE | npx0_imask)); outb(IO_ICU2 + 1, old_icu2_mask & ~(npx0_imask >> 8)); idt[npx_intrno] = npx_idt_probeintr; - enable_intr(); + write_eflags(intrstate); stop_emulating(); fnsave(addr); fnop(); start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); disable_intr(); icu1_mask = inb(IO_ICU1 + 1); /* masks may have changed */ icu2_mask = inb(IO_ICU2 + 1); @@ -866,7 +874,7 @@ npxsave(addr) (icu2_mask & ~(npx0_imask >> 8)) | (old_icu2_mask & (npx0_imask >> 8))); idt[npx_intrno] = save_idt_npxintr; - enable_intr(); /* back to usual state */ + restore_intr(intrstate); /* back to previous state */ #endif /* SMP */ } diff --git a/sys/amd64/isa/vector.S b/sys/amd64/isa/vector.S index 5447a90126a0..79f2320e6b8e 100644 --- a/sys/amd64/isa/vector.S +++ b/sys/amd64/isa/vector.S @@ -16,9 +16,10 @@ #include #endif +#define FAST_INTR_HANDLER_USES_ES 1 #ifdef FAST_INTR_HANDLER_USES_ES #define ACTUALLY_PUSHED 1 -#define MAYBE_MOVW_AX_ES movl %ax,%es +#define MAYBE_MOVW_AX_ES movw %ax,%es #define MAYBE_POPL_ES popl %es #define MAYBE_PUSHL_ES pushl %es #else @@ -36,11 +37,6 @@ .data ALIGN_DATA - .globl _intr_nesting_level -_intr_nesting_level: - .byte 0 - .space 3 - /* * Interrupt counters and names for export to vmstat(8) and friends. * @@ -58,7 +54,6 @@ _eintrcnt: _intrnames: .space NR_INTRNAMES * 16 _eintrnames: - .text /* diff --git a/sys/amd64/isa/vector.s b/sys/amd64/isa/vector.s index 5447a90126a0..79f2320e6b8e 100644 --- a/sys/amd64/isa/vector.s +++ b/sys/amd64/isa/vector.s @@ -16,9 +16,10 @@ #include #endif +#define FAST_INTR_HANDLER_USES_ES 1 #ifdef FAST_INTR_HANDLER_USES_ES #define ACTUALLY_PUSHED 1 -#define MAYBE_MOVW_AX_ES movl %ax,%es +#define MAYBE_MOVW_AX_ES movw %ax,%es #define MAYBE_POPL_ES popl %es #define MAYBE_PUSHL_ES pushl %es #else @@ -36,11 +37,6 @@ .data ALIGN_DATA - .globl _intr_nesting_level -_intr_nesting_level: - .byte 0 - .space 3 - /* * Interrupt counters and names for export to vmstat(8) and friends. 
* @@ -58,7 +54,6 @@ _eintrcnt: _intrnames: .space NR_INTRNAMES * 16 _eintrnames: - .text /* diff --git a/sys/conf/files b/sys/conf/files index 7b086d422e72..70ea3a5e9b76 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -433,9 +433,11 @@ kern/kern_event.c standard kern/kern_exec.c standard kern/kern_exit.c standard kern/kern_fork.c standard +kern/kern_idle.c standard kern/kern_intr.c standard kern/kern_jail.c standard kern/kern_kthread.c standard +kern/kern_ktr.c optional ktr kern/kern_ktrace.c standard kern/kern_linker.c standard kern/kern_lock.c standard @@ -443,6 +445,7 @@ kern/kern_lockf.c standard kern/kern_malloc.c standard kern/kern_mib.c standard kern/kern_module.c standard +kern/kern_mutex.c standard kern/kern_ntptime.c standard kern/kern_physio.c standard kern/kern_proc.c standard diff --git a/sys/conf/files.alpha b/sys/conf/files.alpha index bb746e11c945..6e8ba9481977 100644 --- a/sys/conf/files.alpha +++ b/sys/conf/files.alpha @@ -67,6 +67,7 @@ alpha/alpha/perfmon.c optional perfmon profiling-routine alpha/alpha/perfmon.c optional perfmon alpha/alpha/pmap.c standard alpha/alpha/procfs_machdep.c standard +alpha/alpha/mp_machdep.c standard alpha/alpha/prom.c standard alpha/alpha/promcons.c standard alpha/alpha/prom_disp.s standard @@ -75,6 +76,7 @@ alpha/alpha/simplelock.s optional smp alpha/alpha/support.s standard alpha/alpha/swtch.s standard alpha/alpha/sys_machdep.c standard +alpha/alpha/synch_machdep.c standard alpha/alpha/trap.c standard alpha/alpha/userconfig.c optional userconfig alpha/alpha/vm_machdep.c standard diff --git a/sys/conf/files.i386 b/sys/conf/files.i386 index f5fa25a722b6..e9a7acdccce7 100644 --- a/sys/conf/files.i386 +++ b/sys/conf/files.i386 @@ -188,7 +188,6 @@ i386/i386/mp_clock.c optional smp i386/i386/mp_machdep.c optional smp i386/i386/mpapic.c optional smp i386/i386/mpboot.s optional smp -i386/i386/mplock.s optional smp i386/i386/nexus.c standard i386/i386/perfmon.c optional perfmon i386/i386/perfmon.c optional perfmon profiling-routine @@ -198,6 +197,7 @@ i386/i386/simplelock.s optional smp i386/i386/support.s standard i386/i386/swtch.s standard i386/i386/sys_machdep.c standard +i386/i386/synch_machdep.c standard i386/i386/trap.c standard i386/i386/userconfig.c optional userconfig i386/i386/vm86.c standard @@ -242,6 +242,7 @@ i386/isa/if_wi.c optional wi card i386/isa/if_wl.c count wl i386/isa/if_wlp.c optional wlp i386/isa/intr_machdep.c standard +i386/isa/ithread.c standard i386/isa/ipl_funcs.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/isa/isa.c optional isa diff --git a/sys/conf/files.pc98 b/sys/conf/files.pc98 index bcb677330d3b..2e8481e44222 100644 --- a/sys/conf/files.pc98 +++ b/sys/conf/files.pc98 @@ -175,7 +175,6 @@ i386/i386/mp_clock.c optional smp i386/i386/mp_machdep.c optional smp i386/i386/mpapic.c optional smp i386/i386/mpboot.s optional smp -i386/i386/mplock.s optional smp i386/i386/nexus.c standard i386/i386/perfmon.c optional perfmon i386/i386/perfmon.c optional perfmon profiling-routine diff --git a/sys/conf/options b/sys/conf/options index ddd04a365c51..8093240ff6f3 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -454,6 +454,15 @@ PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h TDFX_LINUX opt_tdfx.h +KTR opt_global.h +KTR_MASK opt_global.h +KTR_CPUMASK opt_global.h +KTR_COMPILE opt_global.h +KTR_ENTRIES opt_global.h +KTR_EXTEND opt_global.h +SMP_DEBUG opt_global.h +WITNESS opt_global.h + # options for ACPI support ACPI_DEBUG opt_acpi.h 
AML_DEBUG opt_acpi.h diff --git a/sys/conf/options.alpha b/sys/conf/options.alpha index 8260cb08a45e..7d53c371079b 100644 --- a/sys/conf/options.alpha +++ b/sys/conf/options.alpha @@ -64,3 +64,7 @@ KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBDIO_DEBUG opt_kbd.h + +# Clock options +CLK_USE_I8254_CALIBRATION opt_clock.h +TIMER_FREQ opt_clock.h diff --git a/sys/dev/ata/ata-all.c b/sys/dev/ata/ata-all.c index fc89297794fb..17aff9e6f39c 100644 --- a/sys/dev/ata/ata-all.c +++ b/sys/dev/ata/ata-all.c @@ -63,6 +63,8 @@ #include #ifdef __i386__ #include +#include +#include #include #endif #ifdef __alpha__ diff --git a/sys/dev/cy/cy.c b/sys/dev/cy/cy.c index 52a8cf36892f..5487d8fe6299 100644 --- a/sys/dev/cy/cy.c +++ b/sys/dev/cy/cy.c @@ -94,11 +94,6 @@ #error "The cy device requires the old isa compatibility shims" #endif -#ifdef SMP -#define disable_intr() COM_DISABLE_INTR() -#define enable_intr() COM_ENABLE_INTR() -#endif /* SMP */ - /* * Dictionary so that I can name everything *sio* or *com* to compare with * sio.c. There is also lots of ugly formatting and unnecessary ifdefs to @@ -366,7 +361,7 @@ static struct com_s *p_com_addr[NSIO]; #define com_addr(unit) (p_com_addr[unit]) struct isa_driver siodriver = { - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, sioprobe, sioattach, driver_name @@ -604,11 +599,9 @@ cyattach_common(cy_iobase, cy_align) com->lt_out.c_cflag = com->lt_in.c_cflag = CLOCAL; } if (siosetwater(com, com->it_in.c_ispeed) != 0) { - enable_intr(); free(com, M_DEVBUF); return (0); } - enable_intr(); termioschars(&com->it_in); com->it_in.c_ispeed = com->it_in.c_ospeed = comdefaultrate; com->it_out = com->it_in; @@ -662,6 +655,7 @@ sioopen(dev, flag, mode, p) int s; struct tty *tp; int unit; + int intrsave; mynor = minor(dev); unit = MINOR_TO_UNIT(mynor); @@ -768,14 +762,17 @@ sioopen(dev, flag, mode, p) } } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); (void) inb(com->line_status_port); (void) inb(com->data_port); com->prev_modem_status = com->last_modem_status = inb(com->modem_status_port); outb(iobase + com_ier, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #else /* !0 */ /* * Flush fifos. This requires a full channel reset which @@ -786,13 +783,16 @@ sioopen(dev, flag, mode, p) CD1400_CCR_CMDRESET | CD1400_CCR_CHANRESET); cd1400_channel_cmd(com, com->channel_control); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->prev_modem_status = com->last_modem_status = cd_getreg(com, CD1400_MSVR2); cd_setreg(com, CD1400_SRER, com->intr_enable = CD1400_SRER_MDMCH | CD1400_SRER_RXDATA); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif /* 0 */ /* * Handle initial DCD. 
Callout devices get a fake initial @@ -875,6 +875,7 @@ comhardclose(com) int s; struct tty *tp; int unit; + int intrsave; unit = com->unit; iobase = com->iobase; @@ -888,10 +889,13 @@ comhardclose(com) outb(iobase + com_cfcr, com->cfcr_image &= ~CFCR_SBREAK); #else /* XXX */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->etc = ETC_NONE; cd_setreg(com, CD1400_COR2, com->cor[1] &= ~CD1400_COR2_ETC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); cd1400_channel_cmd(com, CD1400_CCR_CMDRESET | CD1400_CCR_FTF); #endif @@ -899,9 +903,12 @@ comhardclose(com) #if 0 outb(iobase + com_ier, 0); #else + intrsave = save_intr(); disable_intr(); + COM_LOCK(); cd_setreg(com, CD1400_SRER, com->intr_enable = 0); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif tp = com->tp; if ((tp->t_cflag & HUPCL) @@ -991,6 +998,11 @@ siodtrwakeup(chan) wakeup(&com->dtr_wait); } +/* + * This function: + * a) needs to be called with COM_LOCK() held, and + * b) needs to return with COM_LOCK() held. + */ static void sioinput(com) struct com_s *com; @@ -1000,6 +1012,7 @@ sioinput(com) u_char line_status; int recv_data; struct tty *tp; + int intrsave; buf = com->ibuf; tp = com->tp; @@ -1016,7 +1029,15 @@ sioinput(com) * slinput is reasonably fast (usually 40 instructions plus * call overhead). */ + do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); incc = com->iptr - buf; if (tp->t_rawq.c_cc + incc > tp->t_ihiwat @@ -1038,10 +1059,18 @@ sioinput(com) tp->t_lflag &= ~FLUSHO; comstart(tp); } - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } else { do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); line_status = buf[com->ierroff]; recv_data = *buf++; @@ -1057,7 +1086,8 @@ sioinput(com) recv_data |= TTY_PE; } (*linesw[tp->t_line].l_rint)(recv_data, tp); - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } com_events -= (com->iptr - com->ibuf); @@ -1729,6 +1759,7 @@ static void siopoll() { int unit; + int intrsave; #ifdef CyDebug ++cy_timeouts; @@ -1751,7 +1782,9 @@ siopoll() * (actually never opened devices) so that we don't * loop. 
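The cy(4) hunks here and below repeat one mechanical transformation: each bare disable_intr()/enable_intr() pair becomes a save/restore of the caller's interrupt state bracketing the driver's COM_LOCK()/COM_UNLOCK(). Both shapes below are taken from the code above; the second is the deliberately inverted "save-and-enable" form used by sioinput() while it hands data up to the line discipline:

	/* Usual shape: save, disable, lock ... unlock, restore. */
	intrsave = save_intr();
	disable_intr();
	COM_LOCK();
	/* ... touch com->state, com->iptr, chip registers ... */
	COM_UNLOCK();
	restore_intr(intrsave);

	/* sioinput(): drop the lock and enable interrupts around the
	 * call up into the tty layer, then re-take them afterwards. */
	intrsave = save_intr();
	COM_UNLOCK();
	enable_intr();
	/* ... (*linesw[tp->t_line].l_rint)(recv_data, tp); ... */
	restore_intr(intrsave);
	COM_LOCK();

Restoring the saved state instead of calling enable_intr() unconditionally means a caller that entered with interrupts disabled leaves the same way; this is also why siosetwater() no longer returns with interrupts disabled and the matching enable_intr() calls after it are removed in the hunks below.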
*/ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); incc = com->iptr - com->ibuf; com->iptr = com->ibuf; if (com->state & CS_CHECKMSR) { @@ -1759,7 +1792,8 @@ siopoll() com->state &= ~CS_CHECKMSR; } com_events -= incc; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (incc != 0) log(LOG_DEBUG, "sio%d: %d events for device with no tp\n", @@ -1767,29 +1801,39 @@ siopoll() continue; } if (com->iptr != com->ibuf) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); sioinput(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (com->state & CS_CHECKMSR) { u_char delta_modem_status; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); + sioinput(com); delta_modem_status = com->last_modem_status ^ com->prev_modem_status; com->prev_modem_status = com->last_modem_status; com_events -= LOTS_OF_EVENTS; com->state &= ~CS_CHECKMSR; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta_modem_status & MSR_DCD) (*linesw[tp->t_line].l_modem) (tp, com->prev_modem_status & MSR_DCD); } if (com->extra_state & CSE_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->extra_state &= ~CSE_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (!(com->state & CS_BUSY)) { tp->t_state &= ~TS_BUSY; ttwwakeup(com->tp); @@ -1801,10 +1845,13 @@ siopoll() } } if (com->state & CS_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->state &= ~CS_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); (*linesw[tp->t_line].l_start)(tp); } if (com_events == 0) @@ -1833,6 +1880,7 @@ comparam(tp, t) u_char opt; int s; int unit; + int intrsave; /* do historical conversions */ if (t->c_ispeed == 0) @@ -1857,14 +1905,9 @@ comparam(tp, t) else (void)commctl(com, TIOCM_DTR, DMBIS); - /* - * This returns with interrupts disabled so that we can complete - * the speed change atomically. - */ (void) siosetwater(com, t->c_ispeed); /* XXX we don't actually change the speed atomically. */ - enable_intr(); if (idivisor != 0) { cd_setreg(com, CD1400_RBPR, idivisor); @@ -1985,12 +2028,15 @@ comparam(tp, t) if (cflag & CCTS_OFLOW) opt |= CD1400_COR2_CCTS_OFLOW; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (opt != com->cor[1]) { cor_change |= CD1400_CCR_COR2; cd_setreg(com, CD1400_COR2, com->cor[1] = opt); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); /* * set channel option register 3 - @@ -2111,7 +2157,9 @@ comparam(tp, t) * XXX should have done this long ago, but there is too much state * to change all atomically. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->state &= ~CS_TTGO; if (!(tp->t_state & TS_TTSTOP)) @@ -2177,7 +2225,8 @@ comparam(tp, t) | CD1400_SRER_TXMPTY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); splx(s); comstart(tp); if (com->ibufold != NULL) { @@ -2196,6 +2245,7 @@ siosetwater(com, speed) u_char *ibuf; int ibufsize; struct tty *tp; + int intrsave; /* * Make the buffer size large enough to handle a softtty interrupt @@ -2207,7 +2257,6 @@ siosetwater(com, speed) for (ibufsize = 128; ibufsize < cp4ticks;) ibufsize <<= 1; if (ibufsize == com->ibufsize) { - disable_intr(); return (0); } @@ -2217,7 +2266,6 @@ siosetwater(com, speed) */ ibuf = malloc(2 * ibufsize, M_DEVBUF, M_NOWAIT); if (ibuf == NULL) { - disable_intr(); return (ENOMEM); } @@ -2235,7 +2283,9 @@ siosetwater(com, speed) * Read current input buffer, if any. Continue with interrupts * disabled. 
*/ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->iptr != com->ibuf) sioinput(com); @@ -2254,6 +2304,9 @@ siosetwater(com, speed) com->ibufend = ibuf + ibufsize; com->ierroff = ibufsize; com->ihighwater = ibuf + 3 * ibufsize / 4; + + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2267,6 +2320,7 @@ comstart(tp) bool_t started; #endif int unit; + int intrsave; unit = DEV_TO_UNIT(tp->t_dev); com = com_addr(unit); @@ -2277,7 +2331,9 @@ comstart(tp) started = FALSE; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (tp->t_state & TS_TTSTOP) { com->state &= ~CS_TTGO; if (com->intr_enable & CD1400_SRER_TXRDY) @@ -2313,7 +2369,8 @@ comstart(tp) com->mcr_image |= com->mcr_rts); #endif } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { ttwwakeup(tp); splx(s); @@ -2332,7 +2389,9 @@ comstart(tp) sizeof com->obuf1); com->obufs[0].l_next = NULL; com->obufs[0].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2351,7 +2410,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (tp->t_outq.c_cc != 0 && !com->obufs[1].l_queued) { #ifdef CyDebug @@ -2362,7 +2422,9 @@ comstart(tp) sizeof com->obuf2); com->obufs[1].l_next = NULL; com->obufs[1].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2381,7 +2443,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } tp->t_state |= TS_BUSY; } @@ -2390,10 +2453,13 @@ comstart(tp) ++com->start_real; #endif #if 0 + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); /* fake interrupt to start output */ - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif ttwwakeup(tp); splx(s); @@ -2406,10 +2472,13 @@ comstop(tp, rw) { struct com_s *com; bool_t wakeup_etc; + int intrsave; com = com_addr(DEV_TO_UNIT(tp->t_dev)); wakeup_etc = FALSE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (rw & FWRITE) { com->obufs[0].l_queued = FALSE; com->obufs[1].l_queued = FALSE; @@ -2432,7 +2501,8 @@ comstop(tp, rw) com_events -= (com->iptr - com->ibuf); com->iptr = com->ibuf; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (wakeup_etc) wakeup(&com->etc); if (rw & FWRITE && com->etc == ETC_NONE) @@ -2448,6 +2518,7 @@ commctl(com, bits, how) { int mcr; int msr; + int intrsave; if (how == DMGET) { if (com->channel_control & CD1400_CCR_RCVEN) @@ -2485,7 +2556,9 @@ commctl(com, bits, how) mcr |= com->mcr_dtr; if (bits & TIOCM_RTS) mcr |= com->mcr_rts; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); switch (how) { case DMSET: com->mcr_image = mcr; @@ -2503,7 +2576,8 @@ commctl(com, bits, how) cd_setreg(com, CD1400_MSVR2, mcr); break; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2565,9 +2639,14 @@ comwakeup(chan) com = com_addr(unit); if (com != NULL && (com->state >= (CS_BUSY | CS_TTGO) || com->poll)) { + int intrsave; + + intrsave = save_intr(); disable_intr(); + COM_LOCK(); siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } } #endif @@ -2587,11 +2666,15 @@ comwakeup(chan) for (errnum = 0; errnum < CE_NTYPES; ++errnum) { u_int delta; u_long total; + int intrsave; + 
intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta = com->delta_error_counts[errnum]; com->delta_error_counts[errnum] = 0; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta == 0) continue; total = com->error_counts[errnum] += delta; @@ -2743,6 +2826,8 @@ cd_etc(com, etc) struct com_s *com; int etc; { + int intrsave; + /* * We can't change the hardware's ETC state while there are any * characters in the tx fifo, since those characters would be @@ -2754,26 +2839,28 @@ cd_etc(com, etc) * for the tx to become empty so that the command is sure to be * executed soon after we issue it. */ + intrsave = save_intr(); disable_intr(); - if (com->etc == etc) { - enable_intr(); + COM_LOCK(); + if (com->etc == etc) goto wait; - } if ((etc == CD1400_ETC_SENDBREAK && (com->etc == ETC_BREAK_STARTING || com->etc == ETC_BREAK_STARTED)) || (etc == CD1400_ETC_STOPBREAK && (com->etc == ETC_BREAK_ENDING || com->etc == ETC_BREAK_ENDED || com->etc == ETC_NONE))) { - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return; } com->etc = etc; cd_setreg(com, CD1400_SRER, com->intr_enable = (com->intr_enable & ~CD1400_SRER_TXRDY) | CD1400_SRER_TXMPTY); - enable_intr(); wait: + COM_UNLOCK(); + restore_intr(intrsave); while (com->etc == etc && tsleep(&com->etc, TTIPRI | PCATCH, "cyetc", 0) == 0) continue; @@ -2787,7 +2874,7 @@ cd_getreg(com, reg) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; int val; @@ -2795,14 +2882,16 @@ cd_getreg(com, reg) car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); val = cd_inb(iobase, reg, cy_align); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); return (val); } @@ -2815,21 +2904,23 @@ cd_setreg(com, reg, val) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; basecom = com_addr(com->unit & ~(CD1400_NO_OF_CHANNELS - 1)); car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); cd_outb(iobase, reg, cy_align, val); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); } #ifdef CyDebug diff --git a/sys/dev/cy/cy_isa.c b/sys/dev/cy/cy_isa.c index 52a8cf36892f..5487d8fe6299 100644 --- a/sys/dev/cy/cy_isa.c +++ b/sys/dev/cy/cy_isa.c @@ -94,11 +94,6 @@ #error "The cy device requires the old isa compatibility shims" #endif -#ifdef SMP -#define disable_intr() COM_DISABLE_INTR() -#define enable_intr() COM_ENABLE_INTR() -#endif /* SMP */ - /* * Dictionary so that I can name everything *sio* or *com* to compare with * sio.c. 
There is also lots of ugly formatting and unnecessary ifdefs to @@ -366,7 +361,7 @@ static struct com_s *p_com_addr[NSIO]; #define com_addr(unit) (p_com_addr[unit]) struct isa_driver siodriver = { - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, sioprobe, sioattach, driver_name @@ -604,11 +599,9 @@ cyattach_common(cy_iobase, cy_align) com->lt_out.c_cflag = com->lt_in.c_cflag = CLOCAL; } if (siosetwater(com, com->it_in.c_ispeed) != 0) { - enable_intr(); free(com, M_DEVBUF); return (0); } - enable_intr(); termioschars(&com->it_in); com->it_in.c_ispeed = com->it_in.c_ospeed = comdefaultrate; com->it_out = com->it_in; @@ -662,6 +655,7 @@ sioopen(dev, flag, mode, p) int s; struct tty *tp; int unit; + int intrsave; mynor = minor(dev); unit = MINOR_TO_UNIT(mynor); @@ -768,14 +762,17 @@ sioopen(dev, flag, mode, p) } } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); (void) inb(com->line_status_port); (void) inb(com->data_port); com->prev_modem_status = com->last_modem_status = inb(com->modem_status_port); outb(iobase + com_ier, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #else /* !0 */ /* * Flush fifos. This requires a full channel reset which @@ -786,13 +783,16 @@ sioopen(dev, flag, mode, p) CD1400_CCR_CMDRESET | CD1400_CCR_CHANRESET); cd1400_channel_cmd(com, com->channel_control); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->prev_modem_status = com->last_modem_status = cd_getreg(com, CD1400_MSVR2); cd_setreg(com, CD1400_SRER, com->intr_enable = CD1400_SRER_MDMCH | CD1400_SRER_RXDATA); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif /* 0 */ /* * Handle initial DCD. Callout devices get a fake initial @@ -875,6 +875,7 @@ comhardclose(com) int s; struct tty *tp; int unit; + int intrsave; unit = com->unit; iobase = com->iobase; @@ -888,10 +889,13 @@ comhardclose(com) outb(iobase + com_cfcr, com->cfcr_image &= ~CFCR_SBREAK); #else /* XXX */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->etc = ETC_NONE; cd_setreg(com, CD1400_COR2, com->cor[1] &= ~CD1400_COR2_ETC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); cd1400_channel_cmd(com, CD1400_CCR_CMDRESET | CD1400_CCR_FTF); #endif @@ -899,9 +903,12 @@ comhardclose(com) #if 0 outb(iobase + com_ier, 0); #else + intrsave = save_intr(); disable_intr(); + COM_LOCK(); cd_setreg(com, CD1400_SRER, com->intr_enable = 0); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif tp = com->tp; if ((tp->t_cflag & HUPCL) @@ -991,6 +998,11 @@ siodtrwakeup(chan) wakeup(&com->dtr_wait); } +/* + * This function: + * a) needs to be called with COM_LOCK() held, and + * b) needs to return with COM_LOCK() held. + */ static void sioinput(com) struct com_s *com; @@ -1000,6 +1012,7 @@ sioinput(com) u_char line_status; int recv_data; struct tty *tp; + int intrsave; buf = com->ibuf; tp = com->tp; @@ -1016,7 +1029,15 @@ sioinput(com) * slinput is reasonably fast (usually 40 instructions plus * call overhead). */ + do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. 
+ */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); incc = com->iptr - buf; if (tp->t_rawq.c_cc + incc > tp->t_ihiwat @@ -1038,10 +1059,18 @@ sioinput(com) tp->t_lflag &= ~FLUSHO; comstart(tp); } - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } else { do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); line_status = buf[com->ierroff]; recv_data = *buf++; @@ -1057,7 +1086,8 @@ sioinput(com) recv_data |= TTY_PE; } (*linesw[tp->t_line].l_rint)(recv_data, tp); - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } com_events -= (com->iptr - com->ibuf); @@ -1729,6 +1759,7 @@ static void siopoll() { int unit; + int intrsave; #ifdef CyDebug ++cy_timeouts; @@ -1751,7 +1782,9 @@ siopoll() * (actually never opened devices) so that we don't * loop. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); incc = com->iptr - com->ibuf; com->iptr = com->ibuf; if (com->state & CS_CHECKMSR) { @@ -1759,7 +1792,8 @@ siopoll() com->state &= ~CS_CHECKMSR; } com_events -= incc; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (incc != 0) log(LOG_DEBUG, "sio%d: %d events for device with no tp\n", @@ -1767,29 +1801,39 @@ siopoll() continue; } if (com->iptr != com->ibuf) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); sioinput(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (com->state & CS_CHECKMSR) { u_char delta_modem_status; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); + sioinput(com); delta_modem_status = com->last_modem_status ^ com->prev_modem_status; com->prev_modem_status = com->last_modem_status; com_events -= LOTS_OF_EVENTS; com->state &= ~CS_CHECKMSR; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta_modem_status & MSR_DCD) (*linesw[tp->t_line].l_modem) (tp, com->prev_modem_status & MSR_DCD); } if (com->extra_state & CSE_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->extra_state &= ~CSE_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (!(com->state & CS_BUSY)) { tp->t_state &= ~TS_BUSY; ttwwakeup(com->tp); @@ -1801,10 +1845,13 @@ siopoll() } } if (com->state & CS_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->state &= ~CS_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); (*linesw[tp->t_line].l_start)(tp); } if (com_events == 0) @@ -1833,6 +1880,7 @@ comparam(tp, t) u_char opt; int s; int unit; + int intrsave; /* do historical conversions */ if (t->c_ispeed == 0) @@ -1857,14 +1905,9 @@ comparam(tp, t) else (void)commctl(com, TIOCM_DTR, DMBIS); - /* - * This returns with interrupts disabled so that we can complete - * the speed change atomically. - */ (void) siosetwater(com, t->c_ispeed); /* XXX we don't actually change the speed atomically. 
*/ - enable_intr(); if (idivisor != 0) { cd_setreg(com, CD1400_RBPR, idivisor); @@ -1985,12 +2028,15 @@ comparam(tp, t) if (cflag & CCTS_OFLOW) opt |= CD1400_COR2_CCTS_OFLOW; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (opt != com->cor[1]) { cor_change |= CD1400_CCR_COR2; cd_setreg(com, CD1400_COR2, com->cor[1] = opt); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); /* * set channel option register 3 - @@ -2111,7 +2157,9 @@ comparam(tp, t) * XXX should have done this long ago, but there is too much state * to change all atomically. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->state &= ~CS_TTGO; if (!(tp->t_state & TS_TTSTOP)) @@ -2177,7 +2225,8 @@ comparam(tp, t) | CD1400_SRER_TXMPTY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); splx(s); comstart(tp); if (com->ibufold != NULL) { @@ -2196,6 +2245,7 @@ siosetwater(com, speed) u_char *ibuf; int ibufsize; struct tty *tp; + int intrsave; /* * Make the buffer size large enough to handle a softtty interrupt @@ -2207,7 +2257,6 @@ siosetwater(com, speed) for (ibufsize = 128; ibufsize < cp4ticks;) ibufsize <<= 1; if (ibufsize == com->ibufsize) { - disable_intr(); return (0); } @@ -2217,7 +2266,6 @@ siosetwater(com, speed) */ ibuf = malloc(2 * ibufsize, M_DEVBUF, M_NOWAIT); if (ibuf == NULL) { - disable_intr(); return (ENOMEM); } @@ -2235,7 +2283,9 @@ siosetwater(com, speed) * Read current input buffer, if any. Continue with interrupts * disabled. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->iptr != com->ibuf) sioinput(com); @@ -2254,6 +2304,9 @@ siosetwater(com, speed) com->ibufend = ibuf + ibufsize; com->ierroff = ibufsize; com->ihighwater = ibuf + 3 * ibufsize / 4; + + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2267,6 +2320,7 @@ comstart(tp) bool_t started; #endif int unit; + int intrsave; unit = DEV_TO_UNIT(tp->t_dev); com = com_addr(unit); @@ -2277,7 +2331,9 @@ comstart(tp) started = FALSE; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (tp->t_state & TS_TTSTOP) { com->state &= ~CS_TTGO; if (com->intr_enable & CD1400_SRER_TXRDY) @@ -2313,7 +2369,8 @@ comstart(tp) com->mcr_image |= com->mcr_rts); #endif } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { ttwwakeup(tp); splx(s); @@ -2332,7 +2389,9 @@ comstart(tp) sizeof com->obuf1); com->obufs[0].l_next = NULL; com->obufs[0].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2351,7 +2410,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (tp->t_outq.c_cc != 0 && !com->obufs[1].l_queued) { #ifdef CyDebug @@ -2362,7 +2422,9 @@ comstart(tp) sizeof com->obuf2); com->obufs[1].l_next = NULL; com->obufs[1].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2381,7 +2443,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } tp->t_state |= TS_BUSY; } @@ -2390,10 +2453,13 @@ comstart(tp) ++com->start_real; #endif #if 0 + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); /* fake interrupt to start output */ - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif ttwwakeup(tp); splx(s); @@ 
-2406,10 +2472,13 @@ comstop(tp, rw) { struct com_s *com; bool_t wakeup_etc; + int intrsave; com = com_addr(DEV_TO_UNIT(tp->t_dev)); wakeup_etc = FALSE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (rw & FWRITE) { com->obufs[0].l_queued = FALSE; com->obufs[1].l_queued = FALSE; @@ -2432,7 +2501,8 @@ comstop(tp, rw) com_events -= (com->iptr - com->ibuf); com->iptr = com->ibuf; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (wakeup_etc) wakeup(&com->etc); if (rw & FWRITE && com->etc == ETC_NONE) @@ -2448,6 +2518,7 @@ commctl(com, bits, how) { int mcr; int msr; + int intrsave; if (how == DMGET) { if (com->channel_control & CD1400_CCR_RCVEN) @@ -2485,7 +2556,9 @@ commctl(com, bits, how) mcr |= com->mcr_dtr; if (bits & TIOCM_RTS) mcr |= com->mcr_rts; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); switch (how) { case DMSET: com->mcr_image = mcr; @@ -2503,7 +2576,8 @@ commctl(com, bits, how) cd_setreg(com, CD1400_MSVR2, mcr); break; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2565,9 +2639,14 @@ comwakeup(chan) com = com_addr(unit); if (com != NULL && (com->state >= (CS_BUSY | CS_TTGO) || com->poll)) { + int intrsave; + + intrsave = save_intr(); disable_intr(); + COM_LOCK(); siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } } #endif @@ -2587,11 +2666,15 @@ comwakeup(chan) for (errnum = 0; errnum < CE_NTYPES; ++errnum) { u_int delta; u_long total; + int intrsave; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta = com->delta_error_counts[errnum]; com->delta_error_counts[errnum] = 0; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta == 0) continue; total = com->error_counts[errnum] += delta; @@ -2743,6 +2826,8 @@ cd_etc(com, etc) struct com_s *com; int etc; { + int intrsave; + /* * We can't change the hardware's ETC state while there are any * characters in the tx fifo, since those characters would be @@ -2754,26 +2839,28 @@ cd_etc(com, etc) * for the tx to become empty so that the command is sure to be * executed soon after we issue it. 
*/ + intrsave = save_intr(); disable_intr(); - if (com->etc == etc) { - enable_intr(); + COM_LOCK(); + if (com->etc == etc) goto wait; - } if ((etc == CD1400_ETC_SENDBREAK && (com->etc == ETC_BREAK_STARTING || com->etc == ETC_BREAK_STARTED)) || (etc == CD1400_ETC_STOPBREAK && (com->etc == ETC_BREAK_ENDING || com->etc == ETC_BREAK_ENDED || com->etc == ETC_NONE))) { - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return; } com->etc = etc; cd_setreg(com, CD1400_SRER, com->intr_enable = (com->intr_enable & ~CD1400_SRER_TXRDY) | CD1400_SRER_TXMPTY); - enable_intr(); wait: + COM_UNLOCK(); + restore_intr(intrsave); while (com->etc == etc && tsleep(&com->etc, TTIPRI | PCATCH, "cyetc", 0) == 0) continue; @@ -2787,7 +2874,7 @@ cd_getreg(com, reg) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; int val; @@ -2795,14 +2882,16 @@ cd_getreg(com, reg) car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); val = cd_inb(iobase, reg, cy_align); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); return (val); } @@ -2815,21 +2904,23 @@ cd_setreg(com, reg, val) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; basecom = com_addr(com->unit & ~(CD1400_NO_OF_CHANNELS - 1)); car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); cd_outb(iobase, reg, cy_align, val); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); } #ifdef CyDebug diff --git a/sys/dev/sio/sio.c b/sys/dev/sio/sio.c index 2725a201076b..a6f05e762ce8 100644 --- a/sys/dev/sio/sio.c +++ b/sys/dev/sio/sio.c @@ -95,16 +95,12 @@ #endif #include +/* XXX - this is ok because we only do sio fast interrupts on i386 */ #ifndef __i386__ #define disable_intr() #define enable_intr() #endif -#ifdef SMP -#define disable_intr() COM_DISABLE_INTR() -#define enable_intr() COM_ENABLE_INTR() -#endif /* SMP */ - #define LOTS_OF_EVENTS 64 /* helps separate urgent events from input */ #define CALLOUT_MASK 0x80 @@ -760,6 +756,7 @@ sioprobe(dev, xrid) u_int flags = device_get_flags(dev); int rid; struct resource *port; + int intrsave; rid = xrid; port = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, @@ -856,7 +853,9 @@ sioprobe(dev, xrid) * but mask them in the processor as well in case there are some * (misconfigured) shared interrupts. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); /* EXTRA DELAY? */ /* @@ -953,7 +952,8 @@ sioprobe(dev, xrid) CLR_FLAG(dev, COM_C_IIR_TXRDYBUG); } sio_setreg(com, com_cfcr, CFCR_8BITS); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); bus_release_resource(dev, SYS_RES_IOPORT, rid, port); return (iobase == siocniobase ? 
0 : result); } @@ -993,7 +993,8 @@ sioprobe(dev, xrid) irqmap[3] = isa_irq_pending(); failures[9] = (sio_getreg(com, com_iir) & IIR_IMASK) - IIR_NOPEND; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); irqs = irqmap[1] & ~irqmap[0]; if (bus_get_resource(idev, SYS_RES_IRQ, 0, &xirq, NULL) == 0 && @@ -1181,7 +1182,6 @@ sioattach(dev, xrid) } else com->it_in.c_ispeed = com->it_in.c_ospeed = TTYDEF_SPEED; if (siosetwater(com, com->it_in.c_ispeed) != 0) { - enable_intr(); /* * Leave i/o resources allocated if this is a `cn'-level * console, so that other devices can't snarf them. @@ -1190,7 +1190,6 @@ sioattach(dev, xrid) bus_release_resource(dev, SYS_RES_IOPORT, rid, port); return (ENOMEM); } - enable_intr(); termioschars(&com->it_in); com->it_out = com->it_in; @@ -1340,7 +1339,7 @@ determined_type: ; RF_ACTIVE); if (com->irqres) { ret = BUS_SETUP_INTR(device_get_parent(dev), dev, com->irqres, - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, siointr, com, &com->cookie); if (ret) { ret = BUS_SETUP_INTR(device_get_parent(dev), dev, @@ -1424,6 +1423,8 @@ sioopen(dev, flag, mode, p) goto out; } } else { + int intrsave; + /* * The device isn't open, so there are no conflicts. * Initialize it. Initialization is done twice in many @@ -1483,7 +1484,9 @@ sioopen(dev, flag, mode, p) } } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); (void) inb(com->line_status_port); (void) inb(com->data_port); com->prev_modem_status = com->last_modem_status @@ -1495,7 +1498,8 @@ sioopen(dev, flag, mode, p) outb(com->intr_ctl_port, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); /* * Handle initial DCD. Callout devices get a fake initial * DCD (trapdoor DCD). If we are callout, then any sleeping @@ -1716,6 +1720,9 @@ siodtrwakeup(chan) wakeup(&com->dtr_wait); } +/* + * Call this function with COM_LOCK. It will return with the lock still held. + */ static void sioinput(com) struct com_s *com; @@ -1725,6 +1732,7 @@ sioinput(com) u_char line_status; int recv_data; struct tty *tp; + int intrsave; buf = com->ibuf; tp = com->tp; @@ -1742,6 +1750,13 @@ sioinput(com) * call overhead). */ do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); incc = com->iptr - buf; if (tp->t_rawq.c_cc + incc > tp->t_ihiwat @@ -1763,10 +1778,18 @@ sioinput(com) tp->t_lflag &= ~FLUSHO; comstart(tp); } - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } else { do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. 
+ */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); line_status = buf[com->ierroff]; recv_data = *buf++; @@ -1782,7 +1805,8 @@ sioinput(com) recv_data |= TTY_PE; } (*linesw[tp->t_line].l_rint)(recv_data, tp); - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } com_events -= (com->iptr - com->ibuf); @@ -1893,12 +1917,16 @@ siointr1(com) if (recv_data == KEY_CR) { brk_state1 = recv_data; brk_state2 = 0; - } else if (brk_state1 == KEY_CR && (recv_data == KEY_TILDE || recv_data == KEY_CRTLB)) { + } else if (brk_state1 == KEY_CR + && (recv_data == KEY_TILDE + || recv_data == KEY_CRTLB)) { if (recv_data == KEY_TILDE) brk_state2 = recv_data; - else if (brk_state2 == KEY_TILDE && recv_data == KEY_CRTLB) { + else if (brk_state2 == KEY_TILDE + && recv_data == KEY_CRTLB) { breakpoint(); - brk_state1 = brk_state2 = 0; + brk_state1 = 0; + brk_state2 = 0; goto cont; } else brk_state2 = 0; @@ -1949,7 +1977,10 @@ siointr1(com) if (com->do_timestamp) microtime(&com->timestamp); ++com_events; +/* XXX - needs to go away when alpha gets ithreads */ +#ifdef __alpha__ schedsofttty(); +#endif #if 0 /* for testing input latency vs efficiency */ if (com->iptr - com->ibuf == 8) setsofttty(); @@ -2217,10 +2248,12 @@ sioioctl(dev, cmd, data, flag, p) return (0); } +/* software interrupt handler for SWI_TTY */ static void siopoll() { int unit; + int intrsave; if (com_events == 0) return; @@ -2239,7 +2272,9 @@ siopoll() * Discard any events related to never-opened or * going-away devices. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); incc = com->iptr - com->ibuf; com->iptr = com->ibuf; if (com->state & CS_CHECKMSR) { @@ -2247,33 +2282,43 @@ siopoll() com->state &= ~CS_CHECKMSR; } com_events -= incc; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); continue; } if (com->iptr != com->ibuf) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); sioinput(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (com->state & CS_CHECKMSR) { u_char delta_modem_status; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta_modem_status = com->last_modem_status ^ com->prev_modem_status; com->prev_modem_status = com->last_modem_status; com_events -= LOTS_OF_EVENTS; com->state &= ~CS_CHECKMSR; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta_modem_status & MSR_DCD) (*linesw[tp->t_line].l_modem) (tp, com->prev_modem_status & MSR_DCD); } if (com->state & CS_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->state &= ~CS_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (!(com->state & CS_BUSY) && !(com->extra_state & CSE_BUSYCHECK)) { timeout(siobusycheck, com, hz / 100); @@ -2301,6 +2346,7 @@ comparam(tp, t) u_char dlbl; int s; int unit; + int intrsave; /* do historical conversions */ if (t->c_ispeed == 0) @@ -2367,11 +2413,10 @@ comparam(tp, t) sio_setreg(com, com_fifo, com->fifo_image); } - /* - * This returns with interrupts disabled so that we can complete - * the speed change atomically. Keeping interrupts disabled is - * especially important while com_data is hidden. 
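The two sioinput() loops above deliberately invert that idiom: they run with COM_LOCK held and interrupts off, and temporarily release both around each call into the line discipline so that interrupts are not blocked for the whole buffer. A condensed sketch of that save-and-enable shape (declarations as in sioinput() above, error-bit handling omitted):

        do {
                intrsave = save_intr();
                COM_UNLOCK();
                enable_intr();                  /* let other interrupts run during l_rint() */
                line_status = buf[com->ierroff];
                recv_data = *buf++;
                (*linesw[tp->t_line].l_rint)(recv_data, tp);
                restore_intr(intrsave);         /* interrupts back off */
                COM_LOCK();                     /* reacquire before looking at com->iptr */
        } while (buf < com->iptr);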
- */ + intrsave = save_intr(); + disable_intr(); + COM_LOCK(); + (void) siosetwater(com, t->c_ispeed); if (divisor != 0) { @@ -2459,7 +2504,8 @@ comparam(tp, t) if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); splx(s); comstart(tp); if (com->ibufold != NULL) { @@ -2478,6 +2524,7 @@ siosetwater(com, speed) u_char *ibuf; int ibufsize; struct tty *tp; + int intrsave; /* * Make the buffer size large enough to handle a softtty interrupt @@ -2488,20 +2535,16 @@ siosetwater(com, speed) cp4ticks = speed / 10 / hz * 4; for (ibufsize = 128; ibufsize < cp4ticks;) ibufsize <<= 1; - if (ibufsize == com->ibufsize) { - disable_intr(); + if (ibufsize == com->ibufsize) return (0); - } /* * Allocate input buffer. The extra factor of 2 in the size is * to allow for an error byte for each input byte. */ ibuf = malloc(2 * ibufsize, M_DEVBUF, M_NOWAIT); - if (ibuf == NULL) { - disable_intr(); + if (ibuf == NULL) return (ENOMEM); - } /* Initialize non-critical variables. */ com->ibufold = com->ibuf; @@ -2517,7 +2560,9 @@ siosetwater(com, speed) * Read current input buffer, if any. Continue with interrupts * disabled. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->iptr != com->ibuf) sioinput(com); @@ -2536,6 +2581,8 @@ siosetwater(com, speed) com->ibufend = ibuf + ibufsize; com->ierroff = ibufsize; com->ihighwater = ibuf + 3 * ibufsize / 4; + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2546,13 +2593,16 @@ comstart(tp) struct com_s *com; int s; int unit; + int intrsave; unit = DEV_TO_UNIT(tp->t_dev); com = com_addr(unit); if (com == NULL) return; s = spltty(); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (tp->t_state & TS_TTSTOP) com->state &= ~CS_TTGO; else @@ -2565,7 +2615,8 @@ comstart(tp) && com->state & CS_RTS_IFLOW) outb(com->modem_ctl_port, com->mcr_image |= MCR_RTS); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { ttwwakeup(tp); splx(s); @@ -2581,7 +2632,9 @@ comstart(tp) sizeof com->obuf1); com->obufs[0].l_next = NULL; com->obufs[0].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2593,7 +2646,8 @@ comstart(tp) com->obufq.l_next = &com->obufs[0]; com->state |= CS_BUSY; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (tp->t_outq.c_cc != 0 && !com->obufs[1].l_queued) { com->obufs[1].l_tail @@ -2601,7 +2655,9 @@ comstart(tp) sizeof com->obuf2); com->obufs[1].l_next = NULL; com->obufs[1].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2613,14 +2669,18 @@ comstart(tp) com->obufq.l_next = &com->obufs[1]; com->state |= CS_BUSY; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } tp->t_state |= TS_BUSY; } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); /* fake interrupt to start output */ - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); ttwwakeup(tp); splx(s); } @@ -2631,11 +2691,14 @@ comstop(tp, rw) int rw; { struct com_s *com; + int intrsave; com = com_addr(DEV_TO_UNIT(tp->t_dev)); if (com == NULL || com->gone) return; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (rw & FWRITE) { if (com->hasfifo) #ifdef COM_ESP @@ -2662,7 +2725,8 @@ comstop(tp, rw) com_events -= (com->iptr - com->ibuf); com->iptr = 
com->ibuf; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); comstart(tp); } @@ -2674,6 +2738,7 @@ commctl(com, bits, how) { int mcr; int msr; + int intrsave; if (how == DMGET) { bits = TIOCM_LE; /* XXX - always enabled while open */ @@ -2705,7 +2770,9 @@ commctl(com, bits, how) mcr |= MCR_RTS; if (com->gone) return(0); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); switch (how) { case DMSET: outb(com->modem_ctl_port, @@ -2718,7 +2785,8 @@ commctl(com, bits, how) outb(com->modem_ctl_port, com->mcr_image &= ~mcr); break; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2766,6 +2834,7 @@ comwakeup(chan) { struct com_s *com; int unit; + int intrsave; sio_timeout_handle = timeout(comwakeup, (void *)NULL, sio_timeout); @@ -2777,9 +2846,12 @@ comwakeup(chan) com = com_addr(unit); if (com != NULL && !com->gone && (com->state >= (CS_BUSY | CS_TTGO) || com->poll)) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } } @@ -2801,10 +2873,13 @@ comwakeup(chan) u_int delta; u_long total; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta = com->delta_error_counts[errnum]; com->delta_error_counts[errnum] = 0; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta == 0) continue; total = com->error_counts[errnum] += delta; diff --git a/sys/fs/cd9660/cd9660_util.c b/sys/fs/cd9660/cd9660_util.c index 2a11dc2f6361..d0f2e1c45c5b 100644 --- a/sys/fs/cd9660/cd9660_util.c +++ b/sys/fs/cd9660/cd9660_util.c @@ -41,6 +41,7 @@ */ #include +#include #include #include diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index 2a7559df7f97..54bf00366c81 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -17,7 +17,7 @@ /* - * Macros for interrupt interrupt entry, call to handler, and exit. + * Macros for interrupt entry, call to handler, and exit. */ #define FAST_INTR(irq_num, vec_name) \ @@ -121,7 +121,7 @@ IDTVEC(vec_name) ; \ /* - * Test to see if the source is currntly masked, clear if so. + * Test to see if the source is currently masked, clear if so. */ #define UNMASK_IRQ(irq_num) \ IMASK_LOCK ; /* into critical reg */ \ @@ -200,7 +200,16 @@ log_intr_event: #else #define APIC_ITRACE(name, irq_num, id) #endif - + +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -216,87 +225,24 @@ IDTVEC(vec_name) ; \ maybe_extra_ipending ; \ ; \ APIC_ITRACE(apic_itrace_enter, irq_num, APIC_ITRACE_ENTER) ; \ - lock ; /* MP-safe */ \ - btsl $(irq_num), iactive ; /* lazy masking */ \ - jc 1f ; /* already active */ \ ; \ MASK_LEVEL_IRQ(irq_num) ; \ EOI_IRQ(irq_num) ; \ 0: ; \ - APIC_ITRACE(apic_itrace_tryisrlock, irq_num, APIC_ITRACE_TRYISRLOCK) ;\ - MP_TRYLOCK ; /* XXX this is going away... */ \ - testl %eax, %eax ; /* did we get it? */ \ - jz 3f ; /* no */ \ -; \ - APIC_ITRACE(apic_itrace_gotisrlock, irq_num, APIC_ITRACE_GOTISRLOCK) ;\ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 2f ; /* this INT masked */ \ -; \ incb _intr_nesting_level ; \ ; \ /* entry point used by doreti_unpend for HWIs. 
*/ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX avoid dbl cnt */ \ - lock ; incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4, %eax ; \ - lock ; incl (%eax) ; \ -; \ - movl _cpl, %eax ; \ - pushl %eax ; \ - orl _intr_mask + (irq_num) * 4, %eax ; \ - movl %eax, _cpl ; \ - lock ; \ - andl $~IRQ_BIT(irq_num), _ipending ; \ -; \ - pushl _intr_unit + (irq_num) * 4 ; \ + pushl $irq_num; /* pass the IRQ */ \ APIC_ITRACE(apic_itrace_enter2, irq_num, APIC_ITRACE_ENTER2) ; \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ APIC_ITRACE(apic_itrace_leave, irq_num, APIC_ITRACE_LEAVE) ; \ ; \ - lock ; andl $~IRQ_BIT(irq_num), iactive ; \ - UNMASK_IRQ(irq_num) ; \ - APIC_ITRACE(apic_itrace_unmask, irq_num, APIC_ITRACE_UNMASK) ; \ - sti ; /* doreti repeats cli/sti */ \ MEXITCOUNT ; \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -1: ; /* active */ \ - APIC_ITRACE(apic_itrace_active, irq_num, APIC_ITRACE_ACTIVE) ; \ - MASK_IRQ(irq_num) ; \ - EOI_IRQ(irq_num) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - lock ; \ - btsl $(irq_num), iactive ; /* still active */ \ - jnc 0b ; /* retry */ \ - POP_FRAME ; \ - iret ; /* XXX: iactive bit might be 0 now */ \ - ALIGN_TEXT ; \ -2: ; /* masked by cpl, leave iactive set */ \ - APIC_ITRACE(apic_itrace_masked, irq_num, APIC_ITRACE_MASKED) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - MP_RELLOCK ; \ - POP_FRAME ; \ - iret ; \ - ALIGN_TEXT ; \ -3: ; /* other cpu has isr lock */ \ - APIC_ITRACE(apic_itrace_noisrlock, irq_num, APIC_ITRACE_NOISRLOCK) ;\ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 4f ; /* this INT masked */ \ - call forward_irq ; /* forward irq to lock holder */ \ - POP_FRAME ; /* and return */ \ - iret ; \ - ALIGN_TEXT ; \ -4: ; /* blocked */ \ - APIC_ITRACE(apic_itrace_masked2, irq_num, APIC_ITRACE_MASKED2) ;\ - POP_FRAME ; /* and return */ \ - iret + jmp doreti_next /* * Handle "spurious INTerrupts". @@ -434,20 +380,10 @@ _Xcpuast: FAKE_MCOUNT(13*4(%esp)) - /* - * Giant locks do not come cheap. - * A lot of cycles are going to be wasted here. - */ - call _get_mplock - - movl _cpl, %eax - pushl %eax orl $AST_PENDING, _astpending /* XXX */ incb _intr_nesting_level sti - pushl $0 - movl _cpuid, %eax lock btrl %eax, _checkstate_pending_ast @@ -461,7 +397,7 @@ _Xcpuast: lock incl CNAME(cpuast_cnt) MEXITCOUNT - jmp _doreti + jmp doreti_next 1: /* We are already in the process of delivering an ast for this CPU */ POP_FRAME @@ -487,40 +423,24 @@ _Xforward_irq: FAKE_MCOUNT(13*4(%esp)) - MP_TRYLOCK - testl %eax,%eax /* Did we get the lock ? 
*/ - jz 1f /* No */ - lock incl CNAME(forward_irq_hitcnt) cmpb $4, _intr_nesting_level - jae 2f + jae 1f - movl _cpl, %eax - pushl %eax incb _intr_nesting_level sti - pushl $0 - MEXITCOUNT - jmp _doreti /* Handle forwarded interrupt */ + jmp doreti_next /* Handle forwarded interrupt */ 1: - lock - incl CNAME(forward_irq_misscnt) - call forward_irq /* Oops, we've lost the isr lock */ - MEXITCOUNT - POP_FRAME - iret -2: lock incl CNAME(forward_irq_toodeepcnt) -3: - MP_RELLOCK MEXITCOUNT POP_FRAME iret +#if 0 /* * */ @@ -532,9 +452,11 @@ forward_irq: cmpl $0, CNAME(forward_irq_enabled) jz 4f +/* XXX - this is broken now, because mp_lock doesn't exist movl _mp_lock,%eax cmpl $FREE_LOCK,%eax jne 1f + */ movl $0, %eax /* Pick CPU #0 if noone has lock */ 1: shrl $24,%eax @@ -559,6 +481,7 @@ forward_irq: jnz 3b 4: ret +#endif /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, @@ -654,6 +577,7 @@ MCOUNT_LABEL(bintr) FAST_INTR(22,fastintr22) FAST_INTR(23,fastintr23) #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, CLKINTR_PENDING) INTR(1,intr1,) INTR(2,intr2,) @@ -728,15 +652,11 @@ _ihandlers: .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - +#if 0 /* active flag for lazy masking */ iactive: .long 0 +#endif #ifdef COUNT_XINVLTLB_HITS .globl _xhits diff --git a/sys/i386/i386/autoconf.c b/sys/i386/i386/autoconf.c index b209065027d6..4edda4bdcab5 100644 --- a/sys/i386/i386/autoconf.c +++ b/sys/i386/i386/autoconf.c @@ -163,14 +163,6 @@ configure(dummy) * XXX this is slightly misplaced. */ spl0(); - - /* - * Allow lowering of the ipl to the lowest kernel level if we - * panic (or call tsleep() before clearing `cold'). No level is - * completely safe (since a panic may occur in a critical region - * at splhigh()), but we want at least bio interrupts to work. - */ - safepri = cpl; } static void diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s index acb8b40f2810..9e77114a1385 100644 --- a/sys/i386/i386/exception.s +++ b/sys/i386/i386/exception.s @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #ifdef SMP @@ -175,20 +176,12 @@ IDTVEC(fpu) mov %ax,%fs FAKE_MCOUNT(13*4(%esp)) -#ifdef SMP MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%eax - pushl %eax /* save original cpl */ pushl $0 /* dummy unit to finish intr frame */ -#else /* SMP */ - movl _cpl,%eax - pushl %eax - pushl $0 /* dummy unit to finish intr frame */ - incl _cnt+V_TRAP -#endif /* SMP */ + call __mtx_enter_giant_def call _npx_intr + call __mtx_exit_giant_def incb _intr_nesting_level MEXITCOUNT @@ -205,9 +198,6 @@ IDTVEC(align) * gate (TGT), else disabled if this was an interrupt gate (IGT). * Note that int0x80_syscall is a trap gate. Only page faults * use an interrupt gate. - * - * Note that all calls to MP_LOCK must occur with interrupts enabled - * in order to be able to take IPI's while waiting for the lock. */ SUPERALIGN_TEXT @@ -227,16 +217,12 @@ alltraps_with_regs_pushed: FAKE_MCOUNT(13*4(%esp)) calltrap: FAKE_MCOUNT(_btrap) /* init "from" _btrap -> calltrap */ - MPLOCKED incl _cnt+V_TRAP - MP_LOCK - movl _cpl,%ebx /* keep orig. cpl here during trap() */ call _trap /* * Return via _doreti to handle ASTs. Have to change trap frame * to interrupt frame. 
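The rewritten INTR() macro above no longer runs the device handler inline under the giant lock and cpl; it masks and EOIs the source, pushes the IRQ number, and calls _sched_ithd so that the handler runs later in an interrupt thread at its assigned priority. sched_ithd() itself is not part of this diff; the sketch below is only a guess at its shape, with the struct ithd layout and the ithds[] table assumed purely for illustration:

        void
        sched_ithd(int irq)
        {
                struct ithd *ir = ithds[irq];           /* assumed per-IRQ thread table */

                if (ir == NULL || ir->it_proc == NULL)
                        panic("sched_ithd: no thread for irq %d", irq);
                ir->it_need = 1;                        /* hardware wants service */
                mtx_enter(&sched_lock, MTX_SPIN);
                if (ir->it_proc->p_stat == SWAIT) {     /* idle thread: make it runnable */
                        ir->it_proc->p_stat = SRUN;
                        setrunqueue(ir->it_proc);
                }
                mtx_exit(&sched_lock, MTX_SPIN);
        }

That is why the stub can jump straight to doreti_next: by the time it gets there, nothing has run except the scheduling of the thread, which is exactly what the XXX note above about using a plain iret refers to.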
*/ - pushl %ebx /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ incb _intr_nesting_level MEXITCOUNT @@ -274,16 +260,11 @@ IDTVEC(syscall) movl %eax,TF_EFLAGS(%esp) movl $7,TF_ERR(%esp) /* sizeof "lcall 7,0" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti @@ -312,21 +293,18 @@ IDTVEC(int0x80_syscall) mov %ax,%fs movl $2,TF_ERR(%esp) /* sizeof "int 0x80" */ FAKE_MCOUNT(13*4(%esp)) - MPLOCKED incl _cnt+V_SYSCALL call _syscall2 MEXITCOUNT cli /* atomic astpending access */ - cmpl $0,_astpending - je doreti_syscall_ret -#ifdef SMP - MP_LOCK -#endif - pushl $0 /* cpl to restore */ + cmpl $0,_astpending /* AST pending? */ + je doreti_syscall_ret /* no, get out of here */ subl $4,%esp /* dummy unit for interrupt frame */ movb $1,_intr_nesting_level jmp _doreti ENTRY(fork_trampoline) + MTX_EXIT(_sched_lock, %ecx) + sti call _spl0 #ifdef SMP @@ -355,7 +333,6 @@ ENTRY(fork_trampoline) /* * Return via _doreti to handle ASTs. */ - pushl $0 /* cpl to restore */ subl $4,%esp /* dummy unit to finish intr frame */ movb $1,_intr_nesting_level MEXITCOUNT diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index 60accd19ba8e..78c607591875 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -51,6 +51,10 @@ #include #include #include +/* XXX */ +#ifdef KTR_PERCPU +#include +#endif #include #include #include @@ -73,6 +77,7 @@ #include #include #include +#include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); @@ -127,9 +132,7 @@ ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); -#ifdef SMP -ASSYM(PCB_MPNEST, offsetof(struct pcb, pcb_mpnest)); -#endif +ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); @@ -170,7 +173,9 @@ ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); +ASSYM(GD_PREVPROC, offsetof(struct globaldata, gd_prevproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); +ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); @@ -178,11 +183,21 @@ ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); ASSYM(GD_ASTPENDING, offsetof(struct globaldata, gd_astpending)); +ASSYM(GD_INTR_NESTING_LEVEL, offsetof(struct globaldata, gd_intr_nesting_level)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif +ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); + +/* XXX */ +#ifdef KTR_PERCPU +ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); +ASSYM(GD_KTR_BUF, offsetof(struct 
globaldata, gd_ktr_buf)); +ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); +#endif + #ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); ASSYM(GD_CPU_LOCKID, offsetof(struct globaldata, gd_cpu_lockid)); @@ -211,3 +226,9 @@ ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSE, offsetof(struct mtx, mtx_recurse)); +ASSYM(MTX_SAVEFL, offsetof(struct mtx, mtx_savefl)); + +ASSYM(MTX_UNOWNED, MTX_UNOWNED); diff --git a/sys/i386/i386/globals.s b/sys/i386/i386/globals.s index 31fbfd5e98b1..f3181429cad5 100644 --- a/sys/i386/i386/globals.s +++ b/sys/i386/i386/globals.s @@ -61,44 +61,74 @@ globaldata: #else .set globaldata,0 #endif - .globl gd_curproc, gd_curpcb, gd_npxproc, gd_astpending - .globl gd_common_tss, gd_switchtime, gd_switchticks + .globl gd_curproc, gd_prevproc, gd_curpcb, gd_npxproc, gd_idleproc + .globl gd_astpending, gd_common_tss, gd_switchtime, gd_switchticks + .globl gd_intr_nesting_level .set gd_curproc,globaldata + GD_CURPROC + .set gd_prevproc,globaldata + GD_PREVPROC .set gd_astpending,globaldata + GD_ASTPENDING .set gd_curpcb,globaldata + GD_CURPCB .set gd_npxproc,globaldata + GD_NPXPROC + .set gd_idleproc,globaldata + GD_IDLEPROC .set gd_common_tss,globaldata + GD_COMMON_TSS .set gd_switchtime,globaldata + GD_SWITCHTIME .set gd_switchticks,globaldata + GD_SWITCHTICKS + .set gd_intr_nesting_level,globaldata + GD_INTR_NESTING_LEVEL .globl gd_common_tssd, gd_tss_gdt .set gd_common_tssd,globaldata + GD_COMMON_TSSD .set gd_tss_gdt,globaldata + GD_TSS_GDT + .globl gd_witness_spin_check + .set gd_witness_spin_check, globaldata + GD_WITNESS_SPIN_CHECK + #ifdef USER_LDT .globl gd_currentldt .set gd_currentldt,globaldata + GD_CURRENTLDT #endif +/* XXX - doesn't work yet */ +#ifdef KTR_PERCPU + .globl gd_ktr_idx, gd_ktr_buf, gd_ktr_buf_data + .set gd_ktr_idx,globaldata + GD_KTR_IDX + .set gd_ktr_buf,globaldata + GD_KTR_BUF + .set gd_ktr_buf_data,globaldata + GD_KTR_BUF_DATA +#endif + #ifndef SMP - .globl _curproc, _curpcb, _npxproc, _astpending - .globl _common_tss, _switchtime, _switchticks + .globl _curproc, _prevproc, _curpcb, _npxproc, _idleproc, + .globl _astpending, _common_tss, _switchtime, _switchticks + .global _intr_nesting_level .set _curproc,globaldata + GD_CURPROC + .set _prevproc,globaldata + GD_PREVPROC .set _astpending,globaldata + GD_ASTPENDING .set _curpcb,globaldata + GD_CURPCB .set _npxproc,globaldata + GD_NPXPROC + .set _idleproc,globaldata + GD_IDLEPROC .set _common_tss,globaldata + GD_COMMON_TSS .set _switchtime,globaldata + GD_SWITCHTIME .set _switchticks,globaldata + GD_SWITCHTICKS + .set _intr_nesting_level,globaldata + GD_INTR_NESTING_LEVEL .globl _common_tssd, _tss_gdt .set _common_tssd,globaldata + GD_COMMON_TSSD .set _tss_gdt,globaldata + GD_TSS_GDT + .globl _witness_spin_check + .set _witness_spin_check,globaldata + GD_WITNESS_SPIN_CHECK + #ifdef USER_LDT .globl _currentldt .set _currentldt,globaldata + GD_CURRENTLDT #endif + +/* XXX - doesn't work yet */ +#ifdef KTR_PERCPU + .globl _ktr_idx, _ktr_buf, _ktr_buf_data + .set _ktr_idx,globaldata + GD_KTR_IDX + .set _ktr_buf,globaldata + GD_KTR_BUF + .set _ktr_buf_data,globaldata + GD_KTR_BUF_DATA +#endif #endif #ifdef SMP diff --git a/sys/i386/i386/i386-gdbstub.c b/sys/i386/i386/i386-gdbstub.c index 986b8d4daa1f..b442a377c44f 100644 --- a/sys/i386/i386/i386-gdbstub.c +++ 
b/sys/i386/i386/i386-gdbstub.c @@ -188,7 +188,8 @@ getpacket (char *buffer) unsigned char ch; int s; - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* wait around for the start character, ignore all other characters */ @@ -239,7 +240,7 @@ getpacket (char *buffer) } } while (checksum != xmitcsum); - splx (s); + write_eflags(s); } /* send the packet in buffer. */ @@ -253,7 +254,8 @@ putpacket (char *buffer) int s; /* $#. */ - s = spltty (); + s = read_eflags(); + disable_intr(); do { /* @@ -285,7 +287,7 @@ putpacket (char *buffer) putDebugChar (hexchars[checksum & 0xf]); } while ((getDebugChar () & 0x7f) != '+'); - splx (s); + write_eflags(s); } static char remcomInBuffer[BUFMAX]; diff --git a/sys/i386/i386/identcpu.c b/sys/i386/i386/identcpu.c index 0e11e2b8eadf..71ecd63de85a 100644 --- a/sys/i386/i386/identcpu.c +++ b/sys/i386/i386/identcpu.c @@ -42,6 +42,7 @@ #include "opt_cpu.h" #include +#include #include #include #include @@ -53,6 +54,8 @@ #include #include +#include +#include #include #define IDENTBLUE_CYRIX486 0 diff --git a/sys/i386/i386/initcpu.c b/sys/i386/i386/initcpu.c index be86c65cb279..b9395bfc7f85 100644 --- a/sys/i386/i386/initcpu.c +++ b/sys/i386/i386/initcpu.c @@ -607,12 +607,14 @@ void enable_K5_wt_alloc(void) { u_int64_t msr; + int intrstate; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. */ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { + intrstate = save_intr(); disable_intr(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & !(0x10)); @@ -645,7 +647,7 @@ enable_K5_wt_alloc(void) msr=rdmsr(0x83); wrmsr(0x83, msr|0x10); /* enable write allocate */ - enable_intr(); + restore_intr(intrstate); } } @@ -708,7 +710,6 @@ enable_K6_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } void @@ -770,7 +771,6 @@ enable_K6_2_wt_alloc(void) wrmsr(0x0c0000082, whcr); write_eflags(eflags); - enable_intr(); } #endif /* I585_CPU && CPU_WT_ALLOC */ diff --git a/sys/i386/i386/legacy.c b/sys/i386/i386/legacy.c index 8a3077058718..5b6cdbc85618 100644 --- a/sys/i386/i386/legacy.c +++ b/sys/i386/i386/legacy.c @@ -68,7 +68,10 @@ #else #include #endif +#include +#include #include +#include static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. 
*/ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? */ diff --git a/sys/i386/i386/locore.s b/sys/i386/i386/locore.s index bddd7d5be868..fa95fb0d6b53 100644 --- a/sys/i386/i386/locore.s +++ b/sys/i386/i386/locore.s @@ -862,9 +862,6 @@ map_read_write: movl $(NPTEPG-1), %ebx /* pte offset = NTEPG-1 */ movl $1, %ecx /* one private pt coming right up */ fillkpt(R(SMPptpa), $PG_RW) - -/* Initialize mp lock to allow early traps */ - movl $1, R(_mp_lock) #endif /* SMP */ /* install a pde for temporary double map of bottom of VA */ diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 6edecf04db54..875c9d5a7a8a 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -98,10 +99,12 @@ #include #include #include +#include #include /* pcb.h included via sys/user.h */ +#include +#include #ifdef SMP #include -#include #endif #ifdef PERFMON #include @@ -110,6 +113,7 @@ #ifdef OLD_BUS_ARCH #include #endif +#include #include #include #include @@ -247,6 +251,11 @@ vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; static struct trapframe proc0_tf; +struct cpuhead cpuhead; + +mtx_t sched_lock; +mtx_t Giant; + #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void @@ -431,6 +440,11 @@ cpu_startup(dummy) bufinit(); vm_pager_bufferinit(); + SLIST_INIT(&cpuhead); + SLIST_INSERT_HEAD(&cpuhead, GLOBALDATA, gd_allcpu); + + mtx_init(&sched_lock, "sched lock", MTX_SPIN); + #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! @@ -1817,11 +1831,6 @@ init386(first) #endif int off; - /* - * Prevent lowering of the ipl if we call tsleep() early. - */ - safepri = cpl; - proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; @@ -1871,6 +1880,10 @@ init386(first) r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); + /* setup curproc so that mutexes work */ + PCPU_SET(curproc, &proc0); + PCPU_SET(prevproc, &proc0); + /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we @@ -1953,7 +1966,7 @@ init386(first) /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; - common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; + common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; @@ -1974,6 +1987,12 @@ init386(first) dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + /* + * We grab Giant during the vm86bios routines, so we need to ensure + * that it is up and running before we use vm86. 
+ */ + mtx_init(&Giant, "Giant", MTX_DEF); + vm86_initialize(); getmemsize(first); @@ -2009,9 +2028,7 @@ init386(first) /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; -#ifdef SMP - proc0.p_addr->u_pcb.pcb_mpnest = 1; -#endif + proc0.p_addr->u_pcb.pcb_schednest = 0; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
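The machdep.c hunk above introduces the two mutexes that start replacing the cpl/mp_lock machinery: Giant, a default (sleep) mutex, and sched_lock, a spin mutex, both declared as mtx_t earlier in the file. A minimal usage sketch, illustrative only, with the flag passed to mtx_enter()/mtx_exit() always matching the type the mutex was initialized with:

        /* one-time setup, as done in machdep.c above */
        mtx_init(&Giant, "Giant", MTX_DEF);
        mtx_init(&sched_lock, "sched lock", MTX_SPIN);

        /* a default mutex may put the waiter to sleep until the lock is free */
        mtx_enter(&Giant, MTX_DEF);
        /* ... long, possibly sleeping work ... */
        mtx_exit(&Giant, MTX_DEF);

        /* a spin mutex busy-waits and must only guard short critical sections */
        mtx_enter(&sched_lock, MTX_SPIN);
        /* ... brief manipulation of scheduler state ... */
        mtx_exit(&sched_lock, MTX_SPIN);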
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/i386/i386/mpapic.c b/sys/i386/i386/mpapic.c index a3594a8ed20c..3f971d83548d 100644 --- a/sys/i386/i386/mpapic.c +++ b/sys/i386/i386/mpapic.c @@ -28,11 +28,14 @@ #include "opt_smp.h" #include +#include #include +#include #include /** TEST_TEST1 */ #include #include +#include #include #include /* Xspuriousint() */ diff --git a/sys/i386/i386/mpboot.s b/sys/i386/i386/mpboot.s index d3602d29a2f4..9ede02c24342 100644 --- a/sys/i386/i386/mpboot.s +++ b/sys/i386/i386/mpboot.s @@ -114,43 +114,9 @@ mp_begin: /* now running relocated at KERNBASE */ CHECKPOINT(0x39, 6) - /* wait till we can get into the kernel */ - call _boot_get_mplock - - /* Now, let's prepare for some REAL WORK :-) */ + /* Now, let's prepare for some REAL WORK :-) This doesn't return. */ call _ap_init - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,-4(%esp) -2: - cmpl $0, CNAME(smp_started) /* Wait for last AP to be ready */ - jz 2b - call _get_mplock - - /* let her rip! (loads new stack) */ - jmp _cpu_switch - -NON_GPROF_ENTRY(wait_ap) - pushl %ebp - movl %esp, %ebp - call _rel_mplock - lock /* Avoid livelock (PIII Errata 39) */ - addl $0,0(%esp) - movl %eax, 8(%ebp) -1: - cmpl $0, CNAME(smp_started) - jnz 2f - decl %eax - cmpl $0, %eax - jge 1b -2: - call _get_mplock - movl %ebp, %esp - popl %ebp - ret - - /* * This is the embedded trampoline or bootstrap that is * copied into 'real-mode' low memory, it is where the diff --git a/sys/i386/i386/mplock.s b/sys/i386/i386/mplock.s deleted file mode 100644 index dc5ba01e1f05..000000000000 --- a/sys/i386/i386/mplock.s +++ /dev/null @@ -1,343 +0,0 @@ -/* - * ---------------------------------------------------------------------------- - * "THE BEER-WARE LICENSE" (Revision 42): - * wrote this file. As long as you retain this notice you - * can do whatever you want with this stuff. If we meet some day, and you think - * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp - * ---------------------------------------------------------------------------- - * - * $FreeBSD$ - * - * Functions for locking between CPUs in a SMP system. - * - * This is an "exclusive counting semaphore". This means that it can be - * free (0xffffffff) or be owned by a CPU (0xXXYYYYYY where XX is CPU-id - * and YYYYYY is the count). - * - * Contrary to most implementations around, this one is entirely atomic: - * The attempt to seize/release the semaphore and the increment/decrement - * is done in one atomic operation. This way we are safe from all kinds - * of weird reentrancy situations. - */ - -#include -#include /** GRAB_LOPRIO */ -#include - -#define GLPROFILE_NOT - -#ifdef CHEAP_TPR - -/* we assumme that the 'reserved bits' can be written with zeros */ - -#else /* CHEAP_TPR */ - -#error HEADS UP: this code needs work -/* - * The APIC doc says that reserved bits must be written with whatever - * value they currently contain, ie you should: read, modify, write, - * instead of just writing new values to the TPR register. Current - * silicon seems happy with just writing. If the behaviour of the - * silicon changes, all code that access the lapic_tpr must be modified. 
- * The last version to contain such code was: - * Id: mplock.s,v 1.17 1997/08/10 20:59:07 fsmp Exp - */ - -#endif /* CHEAP_TPR */ - -#ifdef GRAB_LOPRIO -/* - * Claim LOWest PRIOrity, ie. attempt to grab ALL INTerrupts. - */ - -/* after 1st acquire of lock we grab all hardware INTs */ -#define GRAB_HWI movl $ALLHWI_LEVEL, lapic_tpr - -/* after last release of lock give up LOW PRIO (ie, arbitrate INTerrupts) */ -#define ARB_HWI movl $LOPRIO_LEVEL, lapic_tpr /* CHEAP_TPR */ - -#else /* GRAB_LOPRIO */ - -#define GRAB_HWI /* nop */ -#define ARB_HWI /* nop */ - -#endif /* GRAB_LOPRIO */ - - - .text - -#ifdef SMP - -/*********************************************************************** - * void MPgetlock_edx(unsigned int *lock : %edx) - * ---------------------------------- - * Destroys %eax, %ecx. %edx must hold lock argument. - * - * Grabs hardware interrupts on first aquire. - * - * NOTE: Serialization is not required if we already hold the lock, since - * we already hold the lock, nor do we need a locked instruction if we - * already hold the lock. - */ - -NON_GPROF_ENTRY(MPgetlock_edx) -1: - movl (%edx), %eax /* Get current contents of lock */ - movl %eax, %ecx - andl $CPU_FIELD,%ecx - cmpl _cpu_lockid, %ecx /* Do we already own the lock? */ - jne 2f - incl %eax /* yes, just bump the count */ - movl %eax, (%edx) /* serialization not required */ - ret -2: - movl $FREE_LOCK, %eax /* lock must be free */ - movl _cpu_lockid, %ecx - incl %ecx - lock - cmpxchg %ecx, (%edx) /* attempt to replace %eax<->%ecx */ -#ifdef GLPROFILE - jne 3f - incl _gethits2 -#else - jne 1b -#endif /* GLPROFILE */ - GRAB_HWI /* 1st acquire, grab hw INTs */ - ret -#ifdef GLPROFILE -3: - incl _gethits3 - jmp 1b -#endif - -/*********************************************************************** - * int MPtrylock(unsigned int *lock) - * --------------------------------- - * Destroys %eax, %ecx and %edx. - * Returns 1 if lock was successfull - */ - -NON_GPROF_ENTRY(MPtrylock) - movl 4(%esp), %edx /* Get the address of the lock */ - - movl $FREE_LOCK, %eax /* Assume it's free */ - movl _cpu_lockid, %ecx /* - get pre-shifted logical cpu id */ - incl %ecx /* - new count is one */ - lock - cmpxchg %ecx, (%edx) /* - try it atomically */ - jne 1f /* ...do not collect $200 */ -#ifdef GLPROFILE - incl _tryhits2 -#endif /* GLPROFILE */ - GRAB_HWI /* 1st acquire, grab hw INTs */ - movl $1, %eax - ret -1: - movl (%edx), %eax /* Try to see if we have it already */ - andl $COUNT_FIELD, %eax /* - get count */ - movl _cpu_lockid, %ecx /* - get pre-shifted logical cpu id */ - orl %ecx, %eax /* - combine them */ - movl %eax, %ecx - incl %ecx /* - new count is one more */ - lock - cmpxchg %ecx, (%edx) /* - try it atomically */ - jne 2f /* - miss */ -#ifdef GLPROFILE - incl _tryhits -#endif /* GLPROFILE */ - movl $1, %eax - ret -2: -#ifdef GLPROFILE - incl _tryhits3 -#endif /* GLPROFILE */ - movl $0, %eax - ret - - -/*********************************************************************** - * void MPrellock_edx(unsigned int *lock : %edx) - * ---------------------------------- - * Destroys %ecx, argument must be in %edx - * - * SERIALIZATION NOTE! - * - * After a lot of arguing, it turns out that there is no problem with - * not having a synchronizing instruction in the MP unlock code. There - * are two things to keep in mind: First, Intel guarentees that writes - * are ordered amoungst themselves. Second, the P6 is allowed to reorder - * reads around writes. Third, the P6 maintains cache consistency (snoops - * the bus). 
The second is not an issue since the one read we do is the - * basis for the conditional which determines whether the write will be - * made or not. - * - * Therefore, no synchronizing instruction is required on unlock. There are - * three performance cases: First, if a single cpu is getting and releasing - * the lock the removal of the synchronizing instruction saves approx - * 200 nS (testing w/ duel cpu PIII 450). Second, if one cpu is contending - * for the lock while the other holds it, the removal of the synchronizing - * instruction results in a 700nS LOSS in performance. Third, if two cpu's - * are switching off ownership of the MP lock but not contending for it (the - * most common case), this results in a 400nS IMPROVEMENT in performance. - * - * Since our goal is to reduce lock contention in the first place, we have - * decided to remove the synchronizing instruction from the unlock code. - */ - -NON_GPROF_ENTRY(MPrellock_edx) - movl (%edx), %ecx /* - get the value */ - decl %ecx /* - new count is one less */ - testl $COUNT_FIELD, %ecx /* - Unless it's zero... */ - jnz 2f - ARB_HWI /* last release, arbitrate hw INTs */ - movl $FREE_LOCK, %ecx /* - In which case we release it */ -#if 0 - lock - addl $0,0(%esp) /* see note above */ -#endif -2: - movl %ecx, (%edx) - ret - -/*********************************************************************** - * void get_mplock() - * ----------------- - * All registers preserved - * - * Stack (after call to _MPgetlock): - * - * edx 4(%esp) - * ecx 8(%esp) - * eax 12(%esp) - * - * Requirements: Interrupts should be enabled on call so we can take - * IPI's and FAST INTs while we are waiting for the lock - * (else the system may not be able to halt). - * - * XXX there are still places where get_mplock() is called - * with interrupts disabled, so we have to temporarily reenable - * interrupts. - * - * Side effects: The current cpu will be given ownership of the - * hardware interrupts when it first aquires the lock. - * - * Costs: Initial aquisition requires the use of a costly locked - * instruction, but recursive aquisition is cheap. Release - * is very cheap. - */ - -NON_GPROF_ENTRY(get_mplock) - pushl %eax - pushl %ecx - pushl %edx - movl $_mp_lock, %edx - pushfl - testl $(1<<9), (%esp) - jz 2f - call _MPgetlock_edx - addl $4,%esp -1: - popl %edx - popl %ecx - popl %eax - ret -2: - sti - call _MPgetlock_edx - popfl - jmp 1b - -/* - * Special version of get_mplock that is used during bootstrap when we can't - * yet enable interrupts of any sort since the APIC isn't online yet. We - * do an endrun around MPgetlock_edx to avoid enabling interrupts. - * - * XXX FIXME.. - APIC should be online from the start to simplify IPI's. 
- */ -NON_GPROF_ENTRY(boot_get_mplock) - pushl %eax - pushl %ecx - pushl %edx -#ifdef GRAB_LOPRIO - pushfl - pushl lapic_tpr - cli -#endif - - movl $_mp_lock, %edx - call _MPgetlock_edx - -#ifdef GRAB_LOPRIO - popl lapic_tpr - popfl -#endif - popl %edx - popl %ecx - popl %eax - ret - -/*********************************************************************** - * void try_mplock() - * ----------------- - * reg %eax == 1 if success - */ - -NON_GPROF_ENTRY(try_mplock) - pushl %ecx - pushl %edx - pushl $_mp_lock - call _MPtrylock - add $4, %esp - popl %edx - popl %ecx - ret - -/*********************************************************************** - * void rel_mplock() - * ----------------- - * All registers preserved - */ - -NON_GPROF_ENTRY(rel_mplock) - pushl %ecx - pushl %edx - movl $_mp_lock,%edx - call _MPrellock_edx - popl %edx - popl %ecx - ret - -#endif - -/*********************************************************************** - * - */ - .data - .p2align 2 /* xx_lock aligned on int boundary */ - -#ifdef SMP - - .globl _mp_lock -_mp_lock: .long 0 - -#ifdef GLPROFILE - .globl _gethits -_gethits: - .long 0 -_gethits2: - .long 0 -_gethits3: - .long 0 - - .globl _tryhits -_tryhits: - .long 0 -_tryhits2: - .long 0 -_tryhits3: - .long 0 - -msg: - .asciz "lock hits: 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x\n" -#endif /* GLPROFILE */ -#endif /* SMP */ diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/i386/i386/mptable.c +++ b/sys/i386/i386/mptable.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/i386/i386/nexus.c b/sys/i386/i386/nexus.c index 8a3077058718..5b6cdbc85618 100644 --- a/sys/i386/i386/nexus.c +++ b/sys/i386/i386/nexus.c @@ -68,7 +68,10 @@ #else #include #endif +#include +#include #include +#include static struct rman irq_rman, drq_rman, port_rman, mem_rman; @@ -397,9 +400,9 @@ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, void (*ihand)(void *), void *arg, void **cookiep) { - intrmask_t *mask; driver_t *driver; - int error, icflags; + int error, icflags; + int pri; /* interrupt thread priority */ /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) @@ -413,27 +416,32 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, driver = device_get_driver(child); switch (flags) { - case INTR_TYPE_TTY: - mask = &tty_imask; + case INTR_TYPE_TTY: /* keyboard or parallel port */ + pri = PI_TTYLOW; break; - case (INTR_TYPE_TTY | INTR_TYPE_FAST): - mask = &tty_imask; + case (INTR_TYPE_TTY | INTR_FAST): /* sio */ + pri = PI_TTYHIGH; icflags |= INTR_FAST; break; case INTR_TYPE_BIO: - mask = &bio_imask; + /* + * XXX We need to refine this. BSD/OS distinguishes + * between tape and disk priorities. + */ + pri = PI_DISK; break; case INTR_TYPE_NET: - mask = &net_imask; + pri = PI_NET; break; case INTR_TYPE_CAM: - mask = &cam_imask; + pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_MISC: - mask = 0; + pri = PI_DULL; /* don't care */ break; + /* We didn't specify an interrupt level. */ default: - panic("still using grody create_intr interface"); + panic("nexus_setup_intr: no interrupt type in flags"); } /* @@ -444,7 +452,7 @@ nexus_setup_intr(device_t bus, device_t child, struct resource *irq, return (error); *cookiep = inthand_add(device_get_nameunit(child), irq->r_start, - ihand, arg, mask, icflags); + ihand, arg, pri, icflags); if (*cookiep == NULL) error = EINVAL; /* XXX ??? 
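The nexus_setup_intr() hunk above replaces the old spl mask plumbing with an interrupt-thread priority. Purely as an illustration (the driver softc "sc", handler "foo_intr", and resource names are hypothetical, not part of this change), a network driver registering its handler in the usual way

	error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_NET,
	    foo_intr, sc, &sc->intr_cookie);

now arrives here with INTR_TYPE_NET and has its handler installed through inthand_add() at PI_NET priority instead of being tied to net_imask.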
*/ diff --git a/sys/i386/i386/perfmon.c b/sys/i386/i386/perfmon.c index 574f416df2be..2efa51642d85 100644 --- a/sys/i386/i386/perfmon.c +++ b/sys/i386/i386/perfmon.c @@ -118,16 +118,19 @@ perfmon_avail(void) int perfmon_setup(int pmc, unsigned int control) { + int intrstate; + if (pmc < 0 || pmc >= NPMC) return EINVAL; perfmon_inuse |= (1 << pmc); control &= ~(PMCF_SYS_FLAGS << 16); + intrstate = save_intr(); disable_intr(); ctl_shadow[pmc] = control; writectl(pmc); wrmsr(msr_pmc[pmc], pmc_shadow[pmc] = 0); - enable_intr(); + restore_intr(intrstate); return 0; } @@ -162,15 +165,18 @@ perfmon_fini(int pmc) int perfmon_start(int pmc) { + int intrstate; + if (pmc < 0 || pmc >= NPMC) return EINVAL; if (perfmon_inuse & (1 << pmc)) { + intrstate = save_intr(); disable_intr(); ctl_shadow[pmc] |= (PMCF_EN << 16); wrmsr(msr_pmc[pmc], pmc_shadow[pmc]); writectl(pmc); - enable_intr(); + restore_intr(intrstate); return 0; } return EBUSY; @@ -179,15 +185,18 @@ perfmon_start(int pmc) int perfmon_stop(int pmc) { + int intrstate; + if (pmc < 0 || pmc >= NPMC) return EINVAL; if (perfmon_inuse & (1 << pmc)) { + intrstate = save_intr(); disable_intr(); pmc_shadow[pmc] = rdmsr(msr_pmc[pmc]) & 0xffffffffffULL; ctl_shadow[pmc] &= ~(PMCF_EN << 16); writectl(pmc); - enable_intr(); + restore_intr(intrstate); return 0; } return EBUSY; diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index edae2929fb87..7ce9120d243f 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -668,7 +668,7 @@ pmap_pte_quick(pmap, va) * (unsigned *) prv_PMAP1 = newpf | PG_RW | PG_V; cpu_invlpg(prv_PADDR1); } - return prv_PADDR1 + ((unsigned) index & (NPTEPG - 1)); + return (unsigned *)(prv_PADDR1 + (index & (NPTEPG - 1))); #else if ( ((* (unsigned *) PMAP1) & PG_FRAME) != newpf) { * (unsigned *) PMAP1 = newpf | PG_RW | PG_V; diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index c895fefa8c15..db56a1b40af6 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -73,189 +73,6 @@ _tlb_flush_count: .long 0 .text -/* - * When no processes are on the runq, cpu_switch() branches to _idle - * to wait for something to come ready. - */ - ALIGN_TEXT - .type _idle,@function -_idle: - xorl %ebp,%ebp - movl %ebp,_switchtime - -#ifdef SMP - - /* when called, we have the mplock, intr disabled */ - /* use our idleproc's "context" */ - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: - /* Keep space for nonexisting return addr, or profiling bombs */ - movl $gd_idlestack_top-4, %ecx - addl %fs:0, %ecx - movl %ecx, %esp - - /* update common_tss.tss_esp0 pointer */ - movl %ecx, _common_tss + TSS_ESP0 - - movl _cpuid, %esi - btrl %esi, _private_tss - jae 1f - - movl $gd_common_tssd, %edi - addl %fs:0, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - * - * NOTE: spl*() may only be called while we hold the MP lock (which - * we do). - */ - call _spl0 - - cli - - /* - * _REALLY_ free the lock, no matter how deep the prior nesting. - * We will recover the nesting on the way out when we have a new - * proc to load. 
- * - * XXX: we had damn well better be sure we had it before doing this! - */ - movl $FREE_LOCK, %eax - movl %eax, _mp_lock - - /* do NOT have lock, intrs disabled */ - .globl idle_loop -idle_loop: - - cmpl $0,_smp_active - jne 1f - cmpl $0,_cpuid - je 1f - jmp 2f - -1: - call _procrunnable - testl %eax,%eax - jnz 3f - - /* - * Handle page-zeroing in the idle loop. Called with interrupts - * disabled and the MP lock released. Inside vm_page_zero_idle - * we enable interrupts and grab the mplock as required. - */ - cmpl $0,_do_page_zero_idle - je 2f - - call _vm_page_zero_idle /* internal locking */ - testl %eax, %eax - jnz idle_loop -2: - - /* enable intrs for a halt */ - movl $0, lapic_tpr /* 1st candidate for an INT */ - call *_hlt_vector /* wait for interrupt */ - cli - jmp idle_loop - - /* - * Note that interrupts must be enabled while obtaining the MP lock - * in order to be able to take IPI's while blocked. - */ -3: - movl $LOPRIO_LEVEL, lapic_tpr /* arbitrate for INTs */ - sti - call _get_mplock - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _rel_mplock - jmp idle_loop - -#else /* !SMP */ - - movl $HIDENAME(tmpstk),%esp -#if defined(OVERLY_CONSERVATIVE_PTD_MGMT) -#if defined(SWTCH_OPTIM_STATS) - incl _swtch_optim_stats -#endif - movl _IdlePTD, %ecx - movl %cr3, %eax - cmpl %ecx, %eax - je 2f -#if defined(SWTCH_OPTIM_STATS) - decl _swtch_optim_stats - incl _tlb_flush_count -#endif - movl %ecx, %cr3 -2: -#endif - - /* update common_tss.tss_esp0 pointer */ - movl %esp, _common_tss + TSS_ESP0 - - movl $0, %esi - btrl %esi, _private_tss - jae 1f - - movl $_common_tssd, %edi - - /* move correct tss descriptor into GDT slot, then reload tr */ - movl _tss_gdt, %ebx /* entry in GDT */ - movl 0(%edi), %eax - movl %eax, 0(%ebx) - movl 4(%edi), %eax - movl %eax, 4(%ebx) - movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ - ltr %si -1: - - sti - - /* - * XXX callers of cpu_switch() do a bogus splclock(). Locking should - * be left to cpu_switch(). - */ - call _spl0 - - ALIGN_TEXT -idle_loop: - cli - call _procrunnable - testl %eax,%eax - CROSSJUMP(jnz, sw1a, jz) - call _vm_page_zero_idle - testl %eax, %eax - jnz idle_loop - call *_hlt_vector /* wait for interrupt */ - jmp idle_loop - -#endif /* SMP */ - -CROSSJUMPTARGET(_idle) - ENTRY(default_halt) sti #ifndef SMP @@ -263,6 +80,12 @@ ENTRY(default_halt) #endif ret +/* + * cpu_throw() + */ +ENTRY(cpu_throw) + jmp sw1 + /* * cpu_switch() */ @@ -270,10 +93,11 @@ ENTRY(cpu_switch) /* switch to new process. first, save context as needed */ movl _curproc,%ecx + movl %ecx,_prevproc /* if no process to save, don't bother */ testl %ecx,%ecx - je sw1 + jz sw1 #ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ @@ -299,7 +123,7 @@ ENTRY(cpu_switch) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) - /* test if debug regisers should be saved */ + /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ @@ -319,15 +143,12 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: + /* save sched_lock recursion count */ + movl _sched_lock+MTX_RECURSE,%eax + movl %eax,PCB_SCHEDNEST(%edx) + #ifdef SMP - movl _mp_lock, %eax /* XXX FIXME: we should be saving the local APIC TPR */ -#ifdef DIAGNOSTIC - cmpl $FREE_LOCK, %eax /* is it free? */ - je badsw4 /* yes, bad medicine! 
*/ -#endif /* DIAGNOSTIC */ - andl $COUNT_FIELD, %eax /* clear CPU portion */ - movl %eax, PCB_MPNEST(%edx) /* store it */ #endif /* SMP */ #if NNPX > 0 @@ -341,25 +162,33 @@ ENTRY(cpu_switch) 1: #endif /* NNPX > 0 */ - movl $0,_curproc /* out of process */ - - /* save is done, now choose a new process or idle */ + /* save is done, now choose a new process */ sw1: - cli #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,_cpuid - CROSSJUMP(je, _idle, jne) /* wind down */ + je 1f + + movl _idleproc, %eax + jmp sw1b 1: #endif + /* + * Choose a new process to schedule. chooseproc() returns idleproc + * if it cannot find another process to run. + */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ - testl %eax,%eax - CROSSJUMP(je, _idle, jne) /* if no proc, idle */ + +#ifdef DIAGNOSTIC + testl %eax,%eax /* no process? */ + jz badsw3 /* no, panic */ +#endif +sw1b: movl %eax,%ecx xorl %eax,%eax @@ -456,9 +285,6 @@ sw1a: movl %ecx, _curproc /* into next process */ #ifdef SMP - movl _cpu_lockid, %eax - orl PCB_MPNEST(%edx), %eax /* add next count from PROC */ - movl %eax, _mp_lock /* load the mp_lock */ /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ @@ -500,7 +326,22 @@ cpu_switch_load_gs: movl %eax,%dr7 1: - sti + /* + * restore sched_lock recursion count and transfer ownership to + * new process + */ + movl PCB_SCHEDNEST(%edx),%eax + movl %eax,_sched_lock+MTX_RECURSE + + movl _curproc,%eax + movl %eax,_sched_lock+MTX_LOCK + +#ifdef DIAGNOSTIC + pushfl + popl %ecx + testl $0x200, %ecx /* interrupts enabled? */ + jnz badsw6 /* that way madness lies */ +#endif ret CROSSJUMPTARGET(sw1a) @@ -517,15 +358,27 @@ badsw2: call _panic sw0_2: .asciz "cpu_switch: not SRUN" -#endif -#if defined(SMP) && defined(DIAGNOSTIC) -badsw4: - pushl $sw0_4 +badsw3: + pushl $sw0_3 call _panic -sw0_4: .asciz "cpu_switch: do not have lock" -#endif /* SMP && DIAGNOSTIC */ +sw0_3: .asciz "cpu_switch: chooseproc returned NULL" + +#endif + +#ifdef DIAGNOSTIC +badsw5: + pushl $sw0_5 + call _panic + +sw0_5: .asciz "cpu_switch: interrupts enabled (again)" +badsw6: + pushl $sw0_6 + call _panic + +sw0_6: .asciz "cpu_switch: interrupts enabled" +#endif /* * savectx(pcb) diff --git a/sys/i386/i386/synch_machdep.c b/sys/i386/i386/synch_machdep.c new file mode 100644 index 000000000000..029225dbf314 --- /dev/null +++ b/sys/i386/i386/synch_machdep.c @@ -0,0 +1,559 @@ +/*- + * Copyright (c) 1997, 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ + * $FreeBSD$ + */ + +#define MTX_STRS /* define common strings */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* All mutexes in system (used for debug/panic) */ +mtx_t all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", + TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), + { NULL, NULL }, &all_mtx, &all_mtx +#ifdef SMP_DEBUG + , NULL, { NULL, NULL }, NULL, 0 +#endif +}; + +int mtx_cur_cnt; +int mtx_max_cnt; + +extern void _mtx_enter_giant_def(void); +extern void _mtx_exit_giant_def(void); + +static void propagate_priority(struct proc *) __unused; + +#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) +#define mtx_owner(m) (mtx_unowned(m) ? NULL \ + : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) + +#define RETIP(x) *(((int *)(&x)) - 1) +#define SET_PRIO(p, pri) (p)->p_priority = (pri) + +/* + * XXX Temporary, for use from assembly language + */ + +void +_mtx_enter_giant_def(void) +{ + + mtx_enter(&Giant, MTX_DEF); +} + +void +_mtx_exit_giant_def(void) +{ + + mtx_exit(&Giant, MTX_DEF); +} + +static void +propagate_priority(struct proc *p) +{ + int pri = p->p_priority; + mtx_t *m = p->p_blocked; + + for (;;) { + struct proc *p1; + + p = mtx_owner(m); + + if (p == NULL) { + /* + * This really isn't quite right. Really + * ought to bump priority of process that + * next acquires the mutex. + */ + MPASS(m->mtx_lock == MTX_CONTESTED); + return; + } + MPASS(p->p_magic == P_MAGIC); + if (p->p_priority <= pri) + return; + /* + * If lock holder is actually running, just bump priority. + */ + if (TAILQ_NEXT(p, p_procq) == NULL) { + MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); + SET_PRIO(p, pri); + return; + } + /* + * If on run queue move to new run queue, and + * quit. + */ +#if 1 + if (p->p_stat == SRUN) { +#else + if ((m = p->p_blocked) == NULL) { +#endif + MPASS(p->p_stat == SRUN); + remrunqueue(p); + SET_PRIO(p, pri); + setrunqueue(p); + return; + } + + /* + * If we aren't blocked on a mutex, give up and quit. + */ + if (p->p_stat != SMTX) { + printf( + "XXX: process %d(%s):%d holds %s but isn't blocked on a mutex\n", + p->p_pid, p->p_comm, p->p_stat, m->mtx_description); + return; + } + + /* + * Pick up the mutex that p is blocked on. 
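For clarity, a worked illustration of the priority lending this loop performs (the values are made up; the BSD convention is that a numerically smaller p_priority runs first):

	/*
	 * Thread A (p_priority == 40) blocks on a mutex owned by thread B
	 * (p_priority == 80).  propagate_priority() walks A->p_blocked to B
	 * and applies SET_PRIO(B, 40), requeueing B if it was sitting on a
	 * run queue, so B runs ahead of other priority-80 work.  When B
	 * releases the mutex, mtx_exit_hard() recomputes B's priority from
	 * the locks still on B->p_contested and from B->p_nativepri.
	 */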
+ */ + m = p->p_blocked; + MPASS(m != NULL); + + printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, + p->p_comm, m->mtx_description); + /* + * Check if the proc needs to be moved up on + * the blocked chain + */ + if ((p1 = TAILQ_PREV(p, rq, p_procq)) == NULL || + p1->p_priority <= pri) { + if (p1) + printf( + "XXX: previous process %d(%s) has higher priority\n", + p->p_pid, p->p_comm); + else + printf("XXX: process at head of run queue\n"); + continue; + } + + /* + * Remove proc from blocked chain + */ + TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); + TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { + MPASS(p1->p_magic == P_MAGIC); + if (p1->p_priority > pri) + break; + } + if (p1) + TAILQ_INSERT_BEFORE(p1, p, p_procq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); + CTR4(KTR_LOCK, + "propagate priority: p 0x%p moved before 0x%p on [0x%p] %s", + p, p1, m, m->mtx_description); + } +} + +void +mtx_enter_hard(mtx_t *m, int type, int flags) +{ + struct proc *p = CURPROC; + + KASSERT(p != NULL, ("curproc is NULL in mutex")); + + switch (type) { + case MTX_DEF: + if ((m->mtx_lock & MTX_FLAGMASK) == (u_int)p) { + m->mtx_recurse++; + atomic_set_int(&m->mtx_lock, MTX_RECURSE); + CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m); + return; + } + CTR3(KTR_LOCK, "mtx_enter: 0x%p contested (lock=%x) [0x%x]", + m, m->mtx_lock, RETIP(m)); + while (!atomic_cmpset_int(&m->mtx_lock, MTX_UNOWNED, (int)p)) { + int v; + struct proc *p1; + + mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY); + /* + * check if the lock has been released while + * waiting for the schedlock. + */ + if ((v = m->mtx_lock) == MTX_UNOWNED) { + mtx_exit(&sched_lock, MTX_SPIN); + continue; + } + /* + * The mutex was marked contested on release. This + * means that there are processes blocked on it. + */ + if (v == MTX_CONTESTED) { + p1 = TAILQ_FIRST(&m->mtx_blocked); + KASSERT(p1 != NULL, ("contested mutex has no contesters")); + KASSERT(p != NULL, ("curproc is NULL for contested mutex")); + m->mtx_lock = (int)p | MTX_CONTESTED; + if (p1->p_priority < p->p_priority) { + SET_PRIO(p, p1->p_priority); + } + mtx_exit(&sched_lock, MTX_SPIN); + return; + } + /* + * If the mutex isn't already contested and + * a failure occurs setting the contested bit the + * mutex was either release or the + * state of the RECURSION bit changed. + */ + if ((v & MTX_CONTESTED) == 0 && + !atomic_cmpset_int(&m->mtx_lock, v, + v | MTX_CONTESTED)) { + mtx_exit(&sched_lock, MTX_SPIN); + continue; + } + + /* We definitely have to sleep for this lock */ + mtx_assert(m, MA_NOTOWNED); + +#ifdef notyet + /* + * If we're borrowing an interrupted thread's VM + * context must clean up before going to sleep. 
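For reference, a minimal sketch of the calling pattern this blocking path serves (foo_mtx is a placeholder assumed to have been set up with mtx_init(); it is not a lock introduced by this change):

	mtx_enter(&foo_mtx, MTX_DEF);	/* may put the caller to sleep in SMTX */
	/* ... touch only the data foo_mtx protects ... */
	mtx_exit(&foo_mtx, MTX_DEF);	/* may wake and yield to a waiter */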
+ */ + if (p->p_flag & (P_ITHD | P_SITHD)) { + ithd_t *it = (ithd_t *)p; + + if (it->it_interrupted) { + CTR2(KTR_LOCK, + "mtx_enter: 0x%x interrupted 0x%x", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + + /* Put us on the list of procs blocked on this mutex */ + if (TAILQ_EMPTY(&m->mtx_blocked)) { + p1 = (struct proc *)(m->mtx_lock & + MTX_FLAGMASK); + LIST_INSERT_HEAD(&p1->p_contested, m, + mtx_contested); + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); + } else { + TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) + if (p1->p_priority > p->p_priority) + break; + if (p1) + TAILQ_INSERT_BEFORE(p1, p, p_procq); + else + TAILQ_INSERT_TAIL(&m->mtx_blocked, p, + p_procq); + } + + p->p_blocked = m; /* Who we're blocked on */ + p->p_stat = SMTX; +#if 0 + propagate_priority(p); +#endif + CTR3(KTR_LOCK, "mtx_enter: p 0x%p blocked on [0x%p] %s", + p, m, m->mtx_description); + mi_switch(); + CTR3(KTR_LOCK, + "mtx_enter: p 0x%p free from blocked on [0x%p] %s", + p, m, m->mtx_description); + mtx_exit(&sched_lock, MTX_SPIN); + } + return; + case MTX_SPIN: + case MTX_SPIN | MTX_FIRST: + case MTX_SPIN | MTX_TOPHALF: + { + int i = 0; + + if (m->mtx_lock == (u_int)p) { + m->mtx_recurse++; + return; + } + CTR1(KTR_LOCK, "mtx_enter: %p spinning", m); + for (;;) { + if (atomic_cmpset_int(&m->mtx_lock, MTX_UNOWNED, + (u_int)p)) + break; + while (m->mtx_lock != MTX_UNOWNED) { + if (i++ < 1000000) + continue; + if (i++ < 6000000) + DELAY (1); +#ifdef DDB + else if (!db_active) { +#else + else { +#endif +#if 0 + Debugger ("spinning"); + panic("spin lock %s held by 0x%x for > 5 seconds", + m->mtx_description, + m->mtx_lock); +#endif + } + } + } + +#ifdef SMP_DEBUG + if (type != MTX_SPIN) + m->mtx_savefl = 0xdeadbeef; + else +#endif + m->mtx_savefl = flags; + CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m); + return; + } + } +} + +void +mtx_exit_hard(mtx_t *m, int type) +{ + struct proc *p, *p1; + mtx_t *m1; + int pri; + + switch (type) { + case MTX_DEF: + case MTX_DEF | MTX_NOSWITCH: + if (m->mtx_recurse != 0) { + if (--(m->mtx_recurse) == 0) + atomic_clear_int(&m->mtx_lock, MTX_RECURSE); + CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m); + return; + } + mtx_enter(&sched_lock, MTX_SPIN); + CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m); + p = CURPROC; + p1 = TAILQ_FIRST(&m->mtx_blocked); + MPASS(p->p_magic == P_MAGIC); + MPASS(p1->p_magic == P_MAGIC); + TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); + if (TAILQ_EMPTY(&m->mtx_blocked)) { + LIST_REMOVE(m, mtx_contested); + atomic_cmpset_int(&m->mtx_lock, m->mtx_lock, + MTX_UNOWNED); + CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m); + } else + m->mtx_lock = MTX_CONTESTED; + pri = MAXPRI; + LIST_FOREACH(m1, &p->p_contested, mtx_contested) { + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + if (cp < pri) + pri = cp; + } + if (pri > p->p_nativepri) + pri = p->p_nativepri; + SET_PRIO(p, pri); + CTR2(KTR_LOCK, "mtx_exit: 0x%p contested setrunqueue 0x%p", + m, p1); + p1->p_blocked = NULL; + p1->p_stat = SRUN; + setrunqueue(p1); + if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { +#ifdef notyet + if (p->p_flag & (P_ITHD | P_SITHD)) { + ithd_t *it = (ithd_t *)p; + + if (it->it_interrupted) { + CTR2(KTR_LOCK, + "mtx_exit: 0x%x interruped 0x%x", + it, it->it_interrupted); + intr_thd_fixup(it); + } + } +#endif + setrunqueue(p); + CTR2(KTR_LOCK, "mtx_exit: 0x%p switching out lock=0x%x", + m, m->mtx_lock); + mi_switch(); + CTR2(KTR_LOCK, "mtx_exit: 0x%p resuming lock=0x%x", + m, m->mtx_lock); + } + mtx_exit(&sched_lock, MTX_SPIN); + return; + case MTX_SPIN: 
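The spin cases below undo what the MTX_SPIN path of mtx_enter_hard() recorded in mtx_savefl. The pairing, as used elsewhere in this change with sched_lock, looks like:

	mtx_enter(&sched_lock, MTX_SPIN);	/* interrupts off, caller's eflags saved */
	/* ... short critical section that never sleeps ... */
	mtx_exit(&sched_lock, MTX_SPIN);	/* saved interrupt state restored */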
+ case MTX_SPIN | MTX_FIRST: + if (m->mtx_recurse != 0) { + m->mtx_recurse--; + return; + } + if (atomic_cmpset_int(&m->mtx_lock, CURTHD, MTX_UNOWNED)) { + if (type & MTX_FIRST) { + enable_intr(); /* XXX is this kosher? */ + } else { + MPASS(m->mtx_savefl != 0xdeadbeef); + write_eflags(m->mtx_savefl); + } + return; + } + panic("unsucuessful release of spin lock"); + case MTX_SPIN | MTX_TOPHALF: + if (m->mtx_recurse != 0) { + m->mtx_recurse--; + return; + } + if (atomic_cmpset_int(&m->mtx_lock, CURTHD, MTX_UNOWNED)) + return; + panic("unsucuessful release of spin lock"); + default: + panic("mtx_exit_hard: unsupported type 0x%x\n", type); + } +} + +#define MV_DESTROY 0 /* validate before destory */ +#define MV_INIT 1 /* validate before init */ + +#ifdef SMP_DEBUG + +int mtx_validate __P((mtx_t *, int)); + +int +mtx_validate(mtx_t *m, int when) +{ + mtx_t *mp; + int i; + int retval = 0; + + if (m == &all_mtx || cold) + return 0; + + mtx_enter(&all_mtx, MTX_DEF); + ASS(kernacc((caddr_t)all_mtx.mtx_next, 4, 1) == 1); + ASS(all_mtx.mtx_next->mtx_prev == &all_mtx); + for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { + if (kernacc((caddr_t)mp->mtx_next, 4, 1) != 1) { + panic("mtx_validate: mp=%p mp->mtx_next=%p", + mp, mp->mtx_next); + } + i++; + if (i > mtx_cur_cnt) { + panic("mtx_validate: too many in chain, known=%d\n", + mtx_cur_cnt); + } + } + ASS(i == mtx_cur_cnt); + switch (when) { + case MV_DESTROY: + for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) + if (mp == m) + break; + ASS(mp == m); + break; + case MV_INIT: + for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) + if (mp == m) { + /* + * Not good. This mutex already exits + */ + retval = 1; +#if 1 + printf("re-initing existing mutex %s\n", + m->mtx_description); + ASS(m->mtx_lock == MTX_UNOWNED); + retval = 1; +#else + panic("re-initing existing mutex %s", + m->mtx_description); +#endif + } + } + mtx_exit(&all_mtx, MTX_DEF); + return (retval); +} +#endif + +void +mtx_init(mtx_t *m, char *t, int flag) +{ + + CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t); +#ifdef SMP_DEBUG + if (mtx_validate(m, MV_INIT)) /* diagnostic and error correction */ + return; +#endif + bzero((void *)m, sizeof *m); + TAILQ_INIT(&m->mtx_blocked); + m->mtx_description = t; + m->mtx_lock = MTX_UNOWNED; + /* Put on all mutex queue */ + mtx_enter(&all_mtx, MTX_DEF); + m->mtx_next = &all_mtx; + m->mtx_prev = all_mtx.mtx_prev; + m->mtx_prev->mtx_next = m; + all_mtx.mtx_prev = m; + if (++mtx_cur_cnt > mtx_max_cnt) + mtx_max_cnt = mtx_cur_cnt; + mtx_exit(&all_mtx, MTX_DEF); + witness_init(m, flag); +} + +void +mtx_destroy(mtx_t *m) +{ + + CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description); +#ifdef SMP_DEBUG + if (m->mtx_next == NULL) + panic("mtx_destroy: %p (%s) already destroyed", + m, m->mtx_description); + + if (!mtx_owned(m)) { + ASS(m->mtx_lock == MTX_UNOWNED); + } else { + ASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0); + } + mtx_validate(m, MV_DESTROY); /* diagnostic */ +#endif + +#ifdef WITNESS + if (m->mtx_witness) + witness_destroy(m); +#endif /* WITNESS */ + + /* Remove from the all mutex queue */ + mtx_enter(&all_mtx, MTX_DEF); + m->mtx_next->mtx_prev = m->mtx_prev; + m->mtx_prev->mtx_next = m->mtx_next; +#ifdef SMP_DEBUG + m->mtx_next = m->mtx_prev = NULL; +#endif + mtx_cur_cnt--; + mtx_exit(&all_mtx, MTX_DEF); +} diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 51de1ac9e650..f32dfaeeddc0 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -49,10 +49,12 @@ #include 
"opt_trap.h" #include +#include #include #include #include #include +#include #include #include #include @@ -76,12 +78,14 @@ #include #include #include +#include #include #ifdef SMP #include #endif #include +#include #include #ifdef POWERFAIL_NMI @@ -96,11 +100,14 @@ #include "isa.h" #include "npx.h" +#include + int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall2 __P((struct trapframe frame)); +extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); @@ -142,7 +149,7 @@ static char *trap_msg[] = { }; static __inline int userret __P((struct proc *p, struct trapframe *frame, - u_quad_t oticks, int have_mplock)); + u_quad_t oticks, int have_giant)); #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; @@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); static __inline int -userret(p, frame, oticks, have_mplock) +userret(p, frame, oticks, have_giant) struct proc *p; struct trapframe *frame; u_quad_t oticks; - int have_mplock; + int have_giant; { int sig, s; while ((sig = CURSIG(p)) != 0) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } postsig(sig); } @@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock) * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; - } s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); - while ((sig = CURSIG(p)) != 0) + while ((sig = CURSIG(p)) != 0) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } postsig(sig); + } } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } addupc_task(p, frame->tf_eip, (u_int)(p->p_sticks - oticks) * psratio); } curpriority = p->p_priority; - return(have_mplock); + return(have_giant); } /* @@ -226,13 +236,20 @@ trap(frame) u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif - if (!(frame.tf_eflags & PSL_I)) { + atomic_add_int(&cnt.v_trap, 1); + + if ((frame.tf_eflags & PSL_I) == 0) { /* - * Buggy application or kernel code has disabled interrupts - * and then trapped. Enabling interrupts now is wrong, but - * it is better than running with interrupts disabled until - * they are accidentally enabled later. + * Buggy application or kernel code has disabled + * interrupts and then trapped. Enabling interrupts + * now is wrong, but it is better than running with + * interrupts disabled until they are accidentally + * enabled later. XXX Consider whether is this still + * correct. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) @@ -252,54 +269,27 @@ trap(frame) eva = 0; if (frame.tf_trapno == T_PAGEFLT) { /* - * For some Cyrix CPUs, %cr2 is clobbered by interrupts. - * This problem is worked around by using an interrupt - * gate for the pagefault handler. We are finally ready - * to read %cr2 and then must reenable interrupts. 
- * - * XXX this should be in the switch statement, but the - * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the - * flow of control too much for this to be obviously - * correct. + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. */ eva = rcr2(); enable_intr(); - } + } + + mtx_enter(&Giant, MTX_DEF); #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif + type = frame.tf_trapno; code = frame.tf_err; - if (in_vm86call) { - if (frame.tf_eflags & PSL_VM && - (type == T_PROTFLT || type == T_STKFLT)) { - i = vm86_emulate((struct vm86frame *)&frame); - if (i != 0) - /* - * returns to original process - */ - vm86_trap((struct vm86frame *)&frame); - return; - } - switch (type) { - /* - * these traps want either a process context, or - * assume a normal userspace trap. - */ - case T_PROTFLT: - case T_SEGNPFLT: - trap_fatal(&frame, eva); - return; - case T_TRCTRAP: - type = T_BPTFLT; /* kernel breakpoint */ - /* FALL THROUGH */ - } - goto kernel_trap; /* normal kernel trap handling */ - } - - if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + if ((ISPL(frame.tf_cs) == SEL_UPL) || + ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ sticks = p->p_sticks; @@ -322,16 +312,6 @@ trap(frame) i = SIGFPE; break; - case T_ASTFLT: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if (p->p_flag & P_OWEUPC) { - p->p_flag &= ~P_OWEUPC; - addupc_task(p, p->p_stats->p_prof.pr_addr, - p->p_stats->p_prof.pr_ticks); - } - goto out; - /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle @@ -342,7 +322,7 @@ trap(frame) if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) - goto out; + goto user; break; } /* FALL THROUGH */ @@ -357,14 +337,20 @@ trap(frame) case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); - if (i == -1) - return; #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if (i == -2) + if (i == -2) { + /* + * f00f hack workaround has triggered, treat + * as illegal instruction not page fault. 
+ */ + frame.tf_trapno = T_PRIVINFLT; goto restart; + } #endif - if (i == 0) + if (i == -1) goto out; + if (i == 0) + goto user; ucode = T_PAGEFLT; break; @@ -377,7 +363,15 @@ trap(frame) #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI - goto handle_powerfail; +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -391,7 +385,7 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; @@ -410,9 +404,9 @@ trap(frame) case T_DNA: #if NNPX > 0 - /* if a transparent fault (due to context switch "late") */ + /* transparent fault (due to context switch "late") */ if (npxdna()) - return; + goto out; #endif if (!pmath_emulate) { i = SIGFPE; @@ -422,7 +416,7 @@ trap(frame) i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) - return; + goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } @@ -435,13 +429,12 @@ trap(frame) break; } } else { -kernel_trap: /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); - return; + goto out; case T_DNA: #if NNPX > 0 @@ -451,31 +444,35 @@ trap(frame) * registered such use. */ if (npxdna()) - return; + goto out; #endif break; - case T_PROTFLT: /* general protection fault */ - case T_SEGNPFLT: /* segment not present fault */ /* - * Invalid segment selectors and out of bounds - * %eip's and %esp's can be set up in user mode. - * This causes a fault in kernel mode when the - * kernel tries to return to user mode. We want - * to get this fault so that we can fix the - * problem here and not have to check all the - * selectors and pointers when the user changes - * them. + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. */ -#define MAYBE_DORETI_FAULT(where, whereto) \ - do { \ - if (frame.tf_eip == (int)where) { \ - frame.tf_eip = (int)whereto; \ - return; \ - } \ - } while (0) + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (in_vm86call) + break; + + if (intr_nesting_level != 0) + break; - if (intr_nesting_level == 0) { /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the @@ -488,20 +485,38 @@ trap(frame) if (frame.tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; psignal(p, SIGBUS); - return; + goto out; + } + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; } - MAYBE_DORETI_FAULT(doreti_iret, - doreti_iret_fault); - MAYBE_DORETI_FAULT(doreti_popl_ds, - doreti_popl_ds_fault); - MAYBE_DORETI_FAULT(doreti_popl_es, - doreti_popl_es_fault); - MAYBE_DORETI_FAULT(doreti_popl_fs, - doreti_popl_fs_fault); + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; - return; - } + goto out; } break; @@ -517,7 +532,7 @@ trap(frame) */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; - return; + goto out; } break; @@ -529,7 +544,7 @@ trap(frame) * silently until the syscall handler has * saved the flags. */ - return; + goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* @@ -537,7 +552,7 @@ trap(frame) * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; - return; + goto out; } /* * Ignore debug register trace traps due to @@ -549,13 +564,13 @@ trap(frame) * in kernel space because that is useful when * debugging the kernel. */ - if (user_dbreg_trap()) { + if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); - return; + goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) @@ -567,28 +582,19 @@ trap(frame) */ #ifdef DDB if (kdb_trap (type, 0, &frame)) - return; + goto out; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI -#ifndef TIMER_FREQ -# define TIMER_FREQ 1193182 -#endif - handle_powerfail: - { - static unsigned lastalert = 0; - - if(time_second - lastalert > 10) - { + if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; - } - return; } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -602,16 +608,16 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi == 0) - return; + goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame, eva); - return; + goto out; } /* Translate fault for emulators (e.g. Linux) */ @@ -630,8 +636,10 @@ trap(frame) } #endif -out: +user: userret(p, &frame, sticks, 1); +out: + mtx_exit(&Giant, MTX_DEF); } #ifdef notyet @@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva) * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { - frame->tf_trapno = T_PRIVINFLT; + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; - } #endif if (usermode) goto nogo; @@ -869,8 +875,7 @@ trap_fatal(frame, eva) frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? 
"user" : "kernel"); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -917,26 +922,6 @@ trap_fatal(frame, eva) } else { printf("Idle\n"); } - printf("interrupt mask = "); - if ((cpl & net_imask) == net_imask) - printf("net "); - if ((cpl & tty_imask) == tty_imask) - printf("tty "); - if ((cpl & bio_imask) == bio_imask) - printf("bio "); - if ((cpl & cam_imask) == cam_imask) - printf("cam "); - if (cpl == 0) - printf("none"); -#ifdef SMP -/** - * XXX FIXME: - * we probably SHOULD have stopped the other CPUs before now! - * another CPU COULD have been touching cpl at this moment... - */ - printf(" <- SMP: XXX"); -#endif - printf("\n"); #ifdef KDB if (kdb_trap(&psl)) @@ -973,8 +958,7 @@ dblfault_handler() printf("esp = 0x%x\n", common_tss.tss_esp); printf("ebp = 0x%x\n", common_tss.tss_ebp); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -1048,12 +1032,14 @@ syscall2(frame) int error; int narg; int args[8]; - int have_mplock = 0; + int have_giant = 0; u_int code; + atomic_add_int(&cnt.v_syscall, 1); + #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { - get_mplock(); + mtx_enter(&Giant, MTX_DEF); panic("syscall"); /* NOT REACHED */ } @@ -1075,9 +1061,9 @@ syscall2(frame) /* * The prep code is not MP aware. */ - get_mplock(); + mtx_enter(&Giant, MTX_DEF); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. @@ -1114,8 +1100,8 @@ syscall2(frame) */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); @@ -1129,15 +1115,15 @@ syscall2(frame) * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsyscall(p->p_tracep, code, narg, args); } @@ -1192,9 +1178,9 @@ syscall2(frame) * Traced syscall. trapsignal() is not MP aware. 
*/ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); @@ -1203,13 +1189,13 @@ syscall2(frame) /* * Handle reschedule and other end-of-syscall issues */ - have_mplock = userret(p, &frame, sticks, have_mplock); + have_giant = userret(p, &frame, sticks, have_giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } @@ -1225,27 +1211,66 @@ syscall2(frame) /* * Release the MP lock if we had to get it */ - if (have_mplock) - rel_mplock(); + if (have_giant) + mtx_exit(&Giant, MTX_DEF); + + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + +void +ast(frame) + struct trapframe frame; +{ + struct proc *p = CURPROC; + u_quad_t sticks; + + /* + * handle atomicy by looping since interrupts are enabled and the + * MP lock is not held. + */ + sticks = ((volatile struct proc *)p)->p_sticks; + while (sticks != ((volatile struct proc *)p)->p_sticks) + sticks = ((volatile struct proc *)p)->p_sticks; + + astoff(); + atomic_add_int(&cnt.v_soft, 1); + if (p->p_flag & P_OWEUPC) { + mtx_enter(&Giant, MTX_DEF); + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); +} + if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0) + mtx_exit(&Giant, MTX_DEF); } /* * Simplified back end of syscall(), used when returning from fork() - * directly into user mode. MP lock is held on entry and should be - * held on return. + * directly into user mode. Giant is not held on entry, and must not + * be held on return. 
*/ void fork_return(p, frame) struct proc *p; struct trapframe frame; { + int have_giant; + frame.tf_eax = 0; /* Child returns zero */ frame.tf_eflags &= ~PSL_C; /* success */ frame.tf_edx = 1; - userret(p, &frame, 0, 1); + have_giant = userret(p, &frame, 0, mtx_owned(&Giant)); #ifdef KTRACE - if (KTRPOINT(p, KTR_SYSRET)) + if (KTRPOINT(p, KTR_SYSRET)) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } ktrsysret(p->p_tracep, SYS_fork, 0, 0); + } #endif + if (have_giant) + mtx_exit(&Giant, MTX_DEF); } diff --git a/sys/i386/i386/tsc.c b/sys/i386/i386/tsc.c index 15044abbaa3b..724f3c2817ba 100644 --- a/sys/i386/i386/tsc.c +++ b/sys/i386/i386/tsc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -93,10 +94,6 @@ #include #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ /* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. 
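The recurring edit in this file trades the bare disable_intr()/enable_intr() bracket for one that preserves the caller's interrupt state and wraps the hardware access in CLOCK_LOCK()/CLOCK_UNLOCK(), in the shape of:

	int intrsave;

	intrsave = save_intr();
	disable_intr();
	CLOCK_LOCK();
	/* ... program or read the 8254 registers ... */
	CLOCK_UNLOCK();
	restore_intr(intrsave);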
*/ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. */ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ calibrate_clocks(void) static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. 
+ */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ - - writertc(RTC_STATUSB, rtc_statusb); - -#ifdef APIC_IO if (apic_8254_trial) { - + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. + */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); + + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); + printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. 
*/ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/i386/i386/vm86bios.s b/sys/i386/i386/vm86bios.s index 6a11c2685488..14b4259005bf 100644 --- a/sys/i386/i386/vm86bios.s +++ b/sys/i386/i386/vm86bios.s @@ -62,11 +62,9 @@ ENTRY(vm86_bioscall) pushl %edi pushl %gs -#ifdef SMP pushl %edx - MP_LOCK /* Get global lock */ + call __mtx_enter_giant_def /* Get global lock */ popl %edx -#endif #if NNPX > 0 movl _curproc,%ecx @@ -135,13 +133,9 @@ ENTRY(vm86_bioscall) /* * Return via _doreti */ -#ifdef SMP - pushl _cpl /* cpl to restore */ -#else - pushl _cpl /* cpl to restore */ -#endif subl $4,%esp /* dummy unit */ incb _intr_nesting_level + call __mtx_exit_giant_def MEXITCOUNT jmp _doreti diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index cfb6ceef44d6..831ab3b168a6 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -57,12 +57,14 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef SMP #include #endif @@ -177,9 +179,8 @@ cpu_fork(p1, p2, flags) * pcb2->pcb_onfault: cloned above (always NULL here?). */ -#ifdef SMP - pcb2->pcb_mpnest = 1; -#endif + pcb2->pcb_schednest = 0; + /* * XXX don't copy the i/o pages. this should probably be fixed. */ @@ -256,8 +257,11 @@ cpu_exit(p) reset_dbregs(); pcb->pcb_flags &= ~PCB_DBREGS; } + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH); + mtx_assert(&Giant, MA_NOTOWNED); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -406,17 +410,10 @@ vunmapbuf(bp) static void cpu_reset_proxy() { - u_int saved_mp_lock; cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) - ; /* Wait for other cpu to disable interupts */ - saved_mp_lock = mp_lock; - mp_lock = 1; - printf("cpu_reset_proxy: Grabbed mp lock for BSP\n"); - cpu_reset_proxy_active = 3; - while (cpu_reset_proxy_active == 3) - ; /* Wait for other cpu to enable interrupts */ + ; /* Wait for other cpu to see that we've started */ stop_cpus((1<= ZIDLE_HI(cnt.v_free_count)) return(0); -#ifdef SMP - if (try_mplock()) { -#endif + if (mtx_try_enter(&Giant, MTX_DEF)) { s = splvm(); - __asm __volatile("sti" : : : "memory"); + intrsave = save_intr(); + enable_intr(); zero_state = 0; m = vm_page_list_find(PQ_FREE, free_rover, FALSE); if (m != NULL && (m->flags & PG_ZERO) == 0) { @@ -595,14 +584,10 @@ vm_page_zero_idle() } free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); - __asm __volatile("cli" : : : "memory"); -#ifdef SMP - rel_mplock(); -#endif + restore_intr(intrsave); + mtx_exit(&Giant, MTX_DEF); return (1); -#ifdef SMP } -#endif /* * We have to enable interrupts for a moment if the try_mplock fails * in order to potentially take an IPI. 
XXX this should be in diff --git a/sys/i386/include/asnames.h b/sys/i386/include/asnames.h index 3ccbee6be344..efdb0f9710a1 100644 --- a/sys/i386/include/asnames.h +++ b/sys/i386/include/asnames.h @@ -131,6 +131,7 @@ #define _Xintr7 Xintr7 #define _Xintr8 Xintr8 #define _Xintr9 Xintr9 +#define _Xtintr0 Xtintr0 #define _Xinvltlb Xinvltlb #define _Xrendezvous Xrendezvous #define _Xmchk Xmchk @@ -155,6 +156,7 @@ #define _arith_invalid arith_invalid #define _arith_overflow arith_overflow #define _arith_underflow arith_underflow +#define _ast ast #define _bcopy bcopy #define _bcopy_vector bcopy_vector #define _bigJump bigJump @@ -184,7 +186,6 @@ #define _cnt cnt #define _copyin_vector copyin_vector #define _copyout_vector copyout_vector -#define _cpl cpl #define _cpl_lock cpl_lock #define _cpu cpu #define _cpu0prvpage cpu0prvpage @@ -222,6 +223,7 @@ #define _get_isrlock get_isrlock #define _get_mplock get_mplock #define _get_syscall_lock get_syscall_lock +#define _Giant Giant #define _idle idle #define _ihandlers ihandlers #define _imen imen @@ -232,13 +234,11 @@ #define _intr_countp intr_countp #define _intr_handler intr_handler #define _intr_mask intr_mask -#define _intr_nesting_level intr_nesting_level #define _intr_unit intr_unit #define _intrcnt intrcnt #define _intrnames intrnames #define _invltlb_ok invltlb_ok #define _ioapic ioapic -#define _ipending ipending #define _isr_lock isr_lock #define _kernelname kernelname #define _lapic lapic @@ -249,6 +249,8 @@ #define _mp_gdtbase mp_gdtbase #define _mp_lock mp_lock #define _mp_ncpus mp_ncpus +#define __mtx_enter_giant_def _mtx_enter_giant_def +#define __mtx_exit_giant_def _mtx_exit_giant_def #define _mul64 mul64 #define _net_imask net_imask #define _netisr netisr @@ -281,6 +283,8 @@ #define _round_reg round_reg #define _s_lock s_lock #define _s_unlock s_unlock +#define _sched_ithd sched_ithd +#define _sched_lock sched_lock #define _set_precision_flag_down set_precision_flag_down #define _set_precision_flag_up set_precision_flag_up #define _set_user_ldt set_user_ldt @@ -293,6 +297,7 @@ #define _softclock softclock #define _softnet_imask softnet_imask #define _softtty_imask softtty_imask +#define _spending spending #define _spl0 spl0 #define _splz splz #define _ss_lock ss_lock @@ -326,9 +331,9 @@ #if defined(SMP) || defined(__ELF__) #ifdef SMP -#define FS(x) %fs:gd_ ## x +#define FS(x) %fs:gd_ ## x #else -#define FS(x) x +#define FS(x) x #endif #define _common_tss FS(common_tss) @@ -337,6 +342,8 @@ #define _cpu_lockid FS(cpu_lockid) #define _curpcb FS(curpcb) #define _curproc FS(curproc) +#define _prevproc FS(prevproc) +#define _idleproc FS(idleproc) #define _astpending FS(astpending) #define _currentldt FS(currentldt) #define _inside_intr FS(inside_intr) @@ -353,9 +360,16 @@ #define _ss_eflags FS(ss_eflags) #define _switchticks FS(switchticks) #define _switchtime FS(switchtime) +#define _intr_nesting_level FS(intr_nesting_level) #define _tss_gdt FS(tss_gdt) #define _idlestack FS(idlestack) #define _idlestack_top FS(idlestack_top) +#define _witness_spin_check FS(witness_spin_check) +/* +#define _ktr_idx FS(ktr_idx) +#define _ktr_buf FS(ktr_buf) +#define _ktr_buf_data FS(ktr_buf_data) +*/ #endif diff --git a/sys/i386/include/cpu.h b/sys/i386/include/cpu.h index ffabf7f8ed54..18822b87cc5b 100644 --- a/sys/i386/include/cpu.h +++ b/sys/i386/include/cpu.h @@ -46,6 +46,7 @@ #include #include #include +#include /* * definitions of cpu-dependent requirements @@ -86,7 +87,9 @@ * added, we will have an atomicy problem. 
The type of atomicy we need is * a non-locked orl. */ -#define need_resched() do { astpending = AST_RESCHED|AST_PENDING; } while (0) +#define need_resched() do { \ + PCPU_SET(astpending, AST_RESCHED|AST_PENDING); \ +} while (0) #define resched_wanted() (astpending & AST_RESCHED) /* @@ -109,8 +112,9 @@ * it off (asynchronous need_resched() conflicts are not critical). */ #define signotify(p) aston() - -#define aston() do { astpending |= AST_PENDING; } while (0) +#define aston() do { \ + PCPU_SET(astpending, astpending | AST_PENDING); \ +} while (0) #define astoff() /* @@ -135,7 +139,9 @@ #ifdef _KERNEL extern char btext[]; extern char etext[]; +#ifndef intr_nesting_level extern u_char intr_nesting_level; +#endif void fork_trampoline __P((void)); void fork_return __P((struct proc *, struct trapframe)); diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h index 9a4052fd41d1..39868df422aa 100644 --- a/sys/i386/include/cpufunc.h +++ b/sys/i386/include/cpufunc.h @@ -86,20 +86,29 @@ static __inline void disable_intr(void) { __asm __volatile("cli" : : : "memory"); -#ifdef SMP - MPINTR_LOCK(); -#endif } static __inline void enable_intr(void) { -#ifdef SMP - MPINTR_UNLOCK(); -#endif __asm __volatile("sti"); } +static __inline u_int +save_intr(void) +{ + u_int ef; + + __asm __volatile("pushfl; popl %0" : "=r" (ef)); + return (ef); +} + +static __inline void +restore_intr(u_int ef) +{ + __asm __volatile("pushl %0; popfl" : : "r" (ef) : "memory" ); +} + #define HAVE_INLINE_FFS static __inline int diff --git a/sys/i386/include/globaldata.h b/sys/i386/include/globaldata.h index 58bd9cfe9416..440da60b4b83 100644 --- a/sys/i386/include/globaldata.h +++ b/sys/i386/include/globaldata.h @@ -26,6 +26,20 @@ * $FreeBSD$ */ +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#include +#include +#include +#include +#include + +/* XXX */ +#ifdef KTR_PERCPU +#include +#endif + /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler @@ -41,11 +55,14 @@ struct globaldata { struct privatespace *gd_prvspace; /* self-reference */ struct proc *gd_curproc; + struct proc *gd_prevproc; struct proc *gd_npxproc; struct pcb *gd_curpcb; + struct proc *gd_idleproc; struct timeval gd_switchtime; struct i386tss gd_common_tss; int gd_switchticks; + int gd_intr_nesting_level; struct segment_descriptor gd_common_tssd; struct segment_descriptor *gd_tss_gdt; #ifdef USER_LDT @@ -67,8 +84,22 @@ struct globaldata { unsigned *gd_prv_PADDR1; #endif u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; + int gd_witness_spin_check; +#ifdef KTR_PERCPU +#ifdef KTR + volatile int gd_ktr_idx; + char *gd_ktr_buf; + char gd_ktr_buf_data[KTR_SIZE]; +#endif +#endif }; +extern struct globaldata globaldata; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + #ifdef SMP /* * This is the upper (0xff800000) address space layout that is per-cpu. @@ -93,3 +124,5 @@ struct privatespace { extern struct privatespace SMP_prvspace[]; #endif + +#endif /* ! 
_MACHINE_GLOBALDATA_H_ */ diff --git a/sys/i386/include/globals.h b/sys/i386/include/globals.h index ae05d5644e76..71bbbd580d9e 100644 --- a/sys/i386/include/globals.h +++ b/sys/i386/include/globals.h @@ -74,6 +74,14 @@ __asm("movl %0,%%fs:gd_" #name : : "r" (val)); \ } +static __inline int +_global_globaldata(void) +{ + int val; + __asm("movl %%fs:globaldata,%0" : "=r" (val)); + return (val); +} + #if defined(SMP) || defined(KLD_MODULE) || defined(ACTUALLY_LKM_NOT_KERNEL) /* * The following set of macros works for UP kernel as well, but for maximum @@ -82,18 +90,21 @@ * portability between UP and SMP kernels. */ #define curproc GLOBAL_RVALUE_NV(curproc, struct proc *) +#define prevproc GLOBAL_RVALUE_NV(prevproc, struct proc *) #define curpcb GLOBAL_RVALUE_NV(curpcb, struct pcb *) -#define npxproc GLOBAL_LVALUE(npxproc, struct proc *) +#define npxproc GLOBAL_RVALUE_NV(npxproc, struct proc *) +#define idleproc GLOBAL_RVALUE_NV(idleproc, struct proc *) #define common_tss GLOBAL_LVALUE(common_tss, struct i386tss) #define switchtime GLOBAL_LVALUE(switchtime, struct timeval) #define switchticks GLOBAL_LVALUE(switchticks, int) +#define intr_nesting_level GLOBAL_RVALUE(intr_nesting_level, u_char) #define common_tssd GLOBAL_LVALUE(common_tssd, struct segment_descriptor) #define tss_gdt GLOBAL_LVALUE(tss_gdt, struct segment_descriptor *) -#define astpending GLOBAL_LVALUE(astpending, u_int) +#define astpending GLOBAL_RVALUE(astpending, u_int) #ifdef USER_LDT -#define currentldt GLOBAL_LVALUE(currentldt, int) +#define currentldt GLOBAL_RVALUE(currentldt, int) #endif #ifdef SMP @@ -109,19 +120,32 @@ #define prv_CADDR3 GLOBAL_RVALUE(prv_CADDR3, caddr_t) #define prv_PADDR1 GLOBAL_RVALUE(prv_PADDR1, unsigned *) #endif + +#define witness_spin_check GLOBAL_RVALUE(witness_spin_check, int) + #endif /*UP kernel*/ GLOBAL_FUNC(curproc) +GLOBAL_FUNC(prevproc) GLOBAL_FUNC(astpending) GLOBAL_FUNC(curpcb) GLOBAL_FUNC(npxproc) +GLOBAL_FUNC(idleproc) GLOBAL_FUNC(common_tss) GLOBAL_FUNC(switchtime) GLOBAL_FUNC(switchticks) +GLOBAL_FUNC(intr_nesting_level) GLOBAL_FUNC(common_tssd) GLOBAL_FUNC(tss_gdt) +/* XXX */ +#ifdef KTR_PERCPU +GLOBAL_FUNC(ktr_idx) +GLOBAL_FUNC(ktr_buf) +GLOBAL_FUNC(ktr_buf_data) +#endif + #ifdef USER_LDT GLOBAL_FUNC(currentldt) #endif @@ -140,7 +164,17 @@ GLOBAL_FUNC(prv_CADDR3) GLOBAL_FUNC(prv_PADDR1) #endif -#define SET_CURPROC(x) (_global_curproc_set_nv((int)x)) +GLOBAL_FUNC(witness_spin_check) + +#ifdef SMP +#define GLOBALDATA GLOBAL_RVALUE(globaldata, struct globaldata *) +#else +#define GLOBALDATA (&globaldata) +#endif + +#define CURPROC curproc + +#define PCPU_SET(name, value) (_global_##name##_set((int)value)) #endif /* _KERNEL */ diff --git a/sys/i386/include/ipl.h b/sys/i386/include/ipl.h index 54d3f4b7b4b5..08726df51d84 100644 --- a/sys/i386/include/ipl.h +++ b/sys/i386/include/ipl.h @@ -42,10 +42,20 @@ #include #endif +/* + * Software interrupt level. We treat the software interrupt as a + * single interrupt at a fictive hardware interrupt level. + */ +#define SOFTINTR (NHWI + 0) + /* * Software interrupt bit numbers in priority order. The priority only * determines which swi will be dispatched next; a higher priority swi * may be dispatched when a nested h/w interrupt handler returns. + * + * XXX FIXME: There's no longer a relation between the SWIs and the + * HWIs, so it makes more sense for these values to start at 0, but + * there's lots of code which expects them to start at NHWI. 
*/ #define SWI_TTY (NHWI + 0) #define SWI_NET (NHWI + 1) @@ -104,12 +114,9 @@ #ifdef notyet /* in until pci drivers stop hacking on them */ extern unsigned bio_imask; /* group of interrupts masked with splbio() */ #endif -extern unsigned cpl; /* current priority level mask */ -#ifdef SMP -extern unsigned cil; /* current INTerrupt level mask */ -#endif + extern volatile unsigned idelayed; /* interrupts to become pending */ -extern volatile unsigned ipending; /* active interrupts masked by cpl */ +extern volatile unsigned spending; /* pending software interrupts */ #ifdef notyet /* in until pci drivers stop hacking on them */ extern unsigned net_imask; /* group of interrupts masked with splimp() */ extern unsigned stat_imask; /* interrupts masked with splstatclock() */ diff --git a/sys/i386/include/lock.h b/sys/i386/include/lock.h index 534f77e8d2fb..b4af09d9c579 100644 --- a/sys/i386/include/lock.h +++ b/sys/i386/include/lock.h @@ -36,21 +36,6 @@ #define MPLOCKED lock ; -/* - * Some handy macros to allow logical organization. - */ - -#define MP_LOCK call _get_mplock - -#define MP_TRYLOCK \ - pushl $_mp_lock ; /* GIANT_LOCK */ \ - call _MPtrylock ; /* try to get lock */ \ - add $4, %esp - -#define MP_RELLOCK \ - movl $_mp_lock,%edx ; /* GIANT_LOCK */ \ - call _MPrellock_edx - /* * Protects the IO APIC and apic_imen as a critical region. */ @@ -66,7 +51,8 @@ #define MPLOCKED /* NOP */ -#define MP_LOCK /* NOP */ +#define IMASK_LOCK /* NOP */ +#define IMASK_UNLOCK /* NOP */ #endif /* SMP */ @@ -76,17 +62,6 @@ #include /** xxx_LOCK */ -/* - * Locks regions protected in UP kernel via cli/sti. - */ -#ifdef USE_MPINTRLOCK -#define MPINTR_LOCK() s_lock(&mpintr_lock) -#define MPINTR_UNLOCK() s_unlock(&mpintr_lock) -#else -#define MPINTR_LOCK() -#define MPINTR_UNLOCK() -#endif /* USE_MPINTRLOCK */ - /* * sio/cy lock. * XXX should rc (RISCom/8) use this? 
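With MPINTR_LOCK() gone from disable_intr()/enable_intr(), code that must briefly mask interrupts around a small spin lock now saves and restores the interrupt state explicitly. The fragment below is only a minimal sketch of that pattern, assuming nothing beyond the save_intr()/restore_intr() inlines added above and the CLOCK_LOCK()/CLOCK_UNLOCK() macros that remain defined below; the clock.c hunks later in this diff follow the same shape.

	int intrsave;

	intrsave = save_intr();		/* remember EFLAGS, including PSL_I */
	disable_intr();			/* cli */
	CLOCK_LOCK();			/* short spin lock around the 8254 */
	/* ... program or read the timer registers ... */
	CLOCK_UNLOCK();
	restore_intr(intrsave);		/* popfl: interrupts back to their prior state */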
@@ -94,15 +69,9 @@ #ifdef USE_COMLOCK #define COM_LOCK() s_lock(&com_lock) #define COM_UNLOCK() s_unlock(&com_lock) -#define COM_DISABLE_INTR() \ - { __asm __volatile("cli" : : : "memory"); COM_LOCK(); } -#define COM_ENABLE_INTR() \ - { COM_UNLOCK(); __asm __volatile("sti"); } #else #define COM_LOCK() #define COM_UNLOCK() -#define COM_DISABLE_INTR() disable_intr() -#define COM_ENABLE_INTR() enable_intr() #endif /* USE_COMLOCK */ /* @@ -112,22 +81,13 @@ #ifdef USE_CLOCKLOCK #define CLOCK_LOCK() s_lock(&clock_lock) #define CLOCK_UNLOCK() s_unlock(&clock_lock) -#define CLOCK_DISABLE_INTR() \ - { __asm __volatile("cli" : : : "memory"); CLOCK_LOCK(); } -#define CLOCK_ENABLE_INTR() \ - { CLOCK_UNLOCK(); __asm __volatile("sti"); } #else #define CLOCK_LOCK() #define CLOCK_UNLOCK() -#define CLOCK_DISABLE_INTR() disable_intr() -#define CLOCK_ENABLE_INTR() enable_intr() #endif /* USE_CLOCKLOCK */ #else /* SMP */ -#define MPINTR_LOCK() -#define MPINTR_UNLOCK() - #define COM_LOCK() #define COM_UNLOCK() #define CLOCK_LOCK() @@ -168,6 +128,7 @@ extern struct simplelock clock_lock; extern struct simplelock com_lock; extern struct simplelock mpintr_lock; extern struct simplelock mcount_lock; +extern struct simplelock panic_lock; #if !defined(SIMPLELOCK_DEBUG) && NCPUS > 1 /* diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/i386/include/mptable.h +++ b/sys/i386/include/mptable.h @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/i386/include/mutex.h b/sys/i386/include/mutex.h new file mode 100644 index 000000000000..ef0c9638fc18 --- /dev/null +++ b/sys/i386/include/mutex.h @@ -0,0 +1,786 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $ + * $FreeBSD$ + */ + +#ifndef _MACHINE_MUTEX_H_ +#define _MACHINE_MUTEX_H_ + +#ifndef LOCORE + +#include +#include +#include +#include +#include + +/* + * If kern_mutex.c is being built, compile non-inlined versions of various + * functions so that kernel modules can use them. 
+ */ +#ifndef _KERN_MUTEX_C_ +#define _MTX_INLINE static __inline +#else +#define _MTX_INLINE +#endif + +/* + * Mutex flags + * + * Types + */ +#define MTX_DEF 0x0 /* Default (spin/sleep) */ +#define MTX_SPIN 0x1 /* Spin only lock */ + +/* Options */ +#define MTX_RLIKELY 0x4 /* (opt) Recursion likely */ +#define MTX_NORECURSE 0x8 /* No recursion possible */ +#define MTX_NOSPIN 0x10 /* Don't spin before sleeping */ +#define MTX_NOSWITCH 0x20 /* Do not switch on release */ +#define MTX_FIRST 0x40 /* First spin lock holder */ +#define MTX_TOPHALF 0x80 /* Interrupts not disabled on spin */ + +/* options that should be passed on to mtx_enter_hard, mtx_exit_hard */ +#define MTX_HARDOPTS (MTX_SPIN | MTX_FIRST | MTX_TOPHALF | MTX_NOSWITCH) + +/* Flags/value used in mtx_lock */ +#define MTX_RECURSE 0x01 /* (non-spin) lock held recursively */ +#define MTX_CONTESTED 0x02 /* (non-spin) lock contested */ +#define MTX_FLAGMASK ~(MTX_RECURSE | MTX_CONTESTED) +#define MTX_UNOWNED 0x8 /* Cookie for free mutex */ + +struct proc; /* XXX */ + +/* + * Sleep/spin mutex + */ +struct mtx { + volatile u_int mtx_lock; /* lock owner/gate/flags */ + volatile u_short mtx_recurse; /* number of recursive holds */ + u_short mtx_f1; + u_int mtx_savefl; /* saved flags (for spin locks) */ + char *mtx_description; + TAILQ_HEAD(, proc) mtx_blocked; + LIST_ENTRY(mtx) mtx_contested; + struct mtx *mtx_next; /* all locks in system */ + struct mtx *mtx_prev; +#ifdef SMP_DEBUG + /* If you add anything here, adjust the mtxf_t definition below */ + struct witness *mtx_witness; + LIST_ENTRY(mtx) mtx_held; + char *mtx_file; + int mtx_line; +#endif /* SMP_DEBUG */ +}; + +typedef struct mtx mtx_t; + +/* + * Filler for structs which need to remain the same size + * whether or not SMP_DEBUG is turned on. + */ +typedef struct mtxf { +#ifdef SMP_DEBUG + char mtxf_data[0]; +#else + char mtxf_data[4*sizeof(void *) + sizeof(int)]; +#endif +} mtxf_t; + +#define mp_fixme(string) + +#ifdef _KERNEL +/* Misc */ +#define CURTHD ((u_int)CURPROC) /* Current thread ID */ + +/* Prototypes */ +void mtx_init(mtx_t *m, char *description, int flag); +void mtx_enter_hard(mtx_t *, int type, int flags); +void mtx_exit_hard(mtx_t *, int type); +void mtx_destroy(mtx_t *m); + +#if (defined(KLD_MODULE) || defined(_KERN_MUTEX_C_)) +void mtx_enter(mtx_t *mtxp, int type); +int mtx_try_enter(mtx_t *mtxp, int type); +void mtx_exit(mtx_t *mtxp, int type); +#endif + +/* Global locks */ +extern mtx_t sched_lock; +extern mtx_t Giant; + +/* + * Used to replace return with an exit Giant and return. 
+ */ + +#define EGAR(a) \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return (a); \ +} while (0) + +#define VEGAR \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return; \ +} while (0) + +#define DROP_GIANT() \ +do { \ + int _giantcnt; \ + WITNESS_SAVE_DECL(Giant); \ + \ + WITNESS_SAVE(&Giant, Giant); \ + for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++) \ + mtx_exit(&Giant, MTX_DEF) + +#define PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant); \ +} while (0) + +#define PARTIAL_PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant) + + +/* + * Debugging + */ +#ifndef SMP_DEBUG +#define mtx_assert(m, what) +#else /* SMP_DEBUG */ + +#define MA_OWNED 1 +#define MA_NOTOWNED 2 +#define mtx_assert(m, what) { \ + switch ((what)) { \ + case MA_OWNED: \ + ASS(mtx_owned((m))); \ + break; \ + case MA_NOTOWNED: \ + ASS(!mtx_owned((m))); \ + break; \ + default: \ + panic("unknown mtx_assert at %s:%d", __FILE__, __LINE__); \ + } \ +} + +#ifdef INVARIANTS +#define ASS(ex) MPASS(ex) +#define MPASS(ex) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + #ex, __FILE__, __LINE__) +#define MPASS2(ex, what) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + what, __FILE__, __LINE__) + +#ifdef MTX_STRS +char STR_IEN[] = "fl & 0x200"; +char STR_IDIS[] = "!(fl & 0x200)"; +#else /* MTX_STRS */ +extern char STR_IEN[]; +extern char STR_IDIS[]; +#endif /* MTX_STRS */ +#define ASS_IEN MPASS2(read_eflags() & 0x200, STR_IEN) +#define ASS_IDIS MPASS2((read_eflags() & 0x200) == 0, STR_IDIS) +#endif /* INVARIANTS */ + +#endif /* SMP_DEBUG */ + +#if !defined(SMP_DEBUG) || !defined(INVARIANTS) +#define ASS(ex) +#define MPASS(ex) +#define MPASS2(ex, where) +#define ASS_IEN +#define ASS_IDIS +#endif /* !defined(SMP_DEBUG) || !defined(INVARIANTS) */ + +#ifdef WITNESS +#ifndef SMP_DEBUG +#error WITNESS requires SMP_DEBUG +#endif /* SMP_DEBUG */ +#define WITNESS_ENTER(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_enter((m), (f), __FILE__, __LINE__) +#define WITNESS_EXIT(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_exit((m), (f), __FILE__, __LINE__) + +#define WITNESS_SLEEP(check, m) witness_sleep(check, (m), __FILE__, __LINE__) +#define WITNESS_SAVE_DECL(n) \ + char * __CONCAT(n, __wf); \ + int __CONCAT(n, __wl) + +#define WITNESS_SAVE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_save(m, &__CONCAT(n, __wf), &__CONCAT(n, __wl)); \ +} while (0) + +#define WITNESS_RESTORE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_restore(m, __CONCAT(n, __wf), __CONCAT(n, __wl)); \ +} while (0) + +void witness_init(mtx_t *, int flag); +void witness_destroy(mtx_t *); +void witness_enter(mtx_t *, int, char *, int); +void witness_try_enter(mtx_t *, int, char *, int); +void witness_exit(mtx_t *, int, char *, int); +void witness_display(void(*)(const char *fmt, ...)); +void witness_list(struct proc *); +int witness_sleep(int, mtx_t *, char *, int); +void witness_save(mtx_t *, char **, int *); +void witness_restore(mtx_t *, char *, int); +#else /* WITNESS */ +#define WITNESS_ENTER(m, flag) +#define WITNESS_EXIT(m, flag) +#define WITNESS_SLEEP(check, m) +#define WITNESS_SAVE_DECL(n) +#define WITNESS_SAVE(m, n) +#define WITNESS_RESTORE(m, n) + +/* + * flag++ is slezoid way of shutting up unused parameter warning + * in mtx_init() + */ +#define witness_init(m, flag) flag++ +#define witness_destroy(m) +#define witness_enter(m, flag, f, l) 
+#define witness_try_enter(m, flag, f, l ) +#define witness_exit(m, flag, f, l) +#endif /* WITNESS */ + +/* + * Assembly macros (for internal use only) + *------------------------------------------------------------------------------ + */ + +#define _V(x) __STRING(x) + +#ifndef I386_CPU + +/* + * For 486 and newer processors. + */ + +/* Get a sleep lock, deal with recursion inline. */ +#define _getlock_sleep(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %3,%1;" /* Try */ \ +" jz 1f;" /* Got it */ \ +" andl $" _V(MTX_FLAGMASK) ",%%eax;" /* turn off spec bits */ \ +" cmpl %%eax,%3;" /* already have it? */ \ +" je 2f;" /* yes, recurse */ \ +" pushl %4;" \ +" pushl %5;" \ +" call mtx_enter_hard;" \ +" addl $8,%%esp;" \ +" jmp 1f;" \ +"2: lock; orl $" _V(MTX_RECURSE) ",%1;" \ +" incw %2;" \ +"1:" \ +"# getlock_sleep" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock), /* 1 */ \ + "+m" (mtxp->mtx_recurse) /* 2 */ \ + : "r" (tid), /* 3 (input) */ \ + "gi" (type), /* 4 */ \ + "g" (mtxp) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* Get a spin lock, handle recursion inline (as the less common case) */ +#define _getlock_spin_block(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" pushfl;" \ +" cli;" \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %3,%1;" /* Try */ \ +" jz 2f;" /* got it */ \ +" pushl %4;" \ +" pushl %5;" \ +" call mtx_enter_hard;" /* mtx_enter_hard(mtxp, type, oflags) */ \ +" addl $0xc,%%esp;" \ +" jmp 1f;" \ +"2: popl %2;" /* save flags */ \ +"1:" \ +"# getlock_spin_block" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock), /* 1 */ \ + "=m" (mtxp->mtx_savefl) /* 2 */ \ + : "r" (tid), /* 3 (input) */ \ + "gi" (type), /* 4 */ \ + "g" (mtxp) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Get a lock without any recursion handling. Calls the hard enter function if + * we can't get it inline. + */ +#define _getlock_norecurse(mtxp, tid, type) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movl $" _V(MTX_UNOWNED) ",%%eax;" /* Unowned cookie */ \ +" " MPLOCKED "" \ +" cmpxchgl %2,%1;" /* Try */ \ +" jz 1f;" /* got it */ \ +" pushl %3;" \ +" pushl %4;" \ +" call mtx_enter_hard;" /* mtx_enter_hard(mtxp, type) */ \ +" addl $8,%%esp;" \ +"1:" \ +"# getlock_norecurse" \ + : "=&a" (_res), /* 0 (dummy output) */ \ + "+m" (mtxp->mtx_lock) /* 1 */ \ + : "r" (tid), /* 2 (input) */ \ + "gi" (type), /* 3 */ \ + "g" (mtxp) /* 4 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is handled + * in the hard function. + */ +#define _exitlock_norecurse(mtxp, tid, type) ({ \ + int _tid = (int)(tid); \ + \ + __asm __volatile ( \ +" " MPLOCKED "" \ +" cmpxchgl %4,%0;" /* try easy rel */ \ +" jz 1f;" /* released! */ \ +" pushl %2;" \ +" pushl %3;" \ +" call mtx_exit_hard;" \ +" addl $8,%%esp;" \ +"1:" \ +"# exitlock_norecurse" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+a" (_tid) /* 1 */ \ + : "gi" (type), /* 2 (input) */ \ + "g" (mtxp), /* 3 */ \ + "r" (MTX_UNOWNED) /* 4 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a sleep lock when its likely we recursed (the code to + * deal with simple recursion is inline). 
+ */ +#define _exitlock(mtxp, tid, type) ({ \ + int _tid = (int)(tid); \ + \ + __asm __volatile ( \ +" " MPLOCKED "" \ +" cmpxchgl %5,%0;" /* try easy rel */ \ +" jz 1f;" /* released! */ \ +" testl $" _V(MTX_RECURSE) ",%%eax;" /* recursed? */ \ +" jnz 3f;" /* handle recursion */ \ + /* Lock not recursed and contested: do the hard way */ \ +" pushl %3;" \ +" pushl %4;" \ +" call mtx_exit_hard;" /* mtx_exit_hard(mtxp,type) */ \ +" addl $8,%%esp;" \ +" jmp 1f;" \ + /* lock recursed, lower recursion level */ \ +"3: decw %1;" /* one less level */ \ +" jnz 1f;" /* still recursed, done */ \ +" lock; andl $~" _V(MTX_RECURSE) ",%0;" /* turn off recurse flag */ \ +"1:" \ +"# exitlock" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+m" (mtxp->mtx_recurse), /* 1 */ \ + "+a" (_tid) /* 2 */ \ + : "gi" (type), /* 3 (input) */ \ + "g" (mtxp), /* 4 */ \ + "r" (MTX_UNOWNED) /* 5 */ \ + : "memory", "ecx", "edx" /* used */ ); \ +}) + +/* + * Release a spin lock (with possible recursion). + * + * We use cmpxchgl to clear lock (instead of simple store) to flush posting + * buffers and make the change visible to other CPU's. + */ +#define _exitlock_spin(mtxp, inten1, inten2) ({ \ + int _res; \ + \ + __asm __volatile ( \ +" movw %1,%%ax;" \ +" decw %%ax;" \ +" js 1f;" \ +" movw %%ax,%1;" \ +" jmp 2f;" \ +"1: movl %0,%%eax;" \ +" movl $ " _V(MTX_UNOWNED) ",%%ecx;" \ +" " inten1 ";" \ +" " MPLOCKED "" \ +" cmpxchgl %%ecx,%0;" \ +" " inten2 ";" \ +"2:" \ +"# exitlock_spin" \ + : "+m" (mtxp->mtx_lock), /* 0 */ \ + "+m" (mtxp->mtx_recurse), /* 1 */ \ + "=&a" (_res) /* 2 */ \ + : "g" (mtxp->mtx_savefl) /* 3 (used in 'inten') */ \ + : "memory", "ecx" /* used */ ); \ +}) + +#else /* I386_CPU */ + +/* + * For 386 processors only. + */ + +/* Get a sleep lock, deal with recursion inline. */ +#define _getlock_sleep(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) { \ + if (((mp)->mtx_lock & MTX_FLAGMASK) != (tid)) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, 0); \ + else { \ + atomic_set_int(&(mp)->mtx_lock, MTX_RECURSE); \ + (mp)->mtx_recurse++; \ + } \ + } \ +} while (0) + +/* Get a spin lock, handle recursion inline (as the less common case) */ +#define _getlock_spin_block(mp, tid, type) do { \ + u_int _mtx_fl = read_eflags(); \ + disable_intr(); \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, _mtx_fl); \ + else \ + (mp)->mtx_savefl = _mtx_fl; \ +} while (0) + +/* + * Get a lock without any recursion handling. Calls the hard enter function if + * we can't get it inline. + */ +#define _getlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard((mp), (type) & MTX_HARDOPTS, 0); \ +} while (0) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is handled + * in the hard function. + */ +#define _exitlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_int(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ +} while (0) + +/* + * Release a sleep lock when its likely we recursed (the code to + * deal with simple recursion is inline). 
+ */
+#define _exitlock(mp, tid, type) do { \
+ if (atomic_cmpset_int(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) { \
+ if ((mp)->mtx_lock & MTX_RECURSE) { \
+ if (--((mp)->mtx_recurse) == 0) \
+ atomic_clear_int(&(mp)->mtx_lock, \
+ MTX_RECURSE); \
+ } else { \
+ mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \
+ } \
+ } \
+} while (0)
+
+/* Release a spin lock (with possible recursion). */
+#define _exitlock_spin(mp, inten1, inten2) do { \
+ if ((mp)->mtx_recurse == 0) { \
+ atomic_cmpset_int(&(mp)->mtx_lock, (mp)->mtx_lock, \
+ MTX_UNOWNED); \
+ write_eflags((mp)->mtx_savefl); \
+ } else { \
+ (mp)->mtx_recurse--; \
+ } \
+} while (0)
+
+#endif /* I386_CPU */
+
+/*
+ * Externally visible mutex functions.
+ *------------------------------------------------------------------------------
+ */
+
+/*
+ * Return non-zero if a mutex is already owned by the current thread.
+ */
+#define mtx_owned(m) (((m)->mtx_lock & MTX_FLAGMASK) == CURTHD)
+
+/* Common strings */
+#ifdef MTX_STRS
+#ifdef KTR_EXTEND
+
+/*
+ * KTR_EXTEND saves file name and line for all entries, so we don't need them
+ * here. Theoretically we should also change the entries which refer to them
+ * (from CTR5 to CTR3), but since they're just passed to snprintf as the last
+ * parameters, it doesn't do any harm to leave them.
+ */
+char STR_mtx_enter_fmt[] = "GOT %s [%x] r=%d";
+char STR_mtx_exit_fmt[] = "REL %s [%x] r=%d";
+char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%x] result=%d";
+#else
+char STR_mtx_enter_fmt[] = "GOT %s [%x] at %s:%d r=%d";
+char STR_mtx_exit_fmt[] = "REL %s [%x] at %s:%d r=%d";
+char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%x] at %s:%d result=%d";
+#endif
+char STR_mtx_bad_type[] = "((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0";
+char STR_mtx_owned[] = "mtx_owned(_mpp)";
+char STR_mtx_recurse[] = "_mpp->mtx_recurse == 0";
+#else /* MTX_STRS */
+extern char STR_mtx_enter_fmt[];
+extern char STR_mtx_bad_type[];
+extern char STR_mtx_exit_fmt[];
+extern char STR_mtx_owned[];
+extern char STR_mtx_recurse[];
+extern char STR_mtx_try_enter_fmt[];
+#endif /* MTX_STRS */
+
+#ifndef KLD_MODULE
+/*
+ * Get lock 'm'; the macro handles the easy (and most common) cases and leaves
+ * the slow stuff to the mtx_enter_hard() function.
+ *
+ * Note: since type is usually a constant, much of this code is optimized out.
+ */
+_MTX_INLINE void
+mtx_enter(mtx_t *mtxp, int type)
+{
+ mtx_t *_mpp = mtxp;
+
+ /* bits only valid on mtx_exit() */
+ MPASS2(((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0,
+ STR_mtx_bad_type);
+
+ do {
+ if ((type) & MTX_SPIN) {
+ /*
+ * Easy cases of spin locks:
+ *
+ * 1) We already own the lock and will simply
+ * recurse on it (if RLIKELY)
+ *
+ * 2) The lock is free, we just get it
+ */
+ if ((type) & MTX_RLIKELY) {
+ /*
+ * Check for recursion; if we already
+ * have this lock we just bump the
+ * recursion count.
+ */
+ if (_mpp->mtx_lock == CURTHD) {
+ _mpp->mtx_recurse++;
+ break; /* Done */
+ }
+ }
+
+ if (((type) & MTX_TOPHALF) == 0) {
+ /*
+ * If an interrupt thread uses this
+ * we must block interrupts here.
+ */ + if ((type) & MTX_FIRST) { + ASS_IEN; + disable_intr(); + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } else { + _getlock_spin_block(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } else + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } else { + /* Sleep locks */ + if ((type) & MTX_RLIKELY) + _getlock_sleep(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + else + _getlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } while (0); + WITNESS_ENTER(_mpp, type); + CTR5(KTR_LOCK, STR_mtx_enter_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, + (_mpp)->mtx_recurse); +} + +/* + * Attempt to get MTX_DEF lock, return non-zero if lock acquired. + * + * XXX DOES NOT HANDLE RECURSION + */ +_MTX_INLINE int +mtx_try_enter(mtx_t *mtxp, int type) +{ + mtx_t *const _mpp = mtxp; + int _rval; + + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD); +#ifdef SMP_DEBUG + if (_rval && (_mpp)->mtx_witness != NULL) { + ASS((_mpp)->mtx_recurse == 0); + witness_try_enter(_mpp, type, __FILE__, __LINE__); + } +#endif + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, _rval); + + return _rval; +} + +#define mtx_legal2block() (read_eflags() & 0x200) + +/* + * Release lock m. + */ +_MTX_INLINE void +mtx_exit(mtx_t *mtxp, int type) +{ + mtx_t *const _mpp = mtxp; + + MPASS2(mtx_owned(_mpp), STR_mtx_owned); + WITNESS_EXIT(_mpp, type); + CTR5(KTR_LOCK, STR_mtx_exit_fmt, + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, + (_mpp)->mtx_recurse); + if ((type) & MTX_SPIN) { + if ((type) & MTX_NORECURSE) { + MPASS2(_mpp->mtx_recurse == 0, STR_mtx_recurse); + atomic_cmpset_int(&_mpp->mtx_lock, _mpp->mtx_lock, + MTX_UNOWNED); + if (((type) & MTX_TOPHALF) == 0) { + if ((type) & MTX_FIRST) { + ASS_IDIS; + enable_intr(); + } else + write_eflags(_mpp->mtx_savefl); + } + } else { + if ((type) & MTX_TOPHALF) + _exitlock_spin(_mpp,,); + else { + if ((type) & MTX_FIRST) { + ASS_IDIS; + _exitlock_spin(_mpp,, "sti"); + } else { + _exitlock_spin(_mpp, + "pushl %3", "popfl"); + } + } + } + } else { + /* Handle sleep locks */ + if ((type) & MTX_RLIKELY) + _exitlock(_mpp, CURTHD, (type) & MTX_HARDOPTS); + else { + _exitlock_norecurse(_mpp, CURTHD, + (type) & MTX_HARDOPTS); + } + } +} + +#endif /* KLD_MODULE */ +#endif /* _KERNEL */ + +#else /* !LOCORE */ + +/* + * Simple assembly macros to get and release non-recursive spin locks + */ + +#if defined(I386_CPU) + +#define MTX_EXIT(lck, reg) \ + movl $ MTX_UNOWNED,lck+MTX_LOCK; + +#else /* I386_CPU */ + +#define MTX_ENTER(reg, lck) \ +9: movl $ MTX_UNOWNED,%eax; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + jnz 9b + +/* Must use locked bus op (cmpxchg) when setting to unowned (barrier) */ +#define MTX_EXIT(lck,reg) \ + movl lck+MTX_LOCK,%eax; \ + movl $ MTX_UNOWNED,reg; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + +#define MTX_ENTER_WITH_RECURSION(reg, lck) \ + movl lck+MTX_LOCK,%eax; \ + cmpl PCPU_CURPROC,%eax; \ + jne 9f; \ + incw lck+MTX_RECURSECNT; \ + jmp 8f; \ +9: movl $ MTX_UNOWNED,%eax; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ + jnz 9b; \ +8: + +#define MTX_EXIT_WITH_RECURSION(lck,reg) \ + movw lck+MTX_RECURSECNT,%ax; \ + decw %ax; \ + js 9f; \ + movw %ax,lck+MTX_RECURSECNT; \ + jmp 8f; \ +9: movl lck+MTX_LOCK,%eax; \ + movl $ MTX_UNOWNED,reg; \ + MPLOCKED \ + cmpxchgl reg,lck+MTX_LOCK; \ +8: + +#endif /* I386_CPU */ +#endif /* !LOCORE */ +#endif /* __MACHINE_MUTEX_H */ diff --git a/sys/i386/include/pcb.h b/sys/i386/include/pcb.h index 08beb5a83059..1c7af8505ab1 100644 --- 
a/sys/i386/include/pcb.h +++ b/sys/i386/include/pcb.h @@ -72,11 +72,7 @@ struct pcb { #define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ caddr_t pcb_onfault; /* copyin/out fault recovery */ -#ifdef SMP - u_long pcb_mpnest; -#else - u_long pcb_mpnest_dontuse; -#endif + int pcb_schednest; int pcb_gs; struct pcb_ext *pcb_ext; /* optional pcb extension */ u_long __pcb_spare[3]; /* adjust to avoid core dump size changes */ diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h index 58bd9cfe9416..440da60b4b83 100644 --- a/sys/i386/include/pcpu.h +++ b/sys/i386/include/pcpu.h @@ -26,6 +26,20 @@ * $FreeBSD$ */ +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#include +#include +#include +#include +#include + +/* XXX */ +#ifdef KTR_PERCPU +#include +#endif + /* * This structure maps out the global data that needs to be kept on a * per-cpu basis. genassym uses this to generate offsets for the assembler @@ -41,11 +55,14 @@ struct globaldata { struct privatespace *gd_prvspace; /* self-reference */ struct proc *gd_curproc; + struct proc *gd_prevproc; struct proc *gd_npxproc; struct pcb *gd_curpcb; + struct proc *gd_idleproc; struct timeval gd_switchtime; struct i386tss gd_common_tss; int gd_switchticks; + int gd_intr_nesting_level; struct segment_descriptor gd_common_tssd; struct segment_descriptor *gd_tss_gdt; #ifdef USER_LDT @@ -67,8 +84,22 @@ struct globaldata { unsigned *gd_prv_PADDR1; #endif u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; + int gd_witness_spin_check; +#ifdef KTR_PERCPU +#ifdef KTR + volatile int gd_ktr_idx; + char *gd_ktr_buf; + char gd_ktr_buf_data[KTR_SIZE]; +#endif +#endif }; +extern struct globaldata globaldata; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + #ifdef SMP /* * This is the upper (0xff800000) address space layout that is per-cpu. @@ -93,3 +124,5 @@ struct privatespace { extern struct privatespace SMP_prvspace[]; #endif + +#endif /* ! _MACHINE_GLOBALDATA_H_ */ diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index 69b716ba8579..20d4fa3a8873 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -15,6 +15,9 @@ #ifdef _KERNEL +#ifdef I386_CPU +#error SMP not supported with I386_CPU +#endif #if defined(SMP) && !defined(APIC_IO) # error APIC_IO required for SMP, add "options APIC_IO" to your config file. 
#endif /* SMP && !APIC_IO */ @@ -57,23 +60,6 @@ extern int bootMP_size; /* functions in mpboot.s */ void bootMP __P((void)); -/* global data in mplock.s */ -extern u_int mp_lock; -extern u_int isr_lock; -#ifdef RECURSIVE_MPINTRLOCK -extern u_int mpintr_lock; -#endif /* RECURSIVE_MPINTRLOCK */ - -/* functions in mplock.s */ -void get_mplock __P((void)); -void rel_mplock __P((void)); -int try_mplock __P((void)); -#ifdef RECURSIVE_MPINTRLOCK -void get_mpintrlock __P((void)); -void rel_mpintrlock __P((void)); -int try_mpintrlock __P((void)); -#endif /* RECURSIVE_MPINTRLOCK */ - /* global data in apic_vector.s */ extern volatile u_int stopped_cpus; extern volatile u_int started_cpus; @@ -185,23 +171,7 @@ extern int smp_started; extern volatile int smp_idle_loops; #endif /* !LOCORE */ -#else /* !SMP && !APIC_IO */ - -/* - * Create dummy MP lock empties - */ - -static __inline void -get_mplock(void) -{ -} - -static __inline void -rel_mplock(void) -{ -} - -#endif +#endif /* SMP && !APIC_IO */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/i386/include/smptests.h b/sys/i386/include/smptests.h index f9ac4a36919e..304e99051295 100644 --- a/sys/i386/include/smptests.h +++ b/sys/i386/include/smptests.h @@ -86,7 +86,6 @@ * These defines enable critical region locking of areas that were * protected via cli/sti in the UP kernel. * - * MPINTRLOCK protects all the generic areas. * COMLOCK protects the sio/cy drivers. * CLOCKLOCK protects clock hardware and data * known to be incomplete: @@ -94,7 +93,6 @@ * ? */ #ifdef PUSHDOWN_LEVEL_1 -#define USE_MPINTRLOCK #define USE_COMLOCK #define USE_CLOCKLOCK #endif @@ -176,9 +174,8 @@ /* * Send CPUSTOP IPI for stop/restart of other CPUs on DDB break. - * -#define VERBOSE_CPUSTOP_ON_DDBBREAK */ +#define VERBOSE_CPUSTOP_ON_DDBBREAK #define CPUSTOP_ON_DDBBREAK diff --git a/sys/i386/isa/apic_ipl.s b/sys/i386/isa/apic_ipl.s index 94771f3eadb3..0def1de7e02d 100644 --- a/sys/i386/isa/apic_ipl.s +++ b/sys/i386/isa/apic_ipl.s @@ -68,78 +68,6 @@ _apic_imen: .text SUPERALIGN_TEXT -/* - * splz() - dispatch pending interrupts after cpl reduced - * - * Interrupt priority mechanism - * -- soft splXX masks with group mechanism (cpl) - * -- h/w masks for currently active or unused interrupts (imen) - * -- ipending = active interrupts currently masked by cpl - */ - -ENTRY(splz) - /* - * The caller has restored cpl and checked that (ipending & ~cpl) - * is nonzero. However, since ipending can change at any time - * (by an interrupt or, with SMP, by another cpu), we have to - * repeat the check. At the moment we must own the MP lock in - * the SMP case because the interruput handlers require it. We - * loop until no unmasked pending interrupts remain. - * - * No new unmaksed pending interrupts will be added during the - * loop because, being unmasked, the interrupt code will be able - * to execute the interrupts. - * - * Interrupts come in two flavors: Hardware interrupts and software - * interrupts. We have to detect the type of interrupt (based on the - * position of the interrupt bit) and call the appropriate dispatch - * routine. - * - * NOTE: "bsfl %ecx,%ecx" is undefined when %ecx is 0 so we can't - * rely on the secondary btrl tests. - */ - movl _cpl,%eax -splz_next: - /* - * We don't need any locking here. (ipending & ~cpl) cannot grow - * while we're looking at it - any interrupt will shrink it to 0. 
- */ - movl %eax,%ecx - notl %ecx /* set bit = unmasked level */ - andl _ipending,%ecx /* set bit = unmasked pending INT */ - jne splz_unpend - ret - - ALIGN_TEXT -splz_unpend: - bsfl %ecx,%ecx - lock - btrl %ecx,_ipending - jnc splz_next - cmpl $NHWI,%ecx - jae splz_swi - /* - * We would prefer to call the intr handler directly here but that - * doesn't work for badly behaved handlers that want the interrupt - * frame. Also, there's a problem determining the unit number. - * We should change the interface so that the unit number is not - * determined at config time. - * - * The vec[] routines build the proper frame on the stack, - * then call one of _Xintr0 thru _XintrNN. - */ - jmp *_vec(,%ecx,4) - - ALIGN_TEXT -splz_swi: - pushl %eax - orl imasks(,%ecx,4),%eax - movl %eax,_cpl - call *_ihandlers(,%ecx,4) - popl %eax - movl %eax,_cpl - jmp splz_next - /* * Fake clock interrupt(s) so that they appear to come from our caller instead * of from here, so that system profiling works. @@ -161,8 +89,6 @@ __CONCAT(vec,irq_num): ; \ pushl $KCSEL ; \ pushl %eax ; \ cli ; \ - lock ; /* MP-safe */ \ - andl $~IRQ_BIT(irq_num), iactive ; /* lazy masking */ \ MEXITCOUNT ; \ APIC_ITRACE(apic_itrace_splz, irq_num, APIC_ITRACE_SPLZ) ; \ jmp __CONCAT(_Xintr,irq_num) diff --git a/sys/i386/isa/apic_vector.s b/sys/i386/isa/apic_vector.s index 2a7559df7f97..54bf00366c81 100644 --- a/sys/i386/isa/apic_vector.s +++ b/sys/i386/isa/apic_vector.s @@ -17,7 +17,7 @@ /* - * Macros for interrupt interrupt entry, call to handler, and exit. + * Macros for interrupt entry, call to handler, and exit. */ #define FAST_INTR(irq_num, vec_name) \ @@ -121,7 +121,7 @@ IDTVEC(vec_name) ; \ /* - * Test to see if the source is currntly masked, clear if so. + * Test to see if the source is currently masked, clear if so. */ #define UNMASK_IRQ(irq_num) \ IMASK_LOCK ; /* into critical reg */ \ @@ -200,7 +200,16 @@ log_intr_event: #else #define APIC_ITRACE(name, irq_num, id) #endif - + +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -216,87 +225,24 @@ IDTVEC(vec_name) ; \ maybe_extra_ipending ; \ ; \ APIC_ITRACE(apic_itrace_enter, irq_num, APIC_ITRACE_ENTER) ; \ - lock ; /* MP-safe */ \ - btsl $(irq_num), iactive ; /* lazy masking */ \ - jc 1f ; /* already active */ \ ; \ MASK_LEVEL_IRQ(irq_num) ; \ EOI_IRQ(irq_num) ; \ 0: ; \ - APIC_ITRACE(apic_itrace_tryisrlock, irq_num, APIC_ITRACE_TRYISRLOCK) ;\ - MP_TRYLOCK ; /* XXX this is going away... */ \ - testl %eax, %eax ; /* did we get it? */ \ - jz 3f ; /* no */ \ -; \ - APIC_ITRACE(apic_itrace_gotisrlock, irq_num, APIC_ITRACE_GOTISRLOCK) ;\ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 2f ; /* this INT masked */ \ -; \ incb _intr_nesting_level ; \ ; \ /* entry point used by doreti_unpend for HWIs. 
*/ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX avoid dbl cnt */ \ - lock ; incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4, %eax ; \ - lock ; incl (%eax) ; \ -; \ - movl _cpl, %eax ; \ - pushl %eax ; \ - orl _intr_mask + (irq_num) * 4, %eax ; \ - movl %eax, _cpl ; \ - lock ; \ - andl $~IRQ_BIT(irq_num), _ipending ; \ -; \ - pushl _intr_unit + (irq_num) * 4 ; \ + pushl $irq_num; /* pass the IRQ */ \ APIC_ITRACE(apic_itrace_enter2, irq_num, APIC_ITRACE_ENTER2) ; \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ APIC_ITRACE(apic_itrace_leave, irq_num, APIC_ITRACE_LEAVE) ; \ ; \ - lock ; andl $~IRQ_BIT(irq_num), iactive ; \ - UNMASK_IRQ(irq_num) ; \ - APIC_ITRACE(apic_itrace_unmask, irq_num, APIC_ITRACE_UNMASK) ; \ - sti ; /* doreti repeats cli/sti */ \ MEXITCOUNT ; \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -1: ; /* active */ \ - APIC_ITRACE(apic_itrace_active, irq_num, APIC_ITRACE_ACTIVE) ; \ - MASK_IRQ(irq_num) ; \ - EOI_IRQ(irq_num) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - lock ; \ - btsl $(irq_num), iactive ; /* still active */ \ - jnc 0b ; /* retry */ \ - POP_FRAME ; \ - iret ; /* XXX: iactive bit might be 0 now */ \ - ALIGN_TEXT ; \ -2: ; /* masked by cpl, leave iactive set */ \ - APIC_ITRACE(apic_itrace_masked, irq_num, APIC_ITRACE_MASKED) ; \ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - MP_RELLOCK ; \ - POP_FRAME ; \ - iret ; \ - ALIGN_TEXT ; \ -3: ; /* other cpu has isr lock */ \ - APIC_ITRACE(apic_itrace_noisrlock, irq_num, APIC_ITRACE_NOISRLOCK) ;\ - lock ; \ - orl $IRQ_BIT(irq_num), _ipending ; \ - testl $IRQ_BIT(irq_num), _cpl ; \ - jne 4f ; /* this INT masked */ \ - call forward_irq ; /* forward irq to lock holder */ \ - POP_FRAME ; /* and return */ \ - iret ; \ - ALIGN_TEXT ; \ -4: ; /* blocked */ \ - APIC_ITRACE(apic_itrace_masked2, irq_num, APIC_ITRACE_MASKED2) ;\ - POP_FRAME ; /* and return */ \ - iret + jmp doreti_next /* * Handle "spurious INTerrupts". @@ -434,20 +380,10 @@ _Xcpuast: FAKE_MCOUNT(13*4(%esp)) - /* - * Giant locks do not come cheap. - * A lot of cycles are going to be wasted here. - */ - call _get_mplock - - movl _cpl, %eax - pushl %eax orl $AST_PENDING, _astpending /* XXX */ incb _intr_nesting_level sti - pushl $0 - movl _cpuid, %eax lock btrl %eax, _checkstate_pending_ast @@ -461,7 +397,7 @@ _Xcpuast: lock incl CNAME(cpuast_cnt) MEXITCOUNT - jmp _doreti + jmp doreti_next 1: /* We are already in the process of delivering an ast for this CPU */ POP_FRAME @@ -487,40 +423,24 @@ _Xforward_irq: FAKE_MCOUNT(13*4(%esp)) - MP_TRYLOCK - testl %eax,%eax /* Did we get the lock ? 
*/ - jz 1f /* No */ - lock incl CNAME(forward_irq_hitcnt) cmpb $4, _intr_nesting_level - jae 2f + jae 1f - movl _cpl, %eax - pushl %eax incb _intr_nesting_level sti - pushl $0 - MEXITCOUNT - jmp _doreti /* Handle forwarded interrupt */ + jmp doreti_next /* Handle forwarded interrupt */ 1: - lock - incl CNAME(forward_irq_misscnt) - call forward_irq /* Oops, we've lost the isr lock */ - MEXITCOUNT - POP_FRAME - iret -2: lock incl CNAME(forward_irq_toodeepcnt) -3: - MP_RELLOCK MEXITCOUNT POP_FRAME iret +#if 0 /* * */ @@ -532,9 +452,11 @@ forward_irq: cmpl $0, CNAME(forward_irq_enabled) jz 4f +/* XXX - this is broken now, because mp_lock doesn't exist movl _mp_lock,%eax cmpl $FREE_LOCK,%eax jne 1f + */ movl $0, %eax /* Pick CPU #0 if noone has lock */ 1: shrl $24,%eax @@ -559,6 +481,7 @@ forward_irq: jnz 3b 4: ret +#endif /* * Executed by a CPU when it receives an Xcpustop IPI from another CPU, @@ -654,6 +577,7 @@ MCOUNT_LABEL(bintr) FAST_INTR(22,fastintr22) FAST_INTR(23,fastintr23) #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, CLKINTR_PENDING) INTR(1,intr1,) INTR(2,intr2,) @@ -728,15 +652,11 @@ _ihandlers: .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - +#if 0 /* active flag for lazy masking */ iactive: .long 0 +#endif #ifdef COUNT_XINVLTLB_HITS .globl _xhits diff --git a/sys/i386/isa/atpic_vector.s b/sys/i386/isa/atpic_vector.s index e427351ca205..d2b88bf705a3 100644 --- a/sys/i386/isa/atpic_vector.s +++ b/sys/i386/isa/atpic_vector.s @@ -53,9 +53,11 @@ IDTVEC(vec_name) ; \ pushl %ecx ; \ pushl %edx ; \ pushl %ds ; \ + pushl %fs ; \ MAYBE_PUSHL_ES ; \ mov $KDSEL,%ax ; \ mov %ax,%ds ; \ + mov %ax,%fs ; \ MAYBE_MOVW_AX_ES ; \ FAKE_MCOUNT((4+ACTUALLY_PUSHED)*4(%esp)) ; \ pushl _intr_unit + (irq_num) * 4 ; \ @@ -65,18 +67,21 @@ IDTVEC(vec_name) ; \ incl _cnt+V_INTR ; /* book-keeping can wait */ \ movl _intr_countp + (irq_num) * 4,%eax ; \ incl (%eax) ; \ - movl _cpl,%eax ; /* are we unmasking pending HWIs or SWIs? */ \ +/* movl _cpl,%eax ; // are we unmasking pending SWIs? / \ notl %eax ; \ - andl _ipending,%eax ; \ - jne 2f ; /* yes, maybe handle them */ \ + andl _spending,$SWI_MASK ; \ + jne 2f ; // yes, maybe handle them */ \ 1: ; \ MEXITCOUNT ; \ MAYBE_POPL_ES ; \ + popl %fs ; \ popl %ds ; \ popl %edx ; \ popl %ecx ; \ popl %eax ; \ iret ; \ + +#if 0 ; \ ALIGN_TEXT ; \ 2: ; \ @@ -88,6 +93,7 @@ IDTVEC(vec_name) ; \ incb _intr_nesting_level ; /* ... really limit it ... */ \ sti ; /* ... to do this as early as possible */ \ MAYBE_POPL_ES ; /* discard most of thin frame ... */ \ + popl %fs ; \ popl %ecx ; /* ... original %ds ... */ \ popl %edx ; \ xchgl %eax,4(%esp) ; /* orig %eax; save cpl */ \ @@ -101,11 +107,20 @@ IDTVEC(vec_name) ; \ movl (3+8+0)*4(%esp),%ecx ; /* ... %ecx from thin frame ... */ \ movl %ecx,(3+6)*4(%esp) ; /* ... to fat frame ... */ \ movl (3+8+1)*4(%esp),%eax ; /* ... cpl from thin frame */ \ - pushl %eax ; \ subl $4,%esp ; /* junk for unit number */ \ MEXITCOUNT ; \ jmp _doreti +#endif +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. 
+ */ #define INTR(irq_num, vec_name, icu, enable_icus, reg, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -116,8 +131,8 @@ IDTVEC(vec_name) ; \ pushl %ds ; /* save our data and extra segments ... */ \ pushl %es ; \ pushl %fs ; \ - mov $KDSEL,%ax ; /* ... and reload with kernel's own ... */ \ - mov %ax,%ds ; /* ... early for obsolete reasons */ \ + mov $KDSEL,%ax ; /* load kernel ds, es and fs */ \ + mov %ax,%ds ; \ mov %ax,%es ; \ mov %ax,%fs ; \ maybe_extra_ipending ; \ @@ -126,43 +141,37 @@ IDTVEC(vec_name) ; \ movb %al,_imen + IRQ_BYTE(irq_num) ; \ outb %al,$icu+ICU_IMR_OFFSET ; \ enable_icus ; \ - movl _cpl,%eax ; \ - testb $IRQ_BIT(irq_num),%reg ; \ - jne 2f ; \ - incb _intr_nesting_level ; \ + incb _intr_nesting_level ; /* XXX do we need this? */ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX late to avoid double count */ \ - incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4,%eax ; \ - incl (%eax) ; \ - movl _cpl,%eax ; \ - pushl %eax ; \ - pushl _intr_unit + (irq_num) * 4 ; \ - orl _intr_mask + (irq_num) * 4,%eax ; \ - movl %eax,_cpl ; \ + pushl $irq_num; /* pass the IRQ */ \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; /* must unmask _imen and icu atomically */ \ - movb _imen + IRQ_BYTE(irq_num),%al ; \ - andb $~IRQ_BIT(irq_num),%al ; \ - movb %al,_imen + IRQ_BYTE(irq_num) ; \ - outb %al,$icu+ICU_IMR_OFFSET ; \ - sti ; /* XXX _doreti repeats the cli/sti */ \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ MEXITCOUNT ; \ /* We could usually avoid the following jmp by inlining some of */ \ /* _doreti, but it's probably better to use less cache. */ \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -2: ; \ - /* XXX skip mcounting here to avoid double count */ \ - orb $IRQ_BIT(irq_num),_ipending + IRQ_BYTE(irq_num) ; \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp ; \ - iret + jmp doreti_next /* and catch up inside doreti */ + +/* + * Reenable the interrupt mask after completing an interrupt. Called + * from ithd_loop. There are two separate functions, one for each + * ICU. 
+ */ + .globl setimask0, setimask1 +setimask0: + cli + movb _imen,%al + outb %al,$IO_ICU1 + ICU_IMR_OFFSET + sti + ret + +setimask1: + cli + movb _imen + 1,%al + outb %al,$IO_ICU2 + ICU_IMR_OFFSET + sti + ret MCOUNT_LABEL(bintr) FAST_INTR(0,fastintr0, ENABLE_ICU1) @@ -181,7 +190,9 @@ MCOUNT_LABEL(bintr) FAST_INTR(13,fastintr13, ENABLE_ICU1_AND_2) FAST_INTR(14,fastintr14, ENABLE_ICU1_AND_2) FAST_INTR(15,fastintr15, ENABLE_ICU1_AND_2) + #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, IO_ICU1, ENABLE_ICU1, al, CLKINTR_PENDING) INTR(1,intr1, IO_ICU1, ENABLE_ICU1, al,) INTR(2,intr2, IO_ICU1, ENABLE_ICU1, al,) @@ -198,6 +209,7 @@ MCOUNT_LABEL(bintr) INTR(13,intr13, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(14,intr14, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(15,intr15, IO_ICU2, ENABLE_ICU1_AND_2, ah,) + MCOUNT_LABEL(eintr) .data @@ -211,10 +223,4 @@ _ihandlers: /* addresses of interrupt handlers */ .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - .text diff --git a/sys/i386/isa/bs/bsif.h b/sys/i386/isa/bs/bsif.h index 5a89681bcce0..6dcc2ab1b86d 100644 --- a/sys/i386/isa/bs/bsif.h +++ b/sys/i386/isa/bs/bsif.h @@ -208,17 +208,10 @@ static BS_INLINE void memcopy __P((void *from, void *to, register size_t len)); u_int32_t bs_adapter_info __P((int)); #define delay(y) DELAY(y) extern int dma_init_flag; -#ifdef SMP -#error XXX see comments in i386/isa/bs/bsif.h for details -/* - * ipending is 'opaque' in SMP, and can't be accessed this way. - * Since its my belief that this is PC98 code, and that PC98 and SMP - * are mutually exclusive, the above compile-time error is the "fix". - * Please inform smp@freebsd.org if this is NOT the case. 
- */ -#else + #define softintr(y) ipending |= (1 << y) -#endif /* SMP */ + +#endif /* IPENDING */ static BS_INLINE void memcopy(from, to, len) diff --git a/sys/i386/isa/clock.c b/sys/i386/isa/clock.c index 15044abbaa3b..724f3c2817ba 100644 --- a/sys/i386/isa/clock.c +++ b/sys/i386/isa/clock.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -93,10 +94,6 @@ #include #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ /* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. 
*/ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ calibrate_clocks(void) static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. + */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ - - writertc(RTC_STATUSB, rtc_statusb); - -#ifdef APIC_IO if (apic_8254_trial) { - + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. 
+ */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); + + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); + printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. */ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/i386/isa/cy.c b/sys/i386/isa/cy.c index 52a8cf36892f..5487d8fe6299 100644 --- a/sys/i386/isa/cy.c +++ b/sys/i386/isa/cy.c @@ -94,11 +94,6 @@ #error "The cy device requires the old isa compatibility shims" #endif -#ifdef SMP -#define disable_intr() COM_DISABLE_INTR() -#define enable_intr() COM_ENABLE_INTR() -#endif /* SMP */ - /* * Dictionary so that I can name everything *sio* or *com* to compare with * sio.c. 
There is also lots of ugly formatting and unnecessary ifdefs to @@ -366,7 +361,7 @@ static struct com_s *p_com_addr[NSIO]; #define com_addr(unit) (p_com_addr[unit]) struct isa_driver siodriver = { - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, sioprobe, sioattach, driver_name @@ -604,11 +599,9 @@ cyattach_common(cy_iobase, cy_align) com->lt_out.c_cflag = com->lt_in.c_cflag = CLOCAL; } if (siosetwater(com, com->it_in.c_ispeed) != 0) { - enable_intr(); free(com, M_DEVBUF); return (0); } - enable_intr(); termioschars(&com->it_in); com->it_in.c_ispeed = com->it_in.c_ospeed = comdefaultrate; com->it_out = com->it_in; @@ -662,6 +655,7 @@ sioopen(dev, flag, mode, p) int s; struct tty *tp; int unit; + int intrsave; mynor = minor(dev); unit = MINOR_TO_UNIT(mynor); @@ -768,14 +762,17 @@ sioopen(dev, flag, mode, p) } } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); (void) inb(com->line_status_port); (void) inb(com->data_port); com->prev_modem_status = com->last_modem_status = inb(com->modem_status_port); outb(iobase + com_ier, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #else /* !0 */ /* * Flush fifos. This requires a full channel reset which @@ -786,13 +783,16 @@ sioopen(dev, flag, mode, p) CD1400_CCR_CMDRESET | CD1400_CCR_CHANRESET); cd1400_channel_cmd(com, com->channel_control); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->prev_modem_status = com->last_modem_status = cd_getreg(com, CD1400_MSVR2); cd_setreg(com, CD1400_SRER, com->intr_enable = CD1400_SRER_MDMCH | CD1400_SRER_RXDATA); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif /* 0 */ /* * Handle initial DCD. Callout devices get a fake initial @@ -875,6 +875,7 @@ comhardclose(com) int s; struct tty *tp; int unit; + int intrsave; unit = com->unit; iobase = com->iobase; @@ -888,10 +889,13 @@ comhardclose(com) outb(iobase + com_cfcr, com->cfcr_image &= ~CFCR_SBREAK); #else /* XXX */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->etc = ETC_NONE; cd_setreg(com, CD1400_COR2, com->cor[1] &= ~CD1400_COR2_ETC); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); cd1400_channel_cmd(com, CD1400_CCR_CMDRESET | CD1400_CCR_FTF); #endif @@ -899,9 +903,12 @@ comhardclose(com) #if 0 outb(iobase + com_ier, 0); #else + intrsave = save_intr(); disable_intr(); + COM_LOCK(); cd_setreg(com, CD1400_SRER, com->intr_enable = 0); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif tp = com->tp; if ((tp->t_cflag & HUPCL) @@ -991,6 +998,11 @@ siodtrwakeup(chan) wakeup(&com->dtr_wait); } +/* + * This function: + * a) needs to be called with COM_LOCK() held, and + * b) needs to return with COM_LOCK() held. + */ static void sioinput(com) struct com_s *com; @@ -1000,6 +1012,7 @@ sioinput(com) u_char line_status; int recv_data; struct tty *tp; + int intrsave; buf = com->ibuf; tp = com->tp; @@ -1016,7 +1029,15 @@ sioinput(com) * slinput is reasonably fast (usually 40 instructions plus * call overhead). */ + do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. 
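Side by side, the two idioms differ only in which way the window is opened. sioinput() is entered with COM_LOCK() held and interrupts disabled, so here the lock is dropped and interrupts are enabled around the call into the line discipline, and the original (disabled) state is restored afterwards. A condensed sketch, using only the primitives from this file:

	/* save-and-disable: the common idiom elsewhere in this driver */
	intrsave = save_intr();
	disable_intr();
	COM_LOCK();
	/* ... touch com->state and the CD1400 registers ... */
	COM_UNLOCK();
	restore_intr(intrsave);

	/* save-and-enable: used here, while handing data to the line discipline */
	intrsave = save_intr();			/* interrupts are off right now */
	COM_UNLOCK();
	enable_intr();				/* allow other interrupts during l_rint() */
	/* ... (*linesw[tp->t_line].l_rint)(recv_data, tp) ... */
	restore_intr(intrsave);			/* back to "off", as the caller expects */
	COM_LOCK();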
+ */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); incc = com->iptr - buf; if (tp->t_rawq.c_cc + incc > tp->t_ihiwat @@ -1038,10 +1059,18 @@ sioinput(com) tp->t_lflag &= ~FLUSHO; comstart(tp); } - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } else { do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); line_status = buf[com->ierroff]; recv_data = *buf++; @@ -1057,7 +1086,8 @@ sioinput(com) recv_data |= TTY_PE; } (*linesw[tp->t_line].l_rint)(recv_data, tp); - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } com_events -= (com->iptr - com->ibuf); @@ -1729,6 +1759,7 @@ static void siopoll() { int unit; + int intrsave; #ifdef CyDebug ++cy_timeouts; @@ -1751,7 +1782,9 @@ siopoll() * (actually never opened devices) so that we don't * loop. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); incc = com->iptr - com->ibuf; com->iptr = com->ibuf; if (com->state & CS_CHECKMSR) { @@ -1759,7 +1792,8 @@ siopoll() com->state &= ~CS_CHECKMSR; } com_events -= incc; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (incc != 0) log(LOG_DEBUG, "sio%d: %d events for device with no tp\n", @@ -1767,29 +1801,39 @@ siopoll() continue; } if (com->iptr != com->ibuf) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); sioinput(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (com->state & CS_CHECKMSR) { u_char delta_modem_status; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); + sioinput(com); delta_modem_status = com->last_modem_status ^ com->prev_modem_status; com->prev_modem_status = com->last_modem_status; com_events -= LOTS_OF_EVENTS; com->state &= ~CS_CHECKMSR; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta_modem_status & MSR_DCD) (*linesw[tp->t_line].l_modem) (tp, com->prev_modem_status & MSR_DCD); } if (com->extra_state & CSE_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->extra_state &= ~CSE_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (!(com->state & CS_BUSY)) { tp->t_state &= ~TS_BUSY; ttwwakeup(com->tp); @@ -1801,10 +1845,13 @@ siopoll() } } if (com->state & CS_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->state &= ~CS_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); (*linesw[tp->t_line].l_start)(tp); } if (com_events == 0) @@ -1833,6 +1880,7 @@ comparam(tp, t) u_char opt; int s; int unit; + int intrsave; /* do historical conversions */ if (t->c_ispeed == 0) @@ -1857,14 +1905,9 @@ comparam(tp, t) else (void)commctl(com, TIOCM_DTR, DMBIS); - /* - * This returns with interrupts disabled so that we can complete - * the speed change atomically. - */ (void) siosetwater(com, t->c_ispeed); /* XXX we don't actually change the speed atomically. 
*/ - enable_intr(); if (idivisor != 0) { cd_setreg(com, CD1400_RBPR, idivisor); @@ -1985,12 +2028,15 @@ comparam(tp, t) if (cflag & CCTS_OFLOW) opt |= CD1400_COR2_CCTS_OFLOW; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (opt != com->cor[1]) { cor_change |= CD1400_CCR_COR2; cd_setreg(com, CD1400_COR2, com->cor[1] = opt); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); /* * set channel option register 3 - @@ -2111,7 +2157,9 @@ comparam(tp, t) * XXX should have done this long ago, but there is too much state * to change all atomically. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com->state &= ~CS_TTGO; if (!(tp->t_state & TS_TTSTOP)) @@ -2177,7 +2225,8 @@ comparam(tp, t) | CD1400_SRER_TXMPTY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); splx(s); comstart(tp); if (com->ibufold != NULL) { @@ -2196,6 +2245,7 @@ siosetwater(com, speed) u_char *ibuf; int ibufsize; struct tty *tp; + int intrsave; /* * Make the buffer size large enough to handle a softtty interrupt @@ -2207,7 +2257,6 @@ siosetwater(com, speed) for (ibufsize = 128; ibufsize < cp4ticks;) ibufsize <<= 1; if (ibufsize == com->ibufsize) { - disable_intr(); return (0); } @@ -2217,7 +2266,6 @@ siosetwater(com, speed) */ ibuf = malloc(2 * ibufsize, M_DEVBUF, M_NOWAIT); if (ibuf == NULL) { - disable_intr(); return (ENOMEM); } @@ -2235,7 +2283,9 @@ siosetwater(com, speed) * Read current input buffer, if any. Continue with interrupts * disabled. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->iptr != com->ibuf) sioinput(com); @@ -2254,6 +2304,9 @@ siosetwater(com, speed) com->ibufend = ibuf + ibufsize; com->ierroff = ibufsize; com->ihighwater = ibuf + 3 * ibufsize / 4; + + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2267,6 +2320,7 @@ comstart(tp) bool_t started; #endif int unit; + int intrsave; unit = DEV_TO_UNIT(tp->t_dev); com = com_addr(unit); @@ -2277,7 +2331,9 @@ comstart(tp) started = FALSE; #endif + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (tp->t_state & TS_TTSTOP) { com->state &= ~CS_TTGO; if (com->intr_enable & CD1400_SRER_TXRDY) @@ -2313,7 +2369,8 @@ comstart(tp) com->mcr_image |= com->mcr_rts); #endif } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { ttwwakeup(tp); splx(s); @@ -2332,7 +2389,9 @@ comstart(tp) sizeof com->obuf1); com->obufs[0].l_next = NULL; com->obufs[0].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2351,7 +2410,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (tp->t_outq.c_cc != 0 && !com->obufs[1].l_queued) { #ifdef CyDebug @@ -2362,7 +2422,9 @@ comstart(tp) sizeof com->obuf2); com->obufs[1].l_next = NULL; com->obufs[1].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2381,7 +2443,8 @@ comstart(tp) & ~CD1400_SRER_TXMPTY) | CD1400_SRER_TXRDY); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } tp->t_state |= TS_BUSY; } @@ -2390,10 +2453,13 @@ comstart(tp) ++com->start_real; #endif #if 0 + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); /* fake interrupt to start output */ - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); #endif ttwwakeup(tp); splx(s); @@ 
-2406,10 +2472,13 @@ comstop(tp, rw) { struct com_s *com; bool_t wakeup_etc; + int intrsave; com = com_addr(DEV_TO_UNIT(tp->t_dev)); wakeup_etc = FALSE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (rw & FWRITE) { com->obufs[0].l_queued = FALSE; com->obufs[1].l_queued = FALSE; @@ -2432,7 +2501,8 @@ comstop(tp, rw) com_events -= (com->iptr - com->ibuf); com->iptr = com->ibuf; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (wakeup_etc) wakeup(&com->etc); if (rw & FWRITE && com->etc == ETC_NONE) @@ -2448,6 +2518,7 @@ commctl(com, bits, how) { int mcr; int msr; + int intrsave; if (how == DMGET) { if (com->channel_control & CD1400_CCR_RCVEN) @@ -2485,7 +2556,9 @@ commctl(com, bits, how) mcr |= com->mcr_dtr; if (bits & TIOCM_RTS) mcr |= com->mcr_rts; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); switch (how) { case DMSET: com->mcr_image = mcr; @@ -2503,7 +2576,8 @@ commctl(com, bits, how) cd_setreg(com, CD1400_MSVR2, mcr); break; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2565,9 +2639,14 @@ comwakeup(chan) com = com_addr(unit); if (com != NULL && (com->state >= (CS_BUSY | CS_TTGO) || com->poll)) { + int intrsave; + + intrsave = save_intr(); disable_intr(); + COM_LOCK(); siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } } #endif @@ -2587,11 +2666,15 @@ comwakeup(chan) for (errnum = 0; errnum < CE_NTYPES; ++errnum) { u_int delta; u_long total; + int intrsave; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta = com->delta_error_counts[errnum]; com->delta_error_counts[errnum] = 0; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta == 0) continue; total = com->error_counts[errnum] += delta; @@ -2743,6 +2826,8 @@ cd_etc(com, etc) struct com_s *com; int etc; { + int intrsave; + /* * We can't change the hardware's ETC state while there are any * characters in the tx fifo, since those characters would be @@ -2754,26 +2839,28 @@ cd_etc(com, etc) * for the tx to become empty so that the command is sure to be * executed soon after we issue it. 
*/ + intrsave = save_intr(); disable_intr(); - if (com->etc == etc) { - enable_intr(); + COM_LOCK(); + if (com->etc == etc) goto wait; - } if ((etc == CD1400_ETC_SENDBREAK && (com->etc == ETC_BREAK_STARTING || com->etc == ETC_BREAK_STARTED)) || (etc == CD1400_ETC_STOPBREAK && (com->etc == ETC_BREAK_ENDING || com->etc == ETC_BREAK_ENDED || com->etc == ETC_NONE))) { - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return; } com->etc = etc; cd_setreg(com, CD1400_SRER, com->intr_enable = (com->intr_enable & ~CD1400_SRER_TXRDY) | CD1400_SRER_TXMPTY); - enable_intr(); wait: + COM_UNLOCK(); + restore_intr(intrsave); while (com->etc == etc && tsleep(&com->etc, TTIPRI | PCATCH, "cyetc", 0) == 0) continue; @@ -2787,7 +2874,7 @@ cd_getreg(com, reg) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; int val; @@ -2795,14 +2882,16 @@ cd_getreg(com, reg) car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); val = cd_inb(iobase, reg, cy_align); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); return (val); } @@ -2815,21 +2904,23 @@ cd_setreg(com, reg, val) struct com_s *basecom; u_char car; int cy_align; - u_long ef; + int intrsave; cy_addr iobase; basecom = com_addr(com->unit & ~(CD1400_NO_OF_CHANNELS - 1)); car = com->unit & CD1400_CAR_CHAN; cy_align = com->cy_align; iobase = com->iobase; - ef = read_eflags(); - if (ef & PSL_I) - disable_intr(); + intrsave = save_intr(); + disable_intr(); + if (intrsave & PSL_I) + COM_LOCK(); if (basecom->car != car) cd_outb(iobase, CD1400_CAR, cy_align, basecom->car = car); cd_outb(iobase, reg, cy_align, val); - if (ef & PSL_I) - enable_intr(); + if (intrsave & PSL_I) + COM_UNLOCK(); + restore_intr(intrsave); } #ifdef CyDebug diff --git a/sys/i386/isa/icu_ipl.s b/sys/i386/isa/icu_ipl.s index 34753583a41e..d178d5c43c45 100644 --- a/sys/i386/isa/icu_ipl.s +++ b/sys/i386/isa/icu_ipl.s @@ -54,63 +54,6 @@ _imen: .long HWI_MASK .text SUPERALIGN_TEXT -/* - * Interrupt priority mechanism - * -- soft splXX masks with group mechanism (cpl) - * -- h/w masks for currently active or unused interrupts (imen) - * -- ipending = active interrupts currently masked by cpl - */ - -ENTRY(splz) - /* - * The caller has restored cpl and checked that (ipending & ~cpl) - * is nonzero. We have to repeat the check since if there is an - * interrupt while we're looking, _doreti processing for the - * interrupt will handle all the unmasked pending interrupts - * because we restored early. We're repeating the calculation - * of (ipending & ~cpl) anyway so that the caller doesn't have - * to pass it, so this only costs one "jne". "bsfl %ecx,%ecx" - * is undefined when %ecx is 0 so we can't rely on the secondary - * btrl tests. - */ - movl _cpl,%eax -splz_next: - /* - * We don't need any locking here. (ipending & ~cpl) cannot grow - * while we're looking at it - any interrupt will shrink it to 0. - */ - movl %eax,%ecx - notl %ecx - andl _ipending,%ecx - jne splz_unpend - ret - - ALIGN_TEXT -splz_unpend: - bsfl %ecx,%ecx - btrl %ecx,_ipending - jnc splz_next - cmpl $NHWI,%ecx - jae splz_swi - /* - * We would prefer to call the intr handler directly here but that - * doesn't work for badly behaved handlers that want the interrupt - * frame. 
Also, there's a problem determining the unit number. - * We should change the interface so that the unit number is not - * determined at config time. - */ - jmp *vec(,%ecx,4) - - ALIGN_TEXT -splz_swi: - pushl %eax - orl imasks(,%ecx,4),%eax - movl %eax,_cpl - call *_ihandlers(,%ecx,4) - popl %eax - movl %eax,_cpl - jmp splz_next - /* * Fake clock interrupt(s) so that they appear to come from our caller instead * of from here, so that system profiling works. diff --git a/sys/i386/isa/icu_vector.s b/sys/i386/isa/icu_vector.s index e427351ca205..d2b88bf705a3 100644 --- a/sys/i386/isa/icu_vector.s +++ b/sys/i386/isa/icu_vector.s @@ -53,9 +53,11 @@ IDTVEC(vec_name) ; \ pushl %ecx ; \ pushl %edx ; \ pushl %ds ; \ + pushl %fs ; \ MAYBE_PUSHL_ES ; \ mov $KDSEL,%ax ; \ mov %ax,%ds ; \ + mov %ax,%fs ; \ MAYBE_MOVW_AX_ES ; \ FAKE_MCOUNT((4+ACTUALLY_PUSHED)*4(%esp)) ; \ pushl _intr_unit + (irq_num) * 4 ; \ @@ -65,18 +67,21 @@ IDTVEC(vec_name) ; \ incl _cnt+V_INTR ; /* book-keeping can wait */ \ movl _intr_countp + (irq_num) * 4,%eax ; \ incl (%eax) ; \ - movl _cpl,%eax ; /* are we unmasking pending HWIs or SWIs? */ \ +/* movl _cpl,%eax ; // are we unmasking pending SWIs? / \ notl %eax ; \ - andl _ipending,%eax ; \ - jne 2f ; /* yes, maybe handle them */ \ + andl _spending,$SWI_MASK ; \ + jne 2f ; // yes, maybe handle them */ \ 1: ; \ MEXITCOUNT ; \ MAYBE_POPL_ES ; \ + popl %fs ; \ popl %ds ; \ popl %edx ; \ popl %ecx ; \ popl %eax ; \ iret ; \ + +#if 0 ; \ ALIGN_TEXT ; \ 2: ; \ @@ -88,6 +93,7 @@ IDTVEC(vec_name) ; \ incb _intr_nesting_level ; /* ... really limit it ... */ \ sti ; /* ... to do this as early as possible */ \ MAYBE_POPL_ES ; /* discard most of thin frame ... */ \ + popl %fs ; \ popl %ecx ; /* ... original %ds ... */ \ popl %edx ; \ xchgl %eax,4(%esp) ; /* orig %eax; save cpl */ \ @@ -101,11 +107,20 @@ IDTVEC(vec_name) ; \ movl (3+8+0)*4(%esp),%ecx ; /* ... %ecx from thin frame ... */ \ movl %ecx,(3+6)*4(%esp) ; /* ... to fat frame ... */ \ movl (3+8+1)*4(%esp),%eax ; /* ... cpl from thin frame */ \ - pushl %eax ; \ subl $4,%esp ; /* junk for unit number */ \ MEXITCOUNT ; \ jmp _doreti +#endif +/* + * Slow, threaded interrupts. + * + * XXX Most of the parameters here are obsolete. Fix this when we're + * done. + * XXX we really shouldn't return via doreti if we just schedule the + * interrupt handler and don't run anything. We could just do an + * iret. FIXME. + */ #define INTR(irq_num, vec_name, icu, enable_icus, reg, maybe_extra_ipending) \ .text ; \ SUPERALIGN_TEXT ; \ @@ -116,8 +131,8 @@ IDTVEC(vec_name) ; \ pushl %ds ; /* save our data and extra segments ... */ \ pushl %es ; \ pushl %fs ; \ - mov $KDSEL,%ax ; /* ... and reload with kernel's own ... */ \ - mov %ax,%ds ; /* ... early for obsolete reasons */ \ + mov $KDSEL,%ax ; /* load kernel ds, es and fs */ \ + mov %ax,%ds ; \ mov %ax,%es ; \ mov %ax,%fs ; \ maybe_extra_ipending ; \ @@ -126,43 +141,37 @@ IDTVEC(vec_name) ; \ movb %al,_imen + IRQ_BYTE(irq_num) ; \ outb %al,$icu+ICU_IMR_OFFSET ; \ enable_icus ; \ - movl _cpl,%eax ; \ - testb $IRQ_BIT(irq_num),%reg ; \ - jne 2f ; \ - incb _intr_nesting_level ; \ + incb _intr_nesting_level ; /* XXX do we need this? 
*/ \ __CONCAT(Xresume,irq_num): ; \ FAKE_MCOUNT(13*4(%esp)) ; /* XXX late to avoid double count */ \ - incl _cnt+V_INTR ; /* tally interrupts */ \ - movl _intr_countp + (irq_num) * 4,%eax ; \ - incl (%eax) ; \ - movl _cpl,%eax ; \ - pushl %eax ; \ - pushl _intr_unit + (irq_num) * 4 ; \ - orl _intr_mask + (irq_num) * 4,%eax ; \ - movl %eax,_cpl ; \ + pushl $irq_num; /* pass the IRQ */ \ sti ; \ - call *_intr_handler + (irq_num) * 4 ; \ - cli ; /* must unmask _imen and icu atomically */ \ - movb _imen + IRQ_BYTE(irq_num),%al ; \ - andb $~IRQ_BIT(irq_num),%al ; \ - movb %al,_imen + IRQ_BYTE(irq_num) ; \ - outb %al,$icu+ICU_IMR_OFFSET ; \ - sti ; /* XXX _doreti repeats the cli/sti */ \ + call _sched_ithd ; \ + addl $4, %esp ; /* discard the parameter */ \ MEXITCOUNT ; \ /* We could usually avoid the following jmp by inlining some of */ \ /* _doreti, but it's probably better to use less cache. */ \ - jmp _doreti ; \ -; \ - ALIGN_TEXT ; \ -2: ; \ - /* XXX skip mcounting here to avoid double count */ \ - orb $IRQ_BIT(irq_num),_ipending + IRQ_BYTE(irq_num) ; \ - popl %fs ; \ - popl %es ; \ - popl %ds ; \ - popal ; \ - addl $4+4,%esp ; \ - iret + jmp doreti_next /* and catch up inside doreti */ + +/* + * Reenable the interrupt mask after completing an interrupt. Called + * from ithd_loop. There are two separate functions, one for each + * ICU. + */ + .globl setimask0, setimask1 +setimask0: + cli + movb _imen,%al + outb %al,$IO_ICU1 + ICU_IMR_OFFSET + sti + ret + +setimask1: + cli + movb _imen + 1,%al + outb %al,$IO_ICU2 + ICU_IMR_OFFSET + sti + ret MCOUNT_LABEL(bintr) FAST_INTR(0,fastintr0, ENABLE_ICU1) @@ -181,7 +190,9 @@ MCOUNT_LABEL(bintr) FAST_INTR(13,fastintr13, ENABLE_ICU1_AND_2) FAST_INTR(14,fastintr14, ENABLE_ICU1_AND_2) FAST_INTR(15,fastintr15, ENABLE_ICU1_AND_2) + #define CLKINTR_PENDING movl $1,CNAME(clkintr_pending) +/* Threaded interrupts */ INTR(0,intr0, IO_ICU1, ENABLE_ICU1, al, CLKINTR_PENDING) INTR(1,intr1, IO_ICU1, ENABLE_ICU1, al,) INTR(2,intr2, IO_ICU1, ENABLE_ICU1, al,) @@ -198,6 +209,7 @@ MCOUNT_LABEL(bintr) INTR(13,intr13, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(14,intr14, IO_ICU2, ENABLE_ICU1_AND_2, ah,) INTR(15,intr15, IO_ICU2, ENABLE_ICU1_AND_2, ah,) + MCOUNT_LABEL(eintr) .data @@ -211,10 +223,4 @@ _ihandlers: /* addresses of interrupt handlers */ .long _swi_null, swi_net, _swi_null, _swi_null .long _swi_vm, _swi_null, _softclock -imasks: /* masks for interrupt handlers */ - .space NHWI*4 /* padding; HWI masks are elsewhere */ - - .long SWI_TTY_MASK, SWI_NET_MASK, SWI_CAMNET_MASK, SWI_CAMBIO_MASK - .long SWI_VM_MASK, SWI_TQ_MASK, SWI_CLOCK_MASK - .text diff --git a/sys/i386/isa/intr_machdep.c b/sys/i386/isa/intr_machdep.c index 34a8c229bd6b..870760e1ce01 100644 --- a/sys/i386/isa/intr_machdep.c +++ b/sys/i386/isa/intr_machdep.c @@ -36,12 +36,6 @@ * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * $FreeBSD$ */ -/* - * This file contains an aggregated module marked: - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * See the notice for details. 
- */ #include "opt_auto_eoi.h" @@ -51,11 +45,14 @@ #ifndef SMP #include #endif +#include #include #include #include +#include #include #include +#include #include #include #include @@ -91,30 +88,14 @@ #include #endif -/* XXX should be in suitable include files */ -#ifdef PC98 -#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ -#define ICU_SLAVEID 7 -#else -#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ -#define ICU_SLAVEID 2 -#endif - -#ifdef APIC_IO /* - * This is to accommodate "mixed-mode" programming for - * motherboards that don't connect the 8254 to the IO APIC. + * Per-interrupt data. We consider the soft interrupt to be a special + * case, so these arrays have NHWI + NSWI entries, not ICU_LEN. */ -#define AUTO_EOI_1 1 -#endif - -#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) - -u_long *intr_countp[ICU_LEN]; -inthand2_t *intr_handler[ICU_LEN]; -u_int intr_mask[ICU_LEN]; -static u_int* intr_mptr[ICU_LEN]; -void *intr_unit[ICU_LEN]; +u_long *intr_countp[NHWI + NSWI]; /* pointers to interrupt counters */ +inthand2_t *intr_handler[NHWI + NSWI]; /* first level interrupt handler */ +ithd *ithds[NHWI + NSWI]; /* real interrupt handler */ +void *intr_unit[NHWI + NSWI]; static inthand_t *fastintr[ICU_LEN] = { &IDTVEC(fastintr0), &IDTVEC(fastintr1), @@ -292,8 +273,9 @@ isa_nmi(cd) } /* - * Fill in default interrupt table (in case of spuruious interrupt - * during configuration of kernel, setup interrupt control unit + * Create a default interrupt table to avoid problems caused by + * spurious interrupts during configuration of kernel, then setup + * interrupt control unit. */ void isa_defaultirq() @@ -364,16 +346,6 @@ isa_strayintr(vcookiep) { int intr = (void **)vcookiep - &intr_unit[0]; - /* DON'T BOTHER FOR NOW! */ - /* for some reason, we get bursts of intr #7, even if not enabled! */ - /* - * Well the reason you got bursts of intr #7 is because someone - * raised an interrupt line and dropped it before the 8259 could - * prioritize it. This is documented in the intel data book. This - * means you have BAD hardware! I have changed this so that only - * the first 5 get logged, then it quits logging them, and puts - * out a special message. rgrimes 3/25/1993 - */ /* * XXX TODO print a different message for #7 if it is for a * glitch. Glitches can be distinguished from real #7's by @@ -405,36 +377,10 @@ isa_irq_pending() } #endif -int -update_intr_masks(void) -{ - int intr, n=0; - u_int mask,*maskptr; - - for (intr=0; intr < ICU_LEN; intr ++) { -#if defined(APIC_IO) - /* no 8259 SLAVE to ignore */ -#else - if (intr==ICU_SLAVEID) continue; /* ignore 8259 SLAVE output */ -#endif /* APIC_IO */ - maskptr = intr_mptr[intr]; - if (!maskptr) - continue; - *maskptr |= SWI_LOW_MASK | (1 << intr); - mask = *maskptr; - if (mask != intr_mask[intr]) { -#if 0 - printf ("intr_mask[%2d] old=%08x new=%08x ptr=%p.\n", - intr, intr_mask[intr], mask, maskptr); -#endif - intr_mask[intr]=mask; - n++; - } - - } - return (n); -} - +/* + * Update intrnames array with the specified name. This is used by + * vmstat(8) and the like. 
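The hardware interrupt stubs in icu_vector.s no longer call the handlers themselves; they push the IRQ number and call sched_ithd(), which finds the interrupt thread through the ithds[] array declared above and makes it runnable. The real implementation lives in the new sys/i386/isa/ithread.c; a rough sketch of the hand-off, with counting, preemption and the already-running case omitted (setrunqueue() and SRUN are the standard scheduler interfaces of the time, not part of this diff):

	void
	sched_ithd(void *cookie)
	{
		int irq = (int) cookie;			/* IRQ number pushed by the stub */
		ithd *ir = ithds[irq];

		if (ir == NULL)
			panic("sched_ithd: no handler for irq %d", irq);
		if (ir->it_proc->p_stat == SWAIT) {	/* thread is idle */
			ir->it_proc->p_stat = SRUN;	/* make it runnable ... */
			setrunqueue(ir->it_proc);	/* ... and queue it */
		}
	}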
+ */ static void update_intrname(int intr, char *name) { @@ -485,7 +431,7 @@ update_intrname(int intr, char *name) } int -icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) +icu_setup(int intr, inthand2_t *handler, void *arg, int flags) { #ifdef FAST_HI int select; /* the select register is 8 bits */ @@ -493,7 +439,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) u_int32_t value; /* the window register is 32 bits */ #endif /* FAST_HI */ u_long ef; - u_int mask = (maskptr ? *maskptr : 0); #if defined(APIC_IO) if ((u_int)intr >= ICU_LEN) /* no 8259 SLAVE to ignore */ @@ -506,8 +451,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) ef = read_eflags(); disable_intr(); intr_handler[intr] = handler; - intr_mptr[intr] = maskptr; - intr_mask[intr] = mask | SWI_LOW_MASK | (1 << intr); intr_unit[intr] = arg; #ifdef FAST_HI if (flags & INTR_FAST) { @@ -547,11 +490,15 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ INTREN(1 << intr); - MPINTR_UNLOCK(); write_eflags(ef); return (0); } +/* + * Dissociate an interrupt handler from an IRQ and set the handler to + * the stray interrupt handler. The 'handler' parameter is used only + * for consistency checking. + */ int icu_unset(intr, handler) int intr; @@ -567,8 +514,6 @@ icu_unset(intr, handler) disable_intr(); intr_countp[intr] = &intrcnt[1 + intr]; intr_handler[intr] = isa_strayintr; - intr_mptr[intr] = NULL; - intr_mask[intr] = HWI_MASK | SWI_MASK; intr_unit[intr] = &intr_unit[intr]; #ifdef FAST_HI_XXX /* XXX how do I re-create dvp here? */ @@ -581,353 +526,172 @@ icu_unset(intr, handler) setidt(ICU_OFFSET + intr, slowintr[intr], SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ - MPINTR_UNLOCK(); write_eflags(ef); return (0); } -/* The following notice applies beyond this point in the file */ - -/* - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD$ - * - */ - -typedef struct intrec { - intrmask_t mask; - inthand2_t *handler; - void *argument; - struct intrec *next; - char *name; - int intr; - intrmask_t *maskptr; - int flags; -} intrec; - -static intrec *intreclist_head[ICU_LEN]; - -/* - * The interrupt multiplexer calls each of the handlers in turn. The - * ipl is initially quite low. It is raised as necessary for each call - * and lowered after the call. Thus out of order handling is possible - * even for interrupts of the same type. This is probably no more - * harmful than out of order handling in general (not harmful except - * for real time response which we don't support anyway). - */ -static void -intr_mux(void *arg) -{ - intrec *p; - intrmask_t oldspl; - - for (p = arg; p != NULL; p = p->next) { - oldspl = splq(p->mask); - p->handler(p->argument); - splx(oldspl); - } -} - -static intrec* -find_idesc(unsigned *maskptr, int irq) -{ - intrec *p = intreclist_head[irq]; - - while (p && p->maskptr != maskptr) - p = p->next; - - return (p); -} - -static intrec** -find_pred(intrec *idesc, int irq) -{ - intrec **pp = &intreclist_head[irq]; - intrec *p = *pp; - - while (p != idesc) { - if (p == NULL) - return (NULL); - pp = &p->next; - p = *pp; - } - return (pp); -} - -/* - * Both the low level handler and the shared interrupt multiplexer - * block out further interrupts as set in the handlers "mask", while - * the handler is running. In fact *maskptr should be used for this - * purpose, but since this requires one more pointer dereference on - * each interrupt, we rather bother update "mask" whenever *maskptr - * changes. The function "update_masks" should be called **after** - * all manipulation of the linked list of interrupt handlers hung - * off of intrdec_head[irq] is complete, since the chain of handlers - * will both determine the *maskptr values and the instances of mask - * that are fixed. This function should be called with the irq for - * which a new handler has been add blocked, since the masks may not - * yet know about the use of this irq for a device of a certain class. 
- */ - -static void -update_mux_masks(void) -{ - int irq; - for (irq = 0; irq < ICU_LEN; irq++) { - intrec *idesc = intreclist_head[irq]; - while (idesc != NULL) { - if (idesc->maskptr != NULL) { - /* our copy of *maskptr may be stale, refresh */ - idesc->mask = *idesc->maskptr; - } - idesc = idesc->next; - } - } -} - -static void -update_masks(intrmask_t *maskptr, int irq) -{ - intrmask_t mask = 1 << irq; - - if (maskptr == NULL) - return; - - if (find_idesc(maskptr, irq) == NULL) { - /* no reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) == 0) - return; - /* the irq was included in the classes mask, remove it */ - *maskptr &= ~mask; - } else { - /* a reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) != 0) - return; - /* put the irq into the classes mask */ - *maskptr |= mask; - } - /* we need to update all values in the intr_mask[irq] array */ - update_intr_masks(); - /* update mask in chains of the interrupt multiplex handler as well */ - update_mux_masks(); -} - -/* - * Add interrupt handler to linked list hung off of intreclist_head[irq] - * and install shared interrupt multiplex handler, if necessary - */ - -static int -add_intrdesc(intrec *idesc) -{ - int irq = idesc->intr; - - intrec *head = intreclist_head[irq]; - - if (head == NULL) { - /* first handler for this irq, just install it */ - if (icu_setup(irq, idesc->handler, idesc->argument, - idesc->maskptr, idesc->flags) != 0) - return (-1); - - update_intrname(irq, idesc->name); - /* keep reference */ - intreclist_head[irq] = idesc; - } else { - if ((idesc->flags & INTR_EXCL) != 0 - || (head->flags & INTR_EXCL) != 0) { - /* - * can't append new handler, if either list head or - * new handler do not allow interrupts to be shared - */ - if (bootverbose) - printf("\tdevice combination doesn't support " - "shared irq%d\n", irq); - return (-1); - } - if (head->next == NULL) { - /* - * second handler for this irq, replace device driver's - * handler by shared interrupt multiplexer function - */ - icu_unset(irq, head->handler); - if (icu_setup(irq, intr_mux, head, 0, 0) != 0) - return (-1); - if (bootverbose) - printf("\tusing shared irq%d.\n", irq); - update_intrname(irq, "mux"); - } - /* just append to the end of the chain */ - while (head->next != NULL) - head = head->next; - head->next = idesc; - } - update_masks(idesc->maskptr, irq); - return (0); -} - -/* - * Create and activate an interrupt handler descriptor data structure. - * - * The dev_instance pointer is required for resource management, and will - * only be passed through to resource_claim(). - * - * There will be functions that derive a driver and unit name from a - * dev_instance variable, and those functions will be used to maintain the - * interrupt counter label array referenced by systat and vmstat to report - * device interrupt rates (->update_intrlabels). - * - * Add the interrupt handler descriptor data structure created by an - * earlier call of create_intr() to the linked list for its irq and - * adjust the interrupt masks if necessary. - * - * WARNING: This is an internal function and not to be used by device - * drivers. It is subject to change without notice. 
- */ - intrec * inthand_add(const char *name, int irq, inthand2_t handler, void *arg, - intrmask_t *maskptr, int flags) + int pri, int flags) { - intrec *idesc; - int errcode = -1; - intrmask_t oldspl; + ithd *ithd = ithds[irq]; /* descriptor for the IRQ */ + intrec *head; /* chain of handlers for IRQ */ + intrec *idesc; /* descriptor for this handler */ + struct proc *p; /* interrupt thread */ + int errcode = 0; - if (ICU_LEN > 8 * sizeof *maskptr) { - printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", - ICU_LEN, 8 * sizeof *maskptr); - return (NULL); - } - if ((unsigned)irq >= ICU_LEN) { - printf("create_intr: requested irq%d too high, limit is %d\n", - irq, ICU_LEN -1); - return (NULL); - } + if (name == NULL) /* no name? */ + panic ("anonymous interrupt"); + if (ithd == NULL || ithd->it_ih == NULL) { + /* first handler for this irq. */ + if (ithd == NULL) { + ithd = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (ithd == NULL) + return (NULL); + bzero(ithd, sizeof(struct ithd)); + ithd->irq = irq; + ithds[irq] = ithd; + } + /* + * If we have a fast interrupt, we need to set the + * handler address directly. Do that below. For a + * slow interrupt, we don't need to know more details, + * so do it here because it's tidier. + */ + if ((flags & INTR_FAST) == 0) { + /* + * Only create a kernel thread if we don't already + * have one. + */ + if (ithd->it_proc == NULL) { + errcode = kthread_create(ithd_loop, NULL, &p, + RFSTOPPED | RFHIGHPID, "irq%d: %s", irq, + name); + if (errcode) + panic("inthand_add: Can't create " + "interrupt thread"); + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_stat = SWAIT; /* we're idle */ - idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + /* Put in linkages. */ + ithd->it_proc = p; + p->p_ithd = ithd; + } else + snprintf(ithd->it_proc->p_comm, MAXCOMLEN, + "irq%d: %s", irq, name); + p->p_rtprio.prio = pri; + + /* + * The interrupt process must be in place, but + * not necessarily schedulable, before we + * initialize the ICU, since it may cause an + * immediate interrupt. + */ + if (icu_setup(irq, &sched_ithd, arg, flags) != 0) + panic("inthand_add: Can't initialize ICU"); + } + } else if ((flags & INTR_EXCL) != 0 + || (ithd->it_ih->flags & INTR_EXCL) != 0) { + /* + * We can't append the new handler if either + * list ithd or new handler do not allow + * interrupts to be shared. + */ + if (bootverbose) + printf("\tdevice combination %s and %s " + "doesn't support shared irq%d\n", + ithd->it_ih->name, name, irq); + return(NULL); + } else if (flags & INTR_FAST) { + /* We can only have one fast interrupt by itself. 
*/ + if (bootverbose) + printf("\tCan't add fast interrupt %s" + " to normal interrupt %s on irq%d", + name, ithd->it_ih->name, irq); + return (NULL); + } else { /* update p_comm */ + p = ithd->it_proc; + if (strlen(p->p_comm) + strlen(name) < MAXCOMLEN) { + strcat(p->p_comm, " "); + strcat(p->p_comm, name); + } else if (strlen(p->p_comm) == MAXCOMLEN) + p->p_comm[MAXCOMLEN - 1] = '+'; + else + strcat(p->p_comm, "+"); + } + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); if (idesc == NULL) - return NULL; - bzero(idesc, sizeof *idesc); + return (NULL); + bzero(idesc, sizeof (struct intrec)); + + idesc->handler = handler; + idesc->argument = arg; + idesc->flags = flags; + idesc->ithd = ithd; - if (name == NULL) - name = "???"; idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); if (idesc->name == NULL) { free(idesc, M_DEVBUF); - return NULL; + return (NULL); } strcpy(idesc->name, name); - idesc->handler = handler; - idesc->argument = arg; - idesc->maskptr = maskptr; - idesc->intr = irq; - idesc->flags = flags; - - /* block this irq */ - oldspl = splq(1 << irq); - - /* add irq to class selected by maskptr */ - errcode = add_intrdesc(idesc); - splx(oldspl); - - if (errcode != 0) { + /* Slow interrupts got set up above. */ + if ((flags & INTR_FAST) + && (icu_setup(irq, idesc->handler, idesc->argument, + idesc->flags) != 0) ) { if (bootverbose) - printf("\tintr_connect(irq%d) failed, result=%d\n", + printf("\tinthand_add(irq%d) failed, result=%d\n", irq, errcode); free(idesc->name, M_DEVBUF); free(idesc, M_DEVBUF); - idesc = NULL; + return NULL; } - + head = ithd->it_ih; /* look at chain of handlers */ + if (head) { + while (head->next != NULL) + head = head->next; /* find the end */ + head->next = idesc; /* hook it in there */ + } else + ithd->it_ih = idesc; /* put it up front */ + update_intrname(irq, idesc->name); return (idesc); } /* - * Deactivate and remove the interrupt handler descriptor data connected - * created by an earlier call of intr_connect() from the linked list and - * adjust theinterrupt masks if necessary. + * Deactivate and remove linked list the interrupt handler descriptor + * data connected created by an earlier call of inthand_add(), then + * adjust the interrupt masks if necessary. * - * Return the memory held by the interrupt handler descriptor data structure - * to the system. Make sure, the handler is not actively used anymore, before. + * Return the memory held by the interrupt handler descriptor data + * structure to the system. First ensure the handler is not actively + * in use. 
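From a driver's point of view the interface keeps its shape; the per-class interrupt mask pointer is simply replaced by a priority for the interrupt thread. A usage sketch (the device name, handler and softc are hypothetical; PI_REALTIME and INTR_EXCL are the values the clock attachment above uses):

	struct intrec *cookie;

	cookie = inthand_add("foo0", irq, (inthand2_t *)foo_intr, softc,
	    PI_REALTIME, INTR_EXCL);
	if (cookie == NULL)
		return (ENXIO);			/* could not attach the handler */
	/* ... foo_intr() now runs in the irq's interrupt thread ... */
	inthand_remove(cookie);			/* detach and free the descriptor */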
*/ int inthand_remove(intrec *idesc) { - intrec **hook, *head; - int irq; - int errcode = 0; - intrmask_t oldspl; + ithd *ithd; /* descriptor for the IRQ */ + intrec *ih; /* chain of handlers */ if (idesc == NULL) return (-1); + ithd = idesc->ithd; + ih = ithd->it_ih; - irq = idesc->intr; - - /* find pointer that keeps the reference to this interrupt descriptor */ - hook = find_pred(idesc, irq); - if (hook == NULL) + if (ih == idesc) /* first in the chain */ + ithd->it_ih = idesc->next; /* unhook it */ + else { + while ((ih != NULL) + && (ih->next != idesc) ) + ih = ih->next; + if (ih->next != idesc) return (-1); - - /* make copy of original list head, the line after may overwrite it */ - head = intreclist_head[irq]; - - /* unlink: make predecessor point to idesc->next instead of to idesc */ - *hook = idesc->next; - - /* now check whether the element we removed was the list head */ - if (idesc == head) { - - oldspl = splq(1 << irq); - - /* check whether the new list head is the only element on list */ - head = intreclist_head[irq]; - if (head != NULL) { - icu_unset(irq, intr_mux); - if (head->next != NULL) { - /* install the multiplex handler with new list head as argument */ - errcode = icu_setup(irq, intr_mux, head, 0, 0); - if (errcode == 0) - update_intrname(irq, NULL); - } else { - /* install the one remaining handler for this irq */ - errcode = icu_setup(irq, head->handler, - head->argument, - head->maskptr, head->flags); - if (errcode == 0) - update_intrname(irq, head->name); + ih->next = ih->next->next; } - } else { - /* revert to old handler, eg: strayintr */ - icu_unset(irq, idesc->handler); - } - splx(oldspl); - } - update_masks(idesc->maskptr, irq); + + if (ithd->it_ih == NULL) /* no handlers left, */ + icu_unset(ithd->irq, idesc->handler); free(idesc, M_DEVBUF); return (0); } diff --git a/sys/i386/isa/intr_machdep.h b/sys/i386/isa/intr_machdep.h index 5982295b1ab4..87c97a35f5ef 100644 --- a/sys/i386/isa/intr_machdep.h +++ b/sys/i386/isa/intr_machdep.h @@ -98,7 +98,6 @@ #define TPR_BLOCK_XCPUSTOP 0xaf /* */ #define TPR_BLOCK_ALL 0xff /* all INTs */ - #ifdef TEST_TEST1 /* put a 'fake' HWI in top of APIC prio 0x3x, 32 + 31 = 63 = 0x3f */ #define XTEST1_OFFSET (ICU_OFFSET + 31) @@ -145,8 +144,9 @@ extern u_long intrcnt[]; /* counts for for each device and stray */ extern char intrnames[]; /* string table containing device names */ extern u_long *intr_countp[]; /* pointers into intrcnt[] */ extern inthand2_t *intr_handler[]; /* C entry points of intr handlers */ -extern u_int intr_mask[]; /* sets of intrs masked during handling of 1 */ +extern ithd *ithds[]; extern void *intr_unit[]; /* cookies to pass to intr handlers */ +extern ithd softinterrupt; /* soft interrupt thread */ inthand_t IDTVEC(fastintr0), IDTVEC(fastintr1), @@ -190,26 +190,60 @@ inthand_t #endif /** TEST_TEST1 */ #endif /* SMP || APIC_IO */ +#ifdef PC98 +#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ +#define ICU_SLAVEID 7 +#else +#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ +#define ICU_SLAVEID 2 +#endif + +#ifdef APIC_IO +/* + * This is to accommodate "mixed-mode" programming for + * motherboards that don't connect the 8254 to the IO APIC. 
+ */ +#define AUTO_EOI_1 1 +#endif + +#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) + void isa_defaultirq __P((void)); int isa_nmi __P((int cd)); int icu_setup __P((int intr, inthand2_t *func, void *arg, - u_int *maskptr, int flags)); + int flags)); int icu_unset __P((int intr, inthand2_t *handler)); -int update_intr_masks __P((void)); intrmask_t splq __P((intrmask_t mask)); -#define INTR_FAST 0x00000001 /* fast interrupt handler */ -#define INTR_EXCL 0x00010000 /* excl. intr, default is shared */ +/* + * Describe a hardware interrupt handler. These structures are + * accessed via the array intreclist, which contains one pointer per + * hardware interrupt. + * + * Multiple interrupt handlers for a specific IRQ can be chained + * together via the 'next' pointer. + */ +typedef struct intrec { + inthand2_t *handler; /* code address of handler */ + void *argument; /* argument to pass to handler */ + enum intr_type flags; /* flag bits (sys/bus.h) */ + char *name; /* name of handler */ + ithd *ithd; /* handler we're connected to */ + struct intrec *next; /* next handler for this irq */ +} intrec; /* * WARNING: These are internal functions and not to be used by device drivers! * They are subject to change without notice. */ struct intrec *inthand_add(const char *name, int irq, inthand2_t handler, - void *arg, intrmask_t *maskptr, int flags); - + void *arg, int pri, int flags); int inthand_remove(struct intrec *idesc); +void sched_ithd(void *); +void ithd_loop(void *); +void start_softintr(void *); +void intr_soft(void *); #endif /* LOCORE */ diff --git a/sys/i386/isa/ipl.s b/sys/i386/isa/ipl.s index 93612301fa85..1ee9ace4559e 100644 --- a/sys/i386/isa/ipl.s +++ b/sys/i386/isa/ipl.s @@ -44,7 +44,6 @@ * AT/386 * Vector interrupt control section * - * cpl - Current interrupt disable mask * *_imask - Interrupt masks for various spl*() functions * ipending - Pending interrupts (set when a masked interrupt occurs) */ @@ -53,8 +52,6 @@ ALIGN_DATA /* current priority (all off) */ - .globl _cpl -_cpl: .long HWI_MASK | SWI_MASK .globl _tty_imask _tty_imask: .long SWI_TTY_MASK @@ -71,9 +68,9 @@ _softnet_imask: .long SWI_NET_MASK .globl _softtty_imask _softtty_imask: .long SWI_TTY_MASK -/* pending interrupts blocked by splxxx() */ - .globl _ipending -_ipending: .long 0 +/* pending software interrupts */ + .globl _spending +_spending: .long 0 /* set with bits for which queue to service */ .globl _netisr @@ -100,59 +97,30 @@ _netisrs: _doreti: FAKE_MCOUNT(_bintr) /* init "from" _bintr -> _doreti */ addl $4,%esp /* discard unit number */ - popl %eax /* cpl or cml to restore */ doreti_next: - /* - * Check for pending HWIs and SWIs atomically with restoring cpl - * and exiting. The check has to be atomic with exiting to stop - * (ipending & ~cpl) changing from zero to nonzero while we're - * looking at it (this wouldn't be fatal but it would increase - * interrupt latency). Restoring cpl has to be atomic with exiting - * so that the stack cannot pile up (the nesting level of interrupt - * handlers is limited by the number of bits in cpl). - */ -#ifdef SMP - cli /* early to prevent INT deadlock */ -doreti_next2: -#endif - movl %eax,%ecx - notl %ecx /* set bit = unmasked level */ -#ifndef SMP - cli -#endif - andl _ipending,%ecx /* set bit = unmasked pending INT */ - jne doreti_unpend - movl %eax,_cpl decb _intr_nesting_level /* Check for ASTs that can be handled now. 
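Each intrec on the chain declared in intr_machdep.h above is eventually run, in order, by the interrupt thread it is attached to; ithd_loop() in the new sys/i386/isa/ithread.c is the consumer. A hedged sketch of the step it performs for one scheduled interrupt, ignoring sleeping, masking and statistics (the helper name is hypothetical):

	static void
	example_run_one_interrupt(ithd *me)	/* what ithd_loop() does per wakeup */
	{
		intrec *ih;

		for (ih = me->it_ih; ih != NULL; ih = ih->next)
			ih->handler(ih->argument);	/* run each chained handler */
		/* afterwards the IRQ is unmasked again at the ICU, via the
		   setimask0()/setimask1() helpers added in icu_vector.s */
	}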
*/ testl $AST_PENDING,_astpending - je doreti_exit - testb $SEL_RPL_MASK,TF_CS(%esp) - jne doreti_ast - testl $PSL_VM,TF_EFLAGS(%esp) - je doreti_exit - cmpl $1,_in_vm86call - jne doreti_ast + je doreti_exit /* no AST, exit */ + testb $SEL_RPL_MASK,TF_CS(%esp) /* are we in user mode? */ + jne doreti_ast /* yes, do it now. */ + testl $PSL_VM,TF_EFLAGS(%esp) /* kernel mode */ + je doreti_exit /* and not VM86 mode, defer */ + cmpl $1,_in_vm86call /* are we in a VM86 call? */ + jne doreti_ast /* yes, we can do it */ /* - * doreti_exit - release MP lock, pop registers, iret. + * doreti_exit: release MP lock, pop registers, iret. * - * Note that the syscall trap shotcuts to doreti_syscall_ret. + * Note that the syscall trap shortcuts to doreti_syscall_ret. * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment - * registers. The fault is handled in trap.c + * registers. The fault is handled in trap.c. */ - doreti_exit: MEXITCOUNT -#ifdef SMP - /* release the kernel lock */ - movl $_mp_lock, %edx /* GIANT_LOCK */ - call _MPrellock_edx -#endif /* SMP */ - .globl doreti_popl_fs .globl doreti_syscall_ret doreti_syscall_ret: @@ -170,6 +138,13 @@ doreti_popl_ds: doreti_iret: iret + /* + * doreti_iret_fault and friends. Alternative return code for + * the case where we get a fault in the doreti_exit code + * above. trap() (i386/i386/trap.c) catches this specific + * case, sends the process a signal and continues in the + * corresponding place in the code below. + */ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: @@ -188,94 +163,12 @@ doreti_popl_fs_fault: movl $T_PROTFLT,TF_TRAPNO(%esp) jmp alltraps_with_regs_pushed - ALIGN_TEXT -doreti_unpend: - /* - * Enabling interrupts is safe because we haven't restored cpl yet. - * %ecx contains the next probable ready interrupt (~cpl & ipending) - */ -#ifdef SMP - bsfl %ecx, %ecx /* locate the next dispatchable int */ - lock - btrl %ecx, _ipending /* is it really still pending? */ - jnc doreti_next2 /* some intr cleared memory copy */ - sti /* late to prevent INT deadlock */ -#else - sti - bsfl %ecx,%ecx /* slow, but not worth optimizing */ - btrl %ecx,_ipending - jnc doreti_next /* some intr cleared memory copy */ -#endif /* SMP */ - /* - * Execute handleable interrupt - * - * Set up JUMP to _ihandlers[%ecx] for HWIs. - * Set up CALL of _ihandlers[%ecx] for SWIs. - * This is a bit early for the SMP case - we have to push %ecx and - * %edx, but could push only %ecx and load %edx later. - */ - movl _ihandlers(,%ecx,4),%edx - cmpl $NHWI,%ecx - jae doreti_swi /* software interrupt handling */ - cli /* else hardware int handling */ -#ifdef SMP - movl %eax,_cpl /* same as non-smp case right now */ -#else - movl %eax,_cpl -#endif - MEXITCOUNT -#ifdef APIC_INTR_DIAGNOSTIC - lock - incl CNAME(apic_itrace_doreti)(,%ecx,4) -#ifdef APIC_INTR_DIAGNOSTIC_IRQ - cmpl $APIC_INTR_DIAGNOSTIC_IRQ,%ecx - jne 9f - pushl %eax - pushl %ecx - pushl %edx - pushl $APIC_ITRACE_DORETI - call log_intr_event - addl $4,%esp - popl %edx - popl %ecx - popl %eax -9: -#endif -#endif - jmp *%edx - - ALIGN_TEXT -doreti_swi: - pushl %eax - /* - * At least the SWI_CLOCK handler has to run at a possibly strictly - * lower cpl, so we have to restore - * all the h/w bits in cpl now and have to worry about stack growth. - * The worst case is currently (30 Jan 1994) 2 SWI handlers nested - * in dying interrupt frames and about 12 HWIs nested in active - * interrupt frames. 
There are only 4 different SWIs and the HWI - * and SWI masks limit the nesting further. - * - * The SMP case is currently the same as the non-SMP case. - */ -#ifdef SMP - orl imasks(,%ecx,4), %eax /* or in imasks */ - movl %eax,_cpl /* set cpl for call */ -#else - orl imasks(,%ecx,4),%eax - movl %eax,_cpl -#endif - call *%edx - popl %eax /* cpl to restore */ - jmp doreti_next - ALIGN_TEXT doreti_ast: andl $~AST_PENDING,_astpending sti movl $T_ASTFLT,TF_TRAPNO(%esp) - call _trap - subl %eax,%eax /* recover cpl|cml */ + call _ast movb $1,_intr_nesting_level /* for doreti_next to decrement */ jmp doreti_next diff --git a/sys/i386/isa/ipl_funcs.c b/sys/i386/isa/ipl_funcs.c index d27d97fa9b1f..14eb2402eb0e 100644 --- a/sys/i386/isa/ipl_funcs.c +++ b/sys/i386/isa/ipl_funcs.c @@ -27,11 +27,13 @@ */ #include +#include #include #include #include #include -#include +#include +#include #include /* @@ -45,236 +47,55 @@ void name(void) \ { \ atomic_set_int(var, bits); \ + sched_ithd((void *) SOFTINTR); \ } -DO_SETBITS(setdelayed, &ipending, loadandclear(&idelayed)) +DO_SETBITS(setdelayed, &spending, loadandclear(&idelayed)) +DO_SETBITS(setsoftcamnet,&spending, SWI_CAMNET_PENDING) +DO_SETBITS(setsoftcambio,&spending, SWI_CAMBIO_PENDING) +DO_SETBITS(setsoftclock, &spending, SWI_CLOCK_PENDING) +DO_SETBITS(setsoftnet, &spending, SWI_NET_PENDING) +DO_SETBITS(setsofttty, &spending, SWI_TTY_PENDING) +DO_SETBITS(setsoftvm, &spending, SWI_VM_PENDING) +DO_SETBITS(setsofttq, &spending, SWI_TQ_PENDING) -DO_SETBITS(setsoftcamnet,&ipending, SWI_CAMNET_PENDING) -DO_SETBITS(setsoftcambio,&ipending, SWI_CAMBIO_PENDING) -DO_SETBITS(setsoftclock, &ipending, SWI_CLOCK_PENDING) -DO_SETBITS(setsoftnet, &ipending, SWI_NET_PENDING) -DO_SETBITS(setsofttty, &ipending, SWI_TTY_PENDING) -DO_SETBITS(setsoftvm, &ipending, SWI_VM_PENDING) -DO_SETBITS(setsofttq, &ipending, SWI_TQ_PENDING) - -DO_SETBITS(schedsoftcamnet, &idelayed, SWI_CAMNET_PENDING) -DO_SETBITS(schedsoftcambio, &idelayed, SWI_CAMBIO_PENDING) -DO_SETBITS(schedsoftnet, &idelayed, SWI_NET_PENDING) -DO_SETBITS(schedsofttty, &idelayed, SWI_TTY_PENDING) -DO_SETBITS(schedsoftvm, &idelayed, SWI_VM_PENDING) -DO_SETBITS(schedsofttq, &idelayed, SWI_TQ_PENDING) +/* + * We don't need to schedule soft interrupts any more, it happens + * automatically. + */ +#define schedsoftcamnet +#define schedsoftcambio +#define schedsoftnet +#define schedsofttty +#define schedsoftvm +#define schedsofttq unsigned softclockpending(void) { - return (ipending & SWI_CLOCK_PENDING); + return (spending & SWI_CLOCK_PENDING); } /* - * Support for SPL assertions. + * Dummy spl calls. The only reason for these is to not break + * all the code which expects to call them. 
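For reference, the rewritten DO_SETBITS() macro above makes setsoftclock() and its siblings both record the pending bit and wake the soft interrupt thread; expanded, setsoftclock() is simply:

	void
	setsoftclock(void)
	{
		atomic_set_int(&spending, SWI_CLOCK_PENDING);	/* mark the SWI pending */
		sched_ithd((void *) SOFTINTR);			/* kick the softinterrupt thread */
	}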
*/ - -#ifdef INVARIANT_SUPPORT - -#define SPLASSERT_IGNORE 0 -#define SPLASSERT_LOG 1 -#define SPLASSERT_PANIC 2 - -static int splassertmode = SPLASSERT_LOG; -SYSCTL_INT(_kern, OID_AUTO, splassertmode, CTLFLAG_RW, - &splassertmode, 0, "Set the mode of SPLASSERT"); - -static void -init_splassertmode(void *ignored) -{ - TUNABLE_INT_FETCH("kern.splassertmode", 0, splassertmode); -} -SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_splassertmode, NULL); - -static void -splassertfail(char *str, const char *msg, char *name, int level) -{ - switch (splassertmode) { - case SPLASSERT_IGNORE: - break; - case SPLASSERT_LOG: - printf(str, msg, name, level); - printf("\n"); - break; - case SPLASSERT_PANIC: - panic(str, msg, name, level); - break; - } -} - -#define GENSPLASSERT(NAME, MODIFIER) \ -void \ -NAME##assert(const char *msg) \ -{ \ - if ((cpl & (MODIFIER)) != (MODIFIER)) \ - splassertfail("%s: not %s, cpl == %#x", \ - msg, __XSTRING(NAME) + 3, cpl); \ -} -#else -#define GENSPLASSERT(NAME, MODIFIER) -#endif - -/************************************************************************ - * GENERAL SPL CODE * - ************************************************************************ - * - * Implement splXXX(), spl0(), splx(), and splq(). splXXX() disables a - * set of interrupts (e.g. splbio() disables interrupts relating to - * device I/O) and returns the previous interrupt mask. splx() restores - * the previous interrupt mask, spl0() is a special case which enables - * all interrupts and is typically used inside i386/i386 swtch.s and - * fork_trampoline. splq() is a generic version of splXXX(). - * - * The SPL routines mess around with the 'cpl' global, which masks - * interrupts. Interrupts are not *actually* masked. What happens is - * that if an interrupt masked by the cpl occurs, the appropriate bit - * in 'ipending' is set and the interrupt is defered. When we clear - * bits in the cpl we must check to see if any ipending interrupts have - * been unmasked and issue the synchronously, which is what the splz() - * call does. - * - * Because the cpl is often saved and restored in a nested fashion, cpl - * modifications are only allowed in the SMP case when the MP lock is held - * to prevent multiple processes from tripping over each other's masks. - * The cpl is saved when you do a context switch (mi_switch()) and restored - * when your process gets cpu again. - * - * An interrupt routine is allowed to modify the cpl as long as it restores - * it prior to returning (thus the interrupted mainline code doesn't notice - * anything amiss). For the SMP case, the interrupt routine must hold - * the MP lock for any cpl manipulation. - * - * Likewise, due to the deterministic nature of cpl modifications, we do - * NOT need to use locked instructions to modify it. - */ - -#ifndef SMP - -#define GENSPL(NAME, OP, MODIFIER, PC) \ -GENSPLASSERT(NAME, MODIFIER) \ -unsigned NAME(void) \ -{ \ - unsigned x; \ - \ - x = cpl; \ - cpl OP MODIFIER; \ - return (x); \ -} - -void -spl0(void) -{ - cpl = 0; - if (ipending) - splz(); -} - -void -splx(unsigned ipl) -{ - cpl = ipl; - if (ipending & ~ipl) - splz(); -} - -intrmask_t -splq(intrmask_t mask) -{ - intrmask_t tmp = cpl; - cpl |= mask; - return (tmp); -} - -#else /* !SMP */ - -#include -#include - -/* - * SMP CASE - * - * Mostly the same as the non-SMP case now, but it didn't used to be - * this clean. 
- */ - -#define GENSPL(NAME, OP, MODIFIER, PC) \ -GENSPLASSERT(NAME, MODIFIER) \ -unsigned NAME(void) \ -{ \ - unsigned x; \ - \ - x = cpl; \ - cpl OP MODIFIER; \ - \ - return (x); \ -} - -/* - * spl0() - unmask all interrupts - * - * The MP lock must be held on entry - * This routine may only be called from mainline code. - */ -void -spl0(void) -{ - KASSERT(inside_intr == 0, ("spl0: called from interrupt")); - cpl = 0; - if (ipending) - splz(); -} - -/* - * splx() - restore previous interrupt mask - * - * The MP lock must be held on entry - */ - -void -splx(unsigned ipl) -{ - cpl = ipl; - if (inside_intr == 0 && (ipending & ~cpl) != 0) - splz(); -} - - -/* - * splq() - blocks specified interrupts - * - * The MP lock must be held on entry - */ -intrmask_t -splq(intrmask_t mask) -{ - intrmask_t tmp = cpl; - cpl |= mask; - return (tmp); -} - -#endif /* !SMP */ - -/* Finally, generate the actual spl*() functions */ - -/* NAME: OP: MODIFIER: PC: */ -GENSPL(splbio, |=, bio_imask, 2) -GENSPL(splcam, |=, cam_imask, 7) -GENSPL(splclock, =, HWI_MASK | SWI_MASK, 3) -GENSPL(splhigh, =, HWI_MASK | SWI_MASK, 4) -GENSPL(splimp, |=, net_imask, 5) -GENSPL(splnet, |=, SWI_NET_MASK, 6) -GENSPL(splsoftcam, |=, SWI_CAMBIO_MASK | SWI_CAMNET_MASK, 8) -GENSPL(splsoftcambio, |=, SWI_CAMBIO_MASK, 9) -GENSPL(splsoftcamnet, |=, SWI_CAMNET_MASK, 10) -GENSPL(splsoftclock, =, SWI_CLOCK_MASK, 11) -GENSPL(splsofttty, |=, SWI_TTY_MASK, 12) -GENSPL(splsoftvm, |=, SWI_VM_MASK, 16) -GENSPL(splsofttq, |=, SWI_TQ_MASK, 17) -GENSPL(splstatclock, |=, stat_imask, 13) -GENSPL(spltty, |=, tty_imask, 14) -GENSPL(splvm, |=, net_imask | bio_imask | cam_imask, 15) +void spl0 (void) {} +void splx (intrmask_t x) {} +intrmask_t splq(intrmask_t mask) {return 0; } +intrmask_t splbio(void) {return 0; } +intrmask_t splcam(void) {return 0; } +intrmask_t splclock(void) {return 0; } +intrmask_t splhigh(void) {return 0; } +intrmask_t splimp(void) {return 0; } +intrmask_t splnet(void) {return 0; } +intrmask_t splsoftcam(void) {return 0; } +intrmask_t splsoftcambio(void) {return 0; } +intrmask_t splsoftcamnet(void) {return 0; } +intrmask_t splsoftclock(void) {return 0; } +intrmask_t splsofttty(void) {return 0; } +intrmask_t splsoftvm(void) {return 0; } +intrmask_t splsofttq(void) {return 0; } +intrmask_t splstatclock(void) {return 0; } +intrmask_t spltty(void) {return 0; } +intrmask_t splvm(void) {return 0; } diff --git a/sys/i386/isa/ithread.c b/sys/i386/isa/ithread.c new file mode 100644 index 000000000000..4ceac4229d1c --- /dev/null +++ b/sys/i386/isa/ithread.c @@ -0,0 +1,353 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From BSDI: intr.c,v 1.6.2.5 1999/07/06 19:16:52 cp Exp + * $FreeBSD$ + */ + +/* Interrupt thread code. */ + +#include "opt_auto_eoi.h" + +#include "isa.h" + +#include +#include /* change this name XXX */ +#ifndef SMP +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(APIC_IO) +#include +#include /** FAST_HI */ +#include +#endif /* APIC_IO */ +#ifdef PC98 +#include +#include +#include +#else +#include +#endif +#include + +#if NISA > 0 +#include +#endif +#include +#include +#ifdef APIC_IO +#include +#endif + +#include "mca.h" +#if NMCA > 0 +#include +#endif + +#include +#include +#include +#include +#if 0 +#include +#endif + +u_long softintrcnt [NSWI]; + +SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) + +/* + * Schedule a heavyweight interrupt process. This function is called + * from the interrupt handlers Xintr. + */ +void +sched_ithd(void *cookie) +{ + int irq = (int) cookie; /* IRQ we're handling */ + ithd *ir = ithds[irq]; /* and the process that does it */ + + /* This used to be in icu_vector.s */ + /* + * We count software interrupts when we process them. The + * code here follows previous practice, but there's an + * argument for counting hardware interrupts when they're + * processed too. + */ + if (irq < NHWI) /* real interrupt, */ + atomic_add_long(intr_countp[irq], 1); /* one more for this IRQ */ + atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ + + CTR3(KTR_INTR, "sched_ithd pid %d(%s) need=%d", + ir->it_proc->p_pid, ir->it_proc->p_comm, ir->it_need); + +#if 0 + /* + * If we are in the debugger, we can't use interrupt threads to + * process interrupts since the threads are scheduled. Instead, + * call the interrupt handlers directly. This should be able to + * go away once we have light-weight interrupt handlers. + */ + if (db_active) { + intrec *ih; /* and our interrupt handler chain */ +#if 0 + membar_unlock(); /* push out "it_need=0" */ +#endif + for (ih = ir->it_ih; ih != NULL; ih = ih->next) { + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_enter(&Giant, MTX_DEF); + ih->handler(ih->argument); + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_exit(&Giant, MTX_DEF); + } + + INTREN (1 << ir->irq); /* reset the mask bit */ + return; + } +#endif + + /* + * Set it_need so that if the thread is already running but close + * to done, it will do another go-round. Then get the sched lock + * and see if the thread is on whichkqs yet. If not, put it on + * there. In any case, kick everyone so that if the new thread + * is higher priority than their current thread, it gets run now. 
+ */ + ir->it_need = 1; + mtx_enter(&sched_lock, MTX_SPIN); + if (ir->it_proc->p_stat == SWAIT) { /* not on run queue */ + CTR1(KTR_INTR, "sched_ithd: setrunqueue %d", + ir->it_proc->p_pid); +/* membar_lock(); */ + ir->it_proc->p_stat = SRUN; + setrunqueue(ir->it_proc); + aston(); + } + else { +if (irq < NHWI && (irq & 7) != 0) + CTR3(KTR_INTR, "sched_ithd %d: it_need %d, state %d", + ir->it_proc->p_pid, + ir->it_need, + ir->it_proc->p_stat ); + } + mtx_exit(&sched_lock, MTX_SPIN); +#if 0 + aston(); /* ??? check priorities first? */ +#else + need_resched(); +#endif +} + +/* + * This is the main code for all interrupt threads. It gets put on + * whichkqs by setrunqueue above. + */ +void +ithd_loop(void *dummy) +{ + ithd *me; /* our thread context */ + intrec *ih; /* and our interrupt handler chain */ + + me = curproc->p_ithd; /* point to myself */ + + /* + * As long as we have interrupts outstanding, go through the + * list of handlers, giving each one a go at it. + */ + for (;;) { + CTR3(KTR_INTR, "ithd_loop pid %d(%s) need=%d", + me->it_proc->p_pid, me->it_proc->p_comm, me->it_need); + while (me->it_need) { + /* + * Service interrupts. If another interrupt + * arrives while we are running, they will set + * it_need to denote that we should make + * another pass. + */ + me->it_need = 0; +#if 0 + membar_unlock(); /* push out "it_need=0" */ +#endif + for (ih = me->it_ih; ih != NULL; ih = ih->next) { + CTR5(KTR_INTR, + "ithd_loop pid %d ih=%p: %p(%p) flg=%x", + me->it_proc->p_pid, (void *)ih, + (void *)ih->handler, ih->argument, + ih->flags); + + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_enter(&Giant, MTX_DEF); + ih->handler(ih->argument); + if ((ih->flags & INTR_MPSAFE) == 0) + mtx_exit(&Giant, MTX_DEF); + } + } + + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_enter(&sched_lock, MTX_SPIN); + if (!me->it_need) { + + INTREN (1 << me->irq); /* reset the mask bit */ + me->it_proc->p_stat = SWAIT; /* we're idle */ +#ifdef APIC_IO + CTR1(KTR_INTR, "ithd_loop pid %d: done", + me->it_proc->p_pid); +#else + CTR2(KTR_INTR, "ithd_loop pid %d: done, imen=%x", + me->it_proc->p_pid, imen); +#endif + mi_switch(); + CTR1(KTR_INTR, "ithd_loop pid %d: resumed", + me->it_proc->p_pid); + } + mtx_exit(&sched_lock, MTX_SPIN); + } +} + +/* + * Start soft interrupt thread. + */ +void +start_softintr(void *dummy) +{ + int error; + struct proc *p; + ithd *softintr; /* descriptor for the "IRQ" */ + intrec *idesc; /* descriptor for this handler */ + char *name = "sintr"; /* name for idesc */ + int i; + + if (ithds[SOFTINTR]) { /* we already have a thread */ + printf("start_softintr: already running"); + return; + } + /* first handler for this irq. */ + softintr = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (softintr == NULL) + panic ("Can't create soft interrupt thread"); + bzero(softintr, sizeof(struct ithd)); + softintr->irq = SOFTINTR; + ithds[SOFTINTR] = softintr; + error = kthread_create(intr_soft, NULL, &p, + RFSTOPPED | RFHIGHPID, "softinterrupt"); + if (error) + panic("start_softintr: kthread_create error %d\n", error); + + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_rtprio.prio = PI_SOFT; /* soft interrupt */ + p->p_stat = SWAIT; /* we're idle */ + + /* Put in linkages. 
*/ + softintr->it_proc = p; + p->p_ithd = softintr; /* reverse link */ + + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); + if (idesc == NULL) + panic ("Can't create soft interrupt thread"); + bzero(idesc, sizeof (struct intrec)); + + idesc->ithd = softintr; + idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); + if (idesc->name == NULL) + panic ("Can't create soft interrupt thread"); + strcpy(idesc->name, name); + for (i = NHWI; i < NHWI + NSWI; i++) + intr_countp[i] = &softintrcnt [i - NHWI]; +} + +/* + * Software interrupt process code. + */ +void +intr_soft(void *dummy) +{ + int i; + ithd *me; /* our thread context */ + + me = curproc->p_ithd; /* point to myself */ + + /* Main loop */ + for (;;) { +#if 0 + CTR3(KTR_INTR, "intr_soft pid %d(%s) need=%d", + me->it_proc->p_pid, me->it_proc->p_comm, + me->it_need); +#endif + + /* + * Service interrupts. If another interrupt arrives + * while we are running, they will set it_need to + * denote that we should make another pass. + */ + me->it_need = 0; + while ((i = ffs(spending))) { + i--; + atomic_add_long(intr_countp[i], 1); + spending &= ~ (1 << i); + mtx_enter(&Giant, MTX_DEF); + (ihandlers[i])(); + mtx_exit(&Giant, MTX_DEF); + } + /* + * Processed all our interrupts. Now get the sched + * lock. This may take a while and it_need may get + * set again, so we have to check it again. + */ + mtx_enter(&sched_lock, MTX_SPIN); + if (!me->it_need) { +#if 0 + CTR1(KTR_INTR, "intr_soft pid %d: done", + me->it_proc->p_pid); +#endif + me->it_proc->p_stat = SWAIT; /* we're idle */ + mi_switch(); +#if 0 + CTR1(KTR_INTR, "intr_soft pid %d: resumed", + me->it_proc->p_pid); +#endif + } + mtx_exit(&sched_lock, MTX_SPIN); + } +} diff --git a/sys/i386/isa/loran.c b/sys/i386/isa/loran.c index 577a608f7113..c43bf8524c24 100644 --- a/sys/i386/isa/loran.c +++ b/sys/i386/isa/loran.c @@ -620,7 +620,7 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, loran_timecounter, CTLFLAG_RD, /**********************************************************************/ struct isa_driver lorandriver = { - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, loranprobe, loranattach, "loran" diff --git a/sys/i386/isa/nmi.c b/sys/i386/isa/nmi.c index 34a8c229bd6b..870760e1ce01 100644 --- a/sys/i386/isa/nmi.c +++ b/sys/i386/isa/nmi.c @@ -36,12 +36,6 @@ * from: @(#)isa.c 7.2 (Berkeley) 5/13/91 * $FreeBSD$ */ -/* - * This file contains an aggregated module marked: - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * See the notice for details. - */ #include "opt_auto_eoi.h" @@ -51,11 +45,14 @@ #ifndef SMP #include #endif +#include #include #include #include +#include #include #include +#include #include #include #include @@ -91,30 +88,14 @@ #include #endif -/* XXX should be in suitable include files */ -#ifdef PC98 -#define ICU_IMR_OFFSET 2 /* IO_ICU{1,2} + 2 */ -#define ICU_SLAVEID 7 -#else -#define ICU_IMR_OFFSET 1 /* IO_ICU{1,2} + 1 */ -#define ICU_SLAVEID 2 -#endif - -#ifdef APIC_IO /* - * This is to accommodate "mixed-mode" programming for - * motherboards that don't connect the 8254 to the IO APIC. + * Per-interrupt data. We consider the soft interrupt to be a special + * case, so these arrays have NHWI + NSWI entries, not ICU_LEN. 
*/ -#define AUTO_EOI_1 1 -#endif - -#define NR_INTRNAMES (1 + ICU_LEN + 2 * ICU_LEN) - -u_long *intr_countp[ICU_LEN]; -inthand2_t *intr_handler[ICU_LEN]; -u_int intr_mask[ICU_LEN]; -static u_int* intr_mptr[ICU_LEN]; -void *intr_unit[ICU_LEN]; +u_long *intr_countp[NHWI + NSWI]; /* pointers to interrupt counters */ +inthand2_t *intr_handler[NHWI + NSWI]; /* first level interrupt handler */ +ithd *ithds[NHWI + NSWI]; /* real interrupt handler */ +void *intr_unit[NHWI + NSWI]; static inthand_t *fastintr[ICU_LEN] = { &IDTVEC(fastintr0), &IDTVEC(fastintr1), @@ -292,8 +273,9 @@ isa_nmi(cd) } /* - * Fill in default interrupt table (in case of spuruious interrupt - * during configuration of kernel, setup interrupt control unit + * Create a default interrupt table to avoid problems caused by + * spurious interrupts during configuration of kernel, then setup + * interrupt control unit. */ void isa_defaultirq() @@ -364,16 +346,6 @@ isa_strayintr(vcookiep) { int intr = (void **)vcookiep - &intr_unit[0]; - /* DON'T BOTHER FOR NOW! */ - /* for some reason, we get bursts of intr #7, even if not enabled! */ - /* - * Well the reason you got bursts of intr #7 is because someone - * raised an interrupt line and dropped it before the 8259 could - * prioritize it. This is documented in the intel data book. This - * means you have BAD hardware! I have changed this so that only - * the first 5 get logged, then it quits logging them, and puts - * out a special message. rgrimes 3/25/1993 - */ /* * XXX TODO print a different message for #7 if it is for a * glitch. Glitches can be distinguished from real #7's by @@ -405,36 +377,10 @@ isa_irq_pending() } #endif -int -update_intr_masks(void) -{ - int intr, n=0; - u_int mask,*maskptr; - - for (intr=0; intr < ICU_LEN; intr ++) { -#if defined(APIC_IO) - /* no 8259 SLAVE to ignore */ -#else - if (intr==ICU_SLAVEID) continue; /* ignore 8259 SLAVE output */ -#endif /* APIC_IO */ - maskptr = intr_mptr[intr]; - if (!maskptr) - continue; - *maskptr |= SWI_LOW_MASK | (1 << intr); - mask = *maskptr; - if (mask != intr_mask[intr]) { -#if 0 - printf ("intr_mask[%2d] old=%08x new=%08x ptr=%p.\n", - intr, intr_mask[intr], mask, maskptr); -#endif - intr_mask[intr]=mask; - n++; - } - - } - return (n); -} - +/* + * Update intrnames array with the specified name. This is used by + * vmstat(8) and the like. + */ static void update_intrname(int intr, char *name) { @@ -485,7 +431,7 @@ update_intrname(int intr, char *name) } int -icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) +icu_setup(int intr, inthand2_t *handler, void *arg, int flags) { #ifdef FAST_HI int select; /* the select register is 8 bits */ @@ -493,7 +439,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) u_int32_t value; /* the window register is 32 bits */ #endif /* FAST_HI */ u_long ef; - u_int mask = (maskptr ? 
*maskptr : 0); #if defined(APIC_IO) if ((u_int)intr >= ICU_LEN) /* no 8259 SLAVE to ignore */ @@ -506,8 +451,6 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) ef = read_eflags(); disable_intr(); intr_handler[intr] = handler; - intr_mptr[intr] = maskptr; - intr_mask[intr] = mask | SWI_LOW_MASK | (1 << intr); intr_unit[intr] = arg; #ifdef FAST_HI if (flags & INTR_FAST) { @@ -547,11 +490,15 @@ icu_setup(int intr, inthand2_t *handler, void *arg, u_int *maskptr, int flags) SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ INTREN(1 << intr); - MPINTR_UNLOCK(); write_eflags(ef); return (0); } +/* + * Dissociate an interrupt handler from an IRQ and set the handler to + * the stray interrupt handler. The 'handler' parameter is used only + * for consistency checking. + */ int icu_unset(intr, handler) int intr; @@ -567,8 +514,6 @@ icu_unset(intr, handler) disable_intr(); intr_countp[intr] = &intrcnt[1 + intr]; intr_handler[intr] = isa_strayintr; - intr_mptr[intr] = NULL; - intr_mask[intr] = HWI_MASK | SWI_MASK; intr_unit[intr] = &intr_unit[intr]; #ifdef FAST_HI_XXX /* XXX how do I re-create dvp here? */ @@ -581,353 +526,172 @@ icu_unset(intr, handler) setidt(ICU_OFFSET + intr, slowintr[intr], SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif /* FAST_HI */ - MPINTR_UNLOCK(); write_eflags(ef); return (0); } -/* The following notice applies beyond this point in the file */ - -/* - * Copyright (c) 1997, Stefan Esser - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice unmodified, this list of conditions, and the following - * disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ - * - */ - -typedef struct intrec { - intrmask_t mask; - inthand2_t *handler; - void *argument; - struct intrec *next; - char *name; - int intr; - intrmask_t *maskptr; - int flags; -} intrec; - -static intrec *intreclist_head[ICU_LEN]; - -/* - * The interrupt multiplexer calls each of the handlers in turn. The - * ipl is initially quite low. It is raised as necessary for each call - * and lowered after the call. Thus out of order handling is possible - * even for interrupts of the same type. This is probably no more - * harmful than out of order handling in general (not harmful except - * for real time response which we don't support anyway). 
- */ -static void -intr_mux(void *arg) -{ - intrec *p; - intrmask_t oldspl; - - for (p = arg; p != NULL; p = p->next) { - oldspl = splq(p->mask); - p->handler(p->argument); - splx(oldspl); - } -} - -static intrec* -find_idesc(unsigned *maskptr, int irq) -{ - intrec *p = intreclist_head[irq]; - - while (p && p->maskptr != maskptr) - p = p->next; - - return (p); -} - -static intrec** -find_pred(intrec *idesc, int irq) -{ - intrec **pp = &intreclist_head[irq]; - intrec *p = *pp; - - while (p != idesc) { - if (p == NULL) - return (NULL); - pp = &p->next; - p = *pp; - } - return (pp); -} - -/* - * Both the low level handler and the shared interrupt multiplexer - * block out further interrupts as set in the handlers "mask", while - * the handler is running. In fact *maskptr should be used for this - * purpose, but since this requires one more pointer dereference on - * each interrupt, we rather bother update "mask" whenever *maskptr - * changes. The function "update_masks" should be called **after** - * all manipulation of the linked list of interrupt handlers hung - * off of intrdec_head[irq] is complete, since the chain of handlers - * will both determine the *maskptr values and the instances of mask - * that are fixed. This function should be called with the irq for - * which a new handler has been add blocked, since the masks may not - * yet know about the use of this irq for a device of a certain class. - */ - -static void -update_mux_masks(void) -{ - int irq; - for (irq = 0; irq < ICU_LEN; irq++) { - intrec *idesc = intreclist_head[irq]; - while (idesc != NULL) { - if (idesc->maskptr != NULL) { - /* our copy of *maskptr may be stale, refresh */ - idesc->mask = *idesc->maskptr; - } - idesc = idesc->next; - } - } -} - -static void -update_masks(intrmask_t *maskptr, int irq) -{ - intrmask_t mask = 1 << irq; - - if (maskptr == NULL) - return; - - if (find_idesc(maskptr, irq) == NULL) { - /* no reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) == 0) - return; - /* the irq was included in the classes mask, remove it */ - *maskptr &= ~mask; - } else { - /* a reference to this maskptr was found in this irq's chain */ - if ((*maskptr & mask) != 0) - return; - /* put the irq into the classes mask */ - *maskptr |= mask; - } - /* we need to update all values in the intr_mask[irq] array */ - update_intr_masks(); - /* update mask in chains of the interrupt multiplex handler as well */ - update_mux_masks(); -} - -/* - * Add interrupt handler to linked list hung off of intreclist_head[irq] - * and install shared interrupt multiplex handler, if necessary - */ - -static int -add_intrdesc(intrec *idesc) -{ - int irq = idesc->intr; - - intrec *head = intreclist_head[irq]; - - if (head == NULL) { - /* first handler for this irq, just install it */ - if (icu_setup(irq, idesc->handler, idesc->argument, - idesc->maskptr, idesc->flags) != 0) - return (-1); - - update_intrname(irq, idesc->name); - /* keep reference */ - intreclist_head[irq] = idesc; - } else { - if ((idesc->flags & INTR_EXCL) != 0 - || (head->flags & INTR_EXCL) != 0) { - /* - * can't append new handler, if either list head or - * new handler do not allow interrupts to be shared - */ - if (bootverbose) - printf("\tdevice combination doesn't support " - "shared irq%d\n", irq); - return (-1); - } - if (head->next == NULL) { - /* - * second handler for this irq, replace device driver's - * handler by shared interrupt multiplexer function - */ - icu_unset(irq, head->handler); - if (icu_setup(irq, intr_mux, head, 0, 0) != 
0) - return (-1); - if (bootverbose) - printf("\tusing shared irq%d.\n", irq); - update_intrname(irq, "mux"); - } - /* just append to the end of the chain */ - while (head->next != NULL) - head = head->next; - head->next = idesc; - } - update_masks(idesc->maskptr, irq); - return (0); -} - -/* - * Create and activate an interrupt handler descriptor data structure. - * - * The dev_instance pointer is required for resource management, and will - * only be passed through to resource_claim(). - * - * There will be functions that derive a driver and unit name from a - * dev_instance variable, and those functions will be used to maintain the - * interrupt counter label array referenced by systat and vmstat to report - * device interrupt rates (->update_intrlabels). - * - * Add the interrupt handler descriptor data structure created by an - * earlier call of create_intr() to the linked list for its irq and - * adjust the interrupt masks if necessary. - * - * WARNING: This is an internal function and not to be used by device - * drivers. It is subject to change without notice. - */ - intrec * inthand_add(const char *name, int irq, inthand2_t handler, void *arg, - intrmask_t *maskptr, int flags) + int pri, int flags) { - intrec *idesc; - int errcode = -1; - intrmask_t oldspl; + ithd *ithd = ithds[irq]; /* descriptor for the IRQ */ + intrec *head; /* chain of handlers for IRQ */ + intrec *idesc; /* descriptor for this handler */ + struct proc *p; /* interrupt thread */ + int errcode = 0; - if (ICU_LEN > 8 * sizeof *maskptr) { - printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", - ICU_LEN, 8 * sizeof *maskptr); - return (NULL); - } - if ((unsigned)irq >= ICU_LEN) { - printf("create_intr: requested irq%d too high, limit is %d\n", - irq, ICU_LEN -1); - return (NULL); - } + if (name == NULL) /* no name? */ + panic ("anonymous interrupt"); + if (ithd == NULL || ithd->it_ih == NULL) { + /* first handler for this irq. */ + if (ithd == NULL) { + ithd = malloc(sizeof (struct ithd), M_DEVBUF, M_WAITOK); + if (ithd == NULL) + return (NULL); + bzero(ithd, sizeof(struct ithd)); + ithd->irq = irq; + ithds[irq] = ithd; + } + /* + * If we have a fast interrupt, we need to set the + * handler address directly. Do that below. For a + * slow interrupt, we don't need to know more details, + * so do it here because it's tidier. + */ + if ((flags & INTR_FAST) == 0) { + /* + * Only create a kernel thread if we don't already + * have one. + */ + if (ithd->it_proc == NULL) { + errcode = kthread_create(ithd_loop, NULL, &p, + RFSTOPPED | RFHIGHPID, "irq%d: %s", irq, + name); + if (errcode) + panic("inthand_add: Can't create " + "interrupt thread"); + p->p_rtprio.type = RTP_PRIO_ITHREAD; + p->p_stat = SWAIT; /* we're idle */ - idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + /* Put in linkages. */ + ithd->it_proc = p; + p->p_ithd = ithd; + } else + snprintf(ithd->it_proc->p_comm, MAXCOMLEN, + "irq%d: %s", irq, name); + p->p_rtprio.prio = pri; + + /* + * The interrupt process must be in place, but + * not necessarily schedulable, before we + * initialize the ICU, since it may cause an + * immediate interrupt. + */ + if (icu_setup(irq, &sched_ithd, arg, flags) != 0) + panic("inthand_add: Can't initialize ICU"); + } + } else if ((flags & INTR_EXCL) != 0 + || (ithd->it_ih->flags & INTR_EXCL) != 0) { + /* + * We can't append the new handler if either + * list ithd or new handler do not allow + * interrupts to be shared. 
+ */ + if (bootverbose) + printf("\tdevice combination %s and %s " + "doesn't support shared irq%d\n", + ithd->it_ih->name, name, irq); + return(NULL); + } else if (flags & INTR_FAST) { + /* We can only have one fast interrupt by itself. */ + if (bootverbose) + printf("\tCan't add fast interrupt %s" + " to normal interrupt %s on irq%d", + name, ithd->it_ih->name, irq); + return (NULL); + } else { /* update p_comm */ + p = ithd->it_proc; + if (strlen(p->p_comm) + strlen(name) < MAXCOMLEN) { + strcat(p->p_comm, " "); + strcat(p->p_comm, name); + } else if (strlen(p->p_comm) == MAXCOMLEN) + p->p_comm[MAXCOMLEN - 1] = '+'; + else + strcat(p->p_comm, "+"); + } + idesc = malloc(sizeof (struct intrec), M_DEVBUF, M_WAITOK); if (idesc == NULL) - return NULL; - bzero(idesc, sizeof *idesc); + return (NULL); + bzero(idesc, sizeof (struct intrec)); + + idesc->handler = handler; + idesc->argument = arg; + idesc->flags = flags; + idesc->ithd = ithd; - if (name == NULL) - name = "???"; idesc->name = malloc(strlen(name) + 1, M_DEVBUF, M_WAITOK); if (idesc->name == NULL) { free(idesc, M_DEVBUF); - return NULL; + return (NULL); } strcpy(idesc->name, name); - idesc->handler = handler; - idesc->argument = arg; - idesc->maskptr = maskptr; - idesc->intr = irq; - idesc->flags = flags; - - /* block this irq */ - oldspl = splq(1 << irq); - - /* add irq to class selected by maskptr */ - errcode = add_intrdesc(idesc); - splx(oldspl); - - if (errcode != 0) { + /* Slow interrupts got set up above. */ + if ((flags & INTR_FAST) + && (icu_setup(irq, idesc->handler, idesc->argument, + idesc->flags) != 0) ) { if (bootverbose) - printf("\tintr_connect(irq%d) failed, result=%d\n", + printf("\tinthand_add(irq%d) failed, result=%d\n", irq, errcode); free(idesc->name, M_DEVBUF); free(idesc, M_DEVBUF); - idesc = NULL; + return NULL; } - + head = ithd->it_ih; /* look at chain of handlers */ + if (head) { + while (head->next != NULL) + head = head->next; /* find the end */ + head->next = idesc; /* hook it in there */ + } else + ithd->it_ih = idesc; /* put it up front */ + update_intrname(irq, idesc->name); return (idesc); } /* - * Deactivate and remove the interrupt handler descriptor data connected - * created by an earlier call of intr_connect() from the linked list and - * adjust theinterrupt masks if necessary. + * Deactivate and remove linked list the interrupt handler descriptor + * data connected created by an earlier call of inthand_add(), then + * adjust the interrupt masks if necessary. * - * Return the memory held by the interrupt handler descriptor data structure - * to the system. Make sure, the handler is not actively used anymore, before. + * Return the memory held by the interrupt handler descriptor data + * structure to the system. First ensure the handler is not actively + * in use. 
*/ int inthand_remove(intrec *idesc) { - intrec **hook, *head; - int irq; - int errcode = 0; - intrmask_t oldspl; + ithd *ithd; /* descriptor for the IRQ */ + intrec *ih; /* chain of handlers */ if (idesc == NULL) return (-1); + ithd = idesc->ithd; + ih = ithd->it_ih; - irq = idesc->intr; - - /* find pointer that keeps the reference to this interrupt descriptor */ - hook = find_pred(idesc, irq); - if (hook == NULL) + if (ih == idesc) /* first in the chain */ + ithd->it_ih = idesc->next; /* unhook it */ + else { + while ((ih != NULL) + && (ih->next != idesc) ) + ih = ih->next; + if (ih->next != idesc) return (-1); - - /* make copy of original list head, the line after may overwrite it */ - head = intreclist_head[irq]; - - /* unlink: make predecessor point to idesc->next instead of to idesc */ - *hook = idesc->next; - - /* now check whether the element we removed was the list head */ - if (idesc == head) { - - oldspl = splq(1 << irq); - - /* check whether the new list head is the only element on list */ - head = intreclist_head[irq]; - if (head != NULL) { - icu_unset(irq, intr_mux); - if (head->next != NULL) { - /* install the multiplex handler with new list head as argument */ - errcode = icu_setup(irq, intr_mux, head, 0, 0); - if (errcode == 0) - update_intrname(irq, NULL); - } else { - /* install the one remaining handler for this irq */ - errcode = icu_setup(irq, head->handler, - head->argument, - head->maskptr, head->flags); - if (errcode == 0) - update_intrname(irq, head->name); + ih->next = ih->next->next; } - } else { - /* revert to old handler, eg: strayintr */ - icu_unset(irq, idesc->handler); - } - splx(oldspl); - } - update_masks(idesc->maskptr, irq); + + if (ithd->it_ih == NULL) /* no handlers left, */ + icu_unset(ithd->irq, idesc->handler); free(idesc, M_DEVBUF); return (0); } diff --git a/sys/i386/isa/npx.c b/sys/i386/isa/npx.c index 637853e25264..8610e35f1f11 100644 --- a/sys/i386/isa/npx.c +++ b/sys/i386/isa/npx.c @@ -245,6 +245,12 @@ npx_probe(dev) setidt(16, probetrap, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(npx_intrno, probeintr, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); npx_idt_probeintr = idt[npx_intrno]; + + /* + * XXX This looks highly bogus, but it appears that npx_probe1 + * needs interrupts enabled. Does this make any difference + * here? + */ enable_intr(); result = npx_probe1(dev); disable_intr(); @@ -797,7 +803,7 @@ npxdna() /* * Record new context early in case frstor causes an IRQ13.
*/ - npxproc = curproc; + PCPU_SET(npxproc, CURPROC); curpcb->pcb_savefpu.sv_ex_sw = 0; /* * The following frstor may cause an IRQ13 when the state being @@ -834,16 +840,18 @@ npxsave(addr) fnsave(addr); /* fnop(); */ start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); #else /* SMP */ + int intrstate; u_char icu1_mask; u_char icu2_mask; u_char old_icu1_mask; u_char old_icu2_mask; struct gate_descriptor save_idt_npxintr; + intrstate = save_intr(); disable_intr(); old_icu1_mask = inb(IO_ICU1 + 1); old_icu2_mask = inb(IO_ICU2 + 1); @@ -851,12 +859,12 @@ npxsave(addr) outb(IO_ICU1 + 1, old_icu1_mask & ~(IRQ_SLAVE | npx0_imask)); outb(IO_ICU2 + 1, old_icu2_mask & ~(npx0_imask >> 8)); idt[npx_intrno] = npx_idt_probeintr; - enable_intr(); + write_eflags(intrstate); stop_emulating(); fnsave(addr); fnop(); start_emulating(); - npxproc = NULL; + PCPU_SET(npxproc, NULL); disable_intr(); icu1_mask = inb(IO_ICU1 + 1); /* masks may have changed */ icu2_mask = inb(IO_ICU2 + 1); @@ -866,7 +874,7 @@ npxsave(addr) (icu2_mask & ~(npx0_imask >> 8)) | (old_icu2_mask & (npx0_imask >> 8))); idt[npx_intrno] = save_idt_npxintr; - enable_intr(); /* back to usual state */ + restore_intr(intrstate); /* back to previous state */ #endif /* SMP */ } diff --git a/sys/i386/isa/vector.s b/sys/i386/isa/vector.s index 5447a90126a0..79f2320e6b8e 100644 --- a/sys/i386/isa/vector.s +++ b/sys/i386/isa/vector.s @@ -16,9 +16,10 @@ #include #endif +#define FAST_INTR_HANDLER_USES_ES 1 #ifdef FAST_INTR_HANDLER_USES_ES #define ACTUALLY_PUSHED 1 -#define MAYBE_MOVW_AX_ES movl %ax,%es +#define MAYBE_MOVW_AX_ES movw %ax,%es #define MAYBE_POPL_ES popl %es #define MAYBE_PUSHL_ES pushl %es #else @@ -36,11 +37,6 @@ .data ALIGN_DATA - .globl _intr_nesting_level -_intr_nesting_level: - .byte 0 - .space 3 - /* * Interrupt counters and names for export to vmstat(8) and friends. 
* @@ -58,7 +54,6 @@ _eintrcnt: _intrnames: .space NR_INTRNAMES * 16 _eintrnames: - .text /* diff --git a/sys/isa/atrtc.c b/sys/isa/atrtc.c index 15044abbaa3b..724f3c2817ba 100644 --- a/sys/isa/atrtc.c +++ b/sys/isa/atrtc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -93,10 +94,6 @@ #include #endif -#ifdef SMP -#define disable_intr() CLOCK_DISABLE_INTR() -#define enable_intr() CLOCK_ENABLE_INTR() - #ifdef APIC_IO #include /* The interrupt triggered by the 8254 (timer) chip */ @@ -104,7 +101,6 @@ int apic_8254_intr; static u_long read_intr_count __P((int vec)); static void setup_8254_mixed_mode __P((void)); #endif -#endif /* SMP */ /* * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we @@ -147,7 +143,9 @@ int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ static int beeping = 0; +#if 0 static u_int clk_imask = HWI_MASK | SWI_MASK; +#endif static const u_char daysinmonth[] = {31,28,31,30,31,30,31,31,30,31,30,31}; static u_int hardclock_max_count; static u_int32_t i8254_lastcount; @@ -205,8 +203,12 @@ SYSCTL_OPAQUE(_debug, OID_AUTO, i8254_timecounter, CTLFLAG_RD, static void clkintr(struct clockframe frame) { + int intrsave; + if (timecounter->tc_get_timecount == i8254_get_timecount) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); if (i8254_ticked) i8254_ticked = 0; else { @@ -214,7 +216,8 @@ clkintr(struct clockframe frame) i8254_lastcount = 0; } clkintr_pending = 0; - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); } timer_func(&frame); switch (timer0_state) { @@ -233,14 +236,17 @@ clkintr(struct clockframe frame) break; case ACQUIRE_PENDING: + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = TIMER_DIV(new_rate); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer_func = new_function; timer0_state = ACQUIRED; setdelayed(); @@ -249,7 +255,9 @@ clkintr(struct clockframe frame) case RELEASE_PENDING: if ((timer0_prescaler_count += timer0_max_count) >= hardclock_max_count) { + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); i8254_offset = i8254_get_timecount(NULL); i8254_lastcount = 0; timer0_max_count = hardclock_max_count; @@ -257,7 +265,8 @@ clkintr(struct clockframe frame) TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); timer0_prescaler_count = 0; timer_func = hardclock; timer0_state = RELEASED; @@ -404,11 +413,11 @@ DB_SHOW_COMMAND(rtc, rtc) static int getit(void) { - u_long ef; - int high, low; + int high, low, intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -417,7 +426,7 @@ getit(void) high = inb(TIMER_CNTR0); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return ((high << 8) | low); } @@ -523,6 +532,7 @@ sysbeepstop(void *chan) int sysbeep(int pitch, int period) { + int intrsave; int x = splclock(); if (acquire_timer2(TIMER_SQWAVE|TIMER_16BIT)) @@ -531,10 +541,13 @@ sysbeep(int pitch, int period) splx(x); return (-1); /* XXX Should be EBUSY, but nobody cares anyway. 
*/ } + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_CNTR2, pitch); outb(TIMER_CNTR2, (pitch>>8)); - enable_intr(); + CLOCK_UNLOCK(); + restore_intr(intrsave); if (!beeping) { /* enable counter2 output to speaker */ outb(IO_PPI, inb(IO_PPI) | 3); @@ -683,11 +696,12 @@ calibrate_clocks(void) static void set_timer_freq(u_int freq, int intr_freq) { - u_long ef; + int intrsave; int new_timer0_max_count; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); timer_freq = freq; new_timer0_max_count = hardclock_max_count = TIMER_DIV(intr_freq); if (new_timer0_max_count != timer0_max_count) { @@ -697,7 +711,7 @@ set_timer_freq(u_int freq, int intr_freq) outb(TIMER_CNTR0, timer0_max_count >> 8); } CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -711,15 +725,16 @@ set_timer_freq(u_int freq, int intr_freq) void i8254_restore(void) { - u_long ef; + int intrsave; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); outb(TIMER_MODE, TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT); outb(TIMER_CNTR0, timer0_max_count & 0xff); outb(TIMER_CNTR0, timer0_max_count >> 8); CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); } /* @@ -979,8 +994,8 @@ cpu_initclocks() { int diag; #ifdef APIC_IO - int apic_8254_trial; - struct intrec *clkdesc; + int apic_8254_trial, num_8254_ticks; + struct intrec *clkdesc, *rtcdesc; #endif /* APIC_IO */ if (statclock_disable) { @@ -1014,14 +1029,15 @@ cpu_initclocks() } else panic("APIC_IO: Cannot route 8254 interrupt to CPU"); } - - clkdesc = inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); - #else /* APIC_IO */ - inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, &clk_imask, + /* + * XXX Check the priority of this interrupt handler. I + * couldn't find anything suitable in the BSD/OS code (grog, + * 19 July 2000). + */ + /* Setup the PIC clk handler. The APIC handler is setup later */ + inthand_add("clk", 0, (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_EXCL); INTREN(IRQ0); @@ -1032,8 +1048,18 @@ cpu_initclocks() writertc(RTC_STATUSB, RTCSB_24HR); /* Don't bother enabling the statistics clock. */ - if (statclock_disable) + if (statclock_disable) { +#ifdef APIC_IO + /* + * XXX - if statclock is disabled, don't attempt the APIC + * trial. Not sure this is sane for APIC_IO. + */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif /* APIC_IO */ return; + } diag = rtcin(RTC_DIAG); if (diag != 0) printf("RTC BIOS diagnostic error %b\n", diag, RTCDG_BITS); @@ -1041,34 +1067,44 @@ cpu_initclocks() #ifdef APIC_IO if (isa_apic_irq(8) != 8) panic("APIC RTC != 8"); -#endif /* APIC_IO */ - inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, &stat_imask, - INTR_EXCL); - -#ifdef APIC_IO - INTREN(APIC_IRQ8); -#else - INTREN(IRQ8); -#endif /* APIC_IO */ - - writertc(RTC_STATUSB, rtc_statusb); - -#ifdef APIC_IO if (apic_8254_trial) { - + /* + * XXX - We use fast interrupts for clk and rtc long enough to + * perform the APIC probe and then revert to exclusive + * interrupts. 
+ */ + clkdesc = inthand_add("clk", apic_8254_intr, + (inthand2_t *)clkintr, NULL, PI_REALTIME, INTR_FAST); + INTREN(1 << apic_8254_intr); + + rtcdesc = inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, + PI_REALTIME, INTR_FAST); /* XXX */ + INTREN(APIC_IRQ8); + writertc(RTC_STATUSB, rtc_statusb); + printf("APIC_IO: Testing 8254 interrupt delivery\n"); while (read_intr_count(8) < 6) ; /* nothing */ - if (read_intr_count(apic_8254_intr) < 3) { + num_8254_ticks = read_intr_count(apic_8254_intr); + + /* disable and remove our fake handlers */ + INTRDIS(1 << apic_8254_intr); + inthand_remove(clkdesc); + + writertc(RTC_STATUSA, rtc_statusa); + writertc(RTC_STATUSB, RTCSB_24HR); + + INTRDIS(APIC_IRQ8); + inthand_remove(rtcdesc); + + if (num_8254_ticks < 3) { /* * The MP table is broken. * The 8254 was not connected to the specified pin * on the IO APIC. * Workaround: Limited variant of mixed mode. */ - INTRDIS(1 << apic_8254_intr); - inthand_remove(clkdesc); printf("APIC_IO: Broken MP table detected: " "8254 is not connected to " "IOAPIC #%d intpin %d\n", @@ -1087,13 +1123,27 @@ cpu_initclocks() } apic_8254_intr = apic_irq(0, 0); setup_8254_mixed_mode(); - inthand_add("clk", apic_8254_intr, - (inthand2_t *)clkintr, - NULL, &clk_imask, INTR_EXCL); - INTREN(1 << apic_8254_intr); } } + + /* Finally, setup the real clock handlers */ + inthand_add("clk", apic_8254_intr, (inthand2_t *)clkintr, NULL, + PI_REALTIME, INTR_EXCL); + INTREN(1 << apic_8254_intr); +#endif + + inthand_add("rtc", 8, (inthand2_t *)rtcintr, NULL, PI_REALTIME, + INTR_EXCL); +#ifdef APIC_IO + INTREN(APIC_IRQ8); +#else + INTREN(IRQ8); +#endif + + writertc(RTC_STATUSB, rtc_statusb); + +#ifdef APIC_IO if (apic_int_type(0, 0) != 3 || int_to_apicintpin[apic_8254_intr].ioapic != 0 || int_to_apicintpin[apic_8254_intr].int_pin != 0) @@ -1198,11 +1248,12 @@ static unsigned i8254_get_timecount(struct timecounter *tc) { u_int count; - u_long ef; + int intrsave; u_int high, low; - ef = read_eflags(); + intrsave = save_intr(); disable_intr(); + CLOCK_LOCK(); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); @@ -1212,7 +1263,7 @@ i8254_get_timecount(struct timecounter *tc) count = timer0_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || - ((count < 20 || (!(ef & PSL_I) && count < timer0_max_count / 2u)) && + ((count < 20 || (!(intrsave & PSL_I) && count < timer0_max_count / 2u)) && #ifdef APIC_IO #define lapic_irr1 ((volatile u_int *)&lapic)[0x210 / 4] /* XXX XXX */ /* XXX this assumes that apic_8254_intr is < 24. 
*/ @@ -1227,7 +1278,7 @@ i8254_get_timecount(struct timecounter *tc) i8254_lastcount = count; count += i8254_offset; CLOCK_UNLOCK(); - write_eflags(ef); + restore_intr(intrsave); return (count); } diff --git a/sys/isa/sio.c b/sys/isa/sio.c index 2725a201076b..a6f05e762ce8 100644 --- a/sys/isa/sio.c +++ b/sys/isa/sio.c @@ -95,16 +95,12 @@ #endif #include +/* XXX - this is ok because we only do sio fast interrupts on i386 */ #ifndef __i386__ #define disable_intr() #define enable_intr() #endif -#ifdef SMP -#define disable_intr() COM_DISABLE_INTR() -#define enable_intr() COM_ENABLE_INTR() -#endif /* SMP */ - #define LOTS_OF_EVENTS 64 /* helps separate urgent events from input */ #define CALLOUT_MASK 0x80 @@ -760,6 +756,7 @@ sioprobe(dev, xrid) u_int flags = device_get_flags(dev); int rid; struct resource *port; + int intrsave; rid = xrid; port = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, @@ -856,7 +853,9 @@ sioprobe(dev, xrid) * but mask them in the processor as well in case there are some * (misconfigured) shared interrupts. */ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); /* EXTRA DELAY? */ /* @@ -953,7 +952,8 @@ sioprobe(dev, xrid) CLR_FLAG(dev, COM_C_IIR_TXRDYBUG); } sio_setreg(com, com_cfcr, CFCR_8BITS); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); bus_release_resource(dev, SYS_RES_IOPORT, rid, port); return (iobase == siocniobase ? 0 : result); } @@ -993,7 +993,8 @@ sioprobe(dev, xrid) irqmap[3] = isa_irq_pending(); failures[9] = (sio_getreg(com, com_iir) & IIR_IMASK) - IIR_NOPEND; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); irqs = irqmap[1] & ~irqmap[0]; if (bus_get_resource(idev, SYS_RES_IRQ, 0, &xirq, NULL) == 0 && @@ -1181,7 +1182,6 @@ sioattach(dev, xrid) } else com->it_in.c_ispeed = com->it_in.c_ospeed = TTYDEF_SPEED; if (siosetwater(com, com->it_in.c_ispeed) != 0) { - enable_intr(); /* * Leave i/o resources allocated if this is a `cn'-level * console, so that other devices can't snarf them. @@ -1190,7 +1190,6 @@ sioattach(dev, xrid) bus_release_resource(dev, SYS_RES_IOPORT, rid, port); return (ENOMEM); } - enable_intr(); termioschars(&com->it_in); com->it_out = com->it_in; @@ -1340,7 +1339,7 @@ determined_type: ; RF_ACTIVE); if (com->irqres) { ret = BUS_SETUP_INTR(device_get_parent(dev), dev, com->irqres, - INTR_TYPE_TTY | INTR_TYPE_FAST, + INTR_TYPE_TTY | INTR_FAST, siointr, com, &com->cookie); if (ret) { ret = BUS_SETUP_INTR(device_get_parent(dev), dev, @@ -1424,6 +1423,8 @@ sioopen(dev, flag, mode, p) goto out; } } else { + int intrsave; + /* * The device isn't open, so there are no conflicts. * Initialize it. Initialization is done twice in many @@ -1483,7 +1484,9 @@ sioopen(dev, flag, mode, p) } } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); (void) inb(com->line_status_port); (void) inb(com->data_port); com->prev_modem_status = com->last_modem_status @@ -1495,7 +1498,8 @@ sioopen(dev, flag, mode, p) outb(com->intr_ctl_port, IER_ERXRDY | IER_ETXRDY | IER_ERLS | IER_EMSC); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); /* * Handle initial DCD. Callout devices get a fake initial * DCD (trapdoor DCD). If we are callout, then any sleeping @@ -1716,6 +1720,9 @@ siodtrwakeup(chan) wakeup(&com->dtr_wait); } +/* + * Call this function with COM_LOCK. It will return with the lock still held. 
+ */ static void sioinput(com) struct com_s *com; @@ -1725,6 +1732,7 @@ sioinput(com) u_char line_status; int recv_data; struct tty *tp; + int intrsave; buf = com->ibuf; tp = com->tp; @@ -1742,6 +1750,13 @@ sioinput(com) * call overhead). */ do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); incc = com->iptr - buf; if (tp->t_rawq.c_cc + incc > tp->t_ihiwat @@ -1763,10 +1778,18 @@ sioinput(com) tp->t_lflag &= ~FLUSHO; comstart(tp); } - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } else { do { + /* + * This may look odd, but it is using save-and-enable + * semantics instead of the save-and-disable semantics + * that are used everywhere else. + */ + intrsave = save_intr(); + COM_UNLOCK(); enable_intr(); line_status = buf[com->ierroff]; recv_data = *buf++; @@ -1782,7 +1805,8 @@ sioinput(com) recv_data |= TTY_PE; } (*linesw[tp->t_line].l_rint)(recv_data, tp); - disable_intr(); + restore_intr(intrsave); + COM_LOCK(); } while (buf < com->iptr); } com_events -= (com->iptr - com->ibuf); @@ -1893,12 +1917,16 @@ siointr1(com) if (recv_data == KEY_CR) { brk_state1 = recv_data; brk_state2 = 0; - } else if (brk_state1 == KEY_CR && (recv_data == KEY_TILDE || recv_data == KEY_CRTLB)) { + } else if (brk_state1 == KEY_CR + && (recv_data == KEY_TILDE + || recv_data == KEY_CRTLB)) { if (recv_data == KEY_TILDE) brk_state2 = recv_data; - else if (brk_state2 == KEY_TILDE && recv_data == KEY_CRTLB) { + else if (brk_state2 == KEY_TILDE + && recv_data == KEY_CRTLB) { breakpoint(); - brk_state1 = brk_state2 = 0; + brk_state1 = 0; + brk_state2 = 0; goto cont; } else brk_state2 = 0; @@ -1949,7 +1977,10 @@ siointr1(com) if (com->do_timestamp) microtime(&com->timestamp); ++com_events; +/* XXX - needs to go away when alpha gets ithreads */ +#ifdef __alpha__ schedsofttty(); +#endif #if 0 /* for testing input latency vs efficiency */ if (com->iptr - com->ibuf == 8) setsofttty(); @@ -2217,10 +2248,12 @@ sioioctl(dev, cmd, data, flag, p) return (0); } +/* software interrupt handler for SWI_TTY */ static void siopoll() { int unit; + int intrsave; if (com_events == 0) return; @@ -2239,7 +2272,9 @@ siopoll() * Discard any events related to never-opened or * going-away devices. 
*/ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); incc = com->iptr - com->ibuf; com->iptr = com->ibuf; if (com->state & CS_CHECKMSR) { @@ -2247,33 +2282,43 @@ siopoll() com->state &= ~CS_CHECKMSR; } com_events -= incc; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); continue; } if (com->iptr != com->ibuf) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); sioinput(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (com->state & CS_CHECKMSR) { u_char delta_modem_status; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); delta_modem_status = com->last_modem_status ^ com->prev_modem_status; com->prev_modem_status = com->last_modem_status; com_events -= LOTS_OF_EVENTS; com->state &= ~CS_CHECKMSR; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta_modem_status & MSR_DCD) (*linesw[tp->t_line].l_modem) (tp, com->prev_modem_status & MSR_DCD); } if (com->state & CS_ODONE) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); com_events -= LOTS_OF_EVENTS; com->state &= ~CS_ODONE; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (!(com->state & CS_BUSY) && !(com->extra_state & CSE_BUSYCHECK)) { timeout(siobusycheck, com, hz / 100); @@ -2301,6 +2346,7 @@ comparam(tp, t) u_char dlbl; int s; int unit; + int intrsave; /* do historical conversions */ if (t->c_ispeed == 0) @@ -2367,11 +2413,10 @@ comparam(tp, t) sio_setreg(com, com_fifo, com->fifo_image); } - /* - * This returns with interrupts disabled so that we can complete - * the speed change atomically. Keeping interrupts disabled is - * especially important while com_data is hidden. - */ + intrsave = save_intr(); + disable_intr(); + COM_LOCK(); + (void) siosetwater(com, t->c_ispeed); if (divisor != 0) { @@ -2459,7 +2504,8 @@ comparam(tp, t) if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); splx(s); comstart(tp); if (com->ibufold != NULL) { @@ -2478,6 +2524,7 @@ siosetwater(com, speed) u_char *ibuf; int ibufsize; struct tty *tp; + int intrsave; /* * Make the buffer size large enough to handle a softtty interrupt @@ -2488,20 +2535,16 @@ siosetwater(com, speed) cp4ticks = speed / 10 / hz * 4; for (ibufsize = 128; ibufsize < cp4ticks;) ibufsize <<= 1; - if (ibufsize == com->ibufsize) { - disable_intr(); + if (ibufsize == com->ibufsize) return (0); - } /* * Allocate input buffer. The extra factor of 2 in the size is * to allow for an error byte for each input byte. */ ibuf = malloc(2 * ibufsize, M_DEVBUF, M_NOWAIT); - if (ibuf == NULL) { - disable_intr(); + if (ibuf == NULL) return (ENOMEM); - } /* Initialize non-critical variables. */ com->ibufold = com->ibuf; @@ -2517,7 +2560,9 @@ siosetwater(com, speed) * Read current input buffer, if any. Continue with interrupts * disabled. 
*/ + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->iptr != com->ibuf) sioinput(com); @@ -2536,6 +2581,8 @@ siosetwater(com, speed) com->ibufend = ibuf + ibufsize; com->ierroff = ibufsize; com->ihighwater = ibuf + 3 * ibufsize / 4; + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2546,13 +2593,16 @@ comstart(tp) struct com_s *com; int s; int unit; + int intrsave; unit = DEV_TO_UNIT(tp->t_dev); com = com_addr(unit); if (com == NULL) return; s = spltty(); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (tp->t_state & TS_TTSTOP) com->state &= ~CS_TTGO; else @@ -2565,7 +2615,8 @@ comstart(tp) && com->state & CS_RTS_IFLOW) outb(com->modem_ctl_port, com->mcr_image |= MCR_RTS); } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { ttwwakeup(tp); splx(s); @@ -2581,7 +2632,9 @@ comstart(tp) sizeof com->obuf1); com->obufs[0].l_next = NULL; com->obufs[0].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2593,7 +2646,8 @@ comstart(tp) com->obufq.l_next = &com->obufs[0]; com->state |= CS_BUSY; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } if (tp->t_outq.c_cc != 0 && !com->obufs[1].l_queued) { com->obufs[1].l_tail @@ -2601,7 +2655,9 @@ comstart(tp) sizeof com->obuf2); com->obufs[1].l_next = NULL; com->obufs[1].l_queued = TRUE; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state & CS_BUSY) { qp = com->obufq.l_next; while ((next = qp->l_next) != NULL) @@ -2613,14 +2669,18 @@ comstart(tp) com->obufq.l_next = &com->obufs[1]; com->state |= CS_BUSY; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } tp->t_state |= TS_BUSY; } + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (com->state >= (CS_BUSY | CS_TTGO)) siointr1(com); /* fake interrupt to start output */ - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); ttwwakeup(tp); splx(s); } @@ -2631,11 +2691,14 @@ comstop(tp, rw) int rw; { struct com_s *com; + int intrsave; com = com_addr(DEV_TO_UNIT(tp->t_dev)); if (com == NULL || com->gone) return; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); if (rw & FWRITE) { if (com->hasfifo) #ifdef COM_ESP @@ -2662,7 +2725,8 @@ comstop(tp, rw) com_events -= (com->iptr - com->ibuf); com->iptr = com->ibuf; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); comstart(tp); } @@ -2674,6 +2738,7 @@ commctl(com, bits, how) { int mcr; int msr; + int intrsave; if (how == DMGET) { bits = TIOCM_LE; /* XXX - always enabled while open */ @@ -2705,7 +2770,9 @@ commctl(com, bits, how) mcr |= MCR_RTS; if (com->gone) return(0); + intrsave = save_intr(); disable_intr(); + COM_LOCK(); switch (how) { case DMSET: outb(com->modem_ctl_port, @@ -2718,7 +2785,8 @@ commctl(com, bits, how) outb(com->modem_ctl_port, com->mcr_image &= ~mcr); break; } - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); return (0); } @@ -2766,6 +2834,7 @@ comwakeup(chan) { struct com_s *com; int unit; + int intrsave; sio_timeout_handle = timeout(comwakeup, (void *)NULL, sio_timeout); @@ -2777,9 +2846,12 @@ comwakeup(chan) com = com_addr(unit); if (com != NULL && !com->gone && (com->state >= (CS_BUSY | CS_TTGO) || com->poll)) { + intrsave = save_intr(); disable_intr(); + COM_LOCK(); siointr1(com); - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); } } @@ -2801,10 +2873,13 @@ comwakeup(chan) u_int delta; u_long total; + intrsave = save_intr(); disable_intr(); + COM_LOCK(); 
delta = com->delta_error_counts[errnum]; com->delta_error_counts[errnum] = 0; - enable_intr(); + COM_UNLOCK(); + restore_intr(intrsave); if (delta == 0) continue; total = com->error_counts[errnum] += delta; diff --git a/sys/isofs/cd9660/cd9660_util.c b/sys/isofs/cd9660/cd9660_util.c index 2a11dc2f6361..d0f2e1c45c5b 100644 --- a/sys/isofs/cd9660/cd9660_util.c +++ b/sys/isofs/cd9660/cd9660_util.c @@ -41,6 +41,7 @@ */ #include +#include #include #include diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 6d0d915ae54e..f5ae66cc6138 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,8 @@ #include #include +#include +#include #include #include @@ -259,6 +262,11 @@ proc0_init(void *dummy __unused) p = &proc0; + /* + * Initialize magic number. + */ + p->p_magic = P_MAGIC; + /* * Initialize process and pgrp structures. */ @@ -364,11 +372,20 @@ proc0_init(void *dummy __unused) */ (void)chgproccnt(cred0.p_uidinfo, 1, 0); + LIST_INIT(&p->p_heldmtx); + LIST_INIT(&p->p_contested); + /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ - SET_CURPROC(p); + PCPU_SET(curproc, p); + + /* + * Enter the Giant mutex. + * XXX This should be done BEFORE cpu_startup(). + */ + mtx_enter(&Giant, MTX_DEF); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) @@ -389,7 +406,7 @@ proc0_post(void *dummy __unused) p->p_runtime = 0; } microuptime(&switchtime); - switchticks = ticks; + PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. @@ -418,7 +435,6 @@ SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) *************************************************************************** */ - /* * List of paths to try when searching for "init". */ @@ -444,6 +460,8 @@ start_init(void *dummy) char *ucp, **uap, *arg0, *arg1; struct proc *p; + mtx_enter(&Giant, MTX_DEF); + p = curproc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. */ @@ -562,16 +580,12 @@ static void create_init(const void *udata __unused) { int error; - int s; - s = splhigh(); - error = fork1(&proc0, RFFDG | RFPROC, &initproc); + error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); initproc->p_flag |= P_INMEM | P_SYSTEM; cpu_set_fork_handler(initproc, start_init, NULL); - remrunqueue(initproc); - splx(s); } SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) @@ -581,6 +595,9 @@ SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) static void kick_init(const void *udata __unused) { + mtx_enter(&sched_lock, MTX_SPIN); + initproc->p_stat = SRUN; setrunqueue(initproc); + mtx_exit(&sched_lock, MTX_SPIN); } SYSINIT(kickinit,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 11e63a7d0592..33eef3ca3046 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -70,11 +70,7 @@ static void initclocks __P((void *dummy)); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. 
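In init_main.c above, create_init() now forks init with RFSTOPPED so the child can be fully set up (P_INMEM/P_SYSTEM flags, the fork handler) before it is ever eligible to run, and kick_init() later marks it SRUN and calls setrunqueue() while holding sched_lock. A rough user-space analogue of that create-stopped-then-kick sequence, with a condition variable standing in for the run queue (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
static int runnable;                      /* models p_stat == SRUN */

static void *
init_body(void *arg)
{
        pthread_mutex_lock(&lk);
        while (!runnable)                 /* created "stopped" */
                pthread_cond_wait(&cv, &lk);
        pthread_mutex_unlock(&lk);
        printf("init: running after being kicked\n");
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, init_body, NULL);    /* create_init() */
        /* ... finish initialization before letting the child run ... */
        pthread_mutex_lock(&lk);                      /* kick_init() */
        runnable = 1;
        pthread_cond_signal(&cv);
        pthread_mutex_unlock(&lk);
        pthread_join(t, NULL);
        return (0);
}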
*/ -#if defined(SMP) && defined(BETTER_CLOCK) long cp_time[CPUSTATES]; -#else -static long cp_time[CPUSTATES]; -#endif long tk_cancc; long tk_nin; @@ -156,7 +152,7 @@ hardclock(frame) register struct proc *p; p = curproc; - if (p) { + if (p != idleproc) { register struct pstats *pstats; /* @@ -325,12 +321,12 @@ statclock(frame) struct rusage *ru; struct vmspace *vm; - if (curproc != NULL && CLKF_USERMODE(frame)) { + if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. */ - p = curproc; + p = prevproc; if (p->p_flag & P_PROFIL) addupc_intr(p, CLKF_PC(frame), 1); #if defined(SMP) && defined(BETTER_CLOCK) @@ -379,20 +375,21 @@ statclock(frame) * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ - p = curproc; - if (CLKF_INTR(frame)) { - if (p != NULL) - p->p_iticks++; + p = prevproc; + if (p->p_ithd) { + p->p_iticks++; cp_time[CP_INTR]++; - } else if (p != NULL) { + } else { p->p_sticks++; - cp_time[CP_SYS]++; - } else - cp_time[CP_IDLE]++; + if (p != idleproc) + cp_time[CP_SYS]++; + else + cp_time[CP_IDLE]++; + } } pscnt = psdiv; - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. */ diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index df71fe07bf45..7fccc1689409 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -349,7 +349,6 @@ exit1(p, rv) * * Other substructures are freed from wait(). */ - SET_CURPROC(NULL); if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index f24c97e9c4f4..0aa31ab857bc 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include +#include + static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); static int fast_vfork = 1; @@ -131,7 +134,8 @@ rfork(p, uap) int error; struct proc *p2; - error = fork1(p, uap->flags, &p2); + /* mask kernel only flags out of the user flags */ + error = fork1(p, uap->flags & ~RFKERNELONLY, &p2); if (error == 0) { p->p_retval[0] = p2 ? p2->p_pid : 0; p->p_retval[1] = 0; @@ -177,17 +181,19 @@ SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, int fork1(p1, flags, procp) - struct proc *p1; + struct proc *p1; /* parent proc */ int flags; - struct proc **procp; + struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; + int trypid; int ok; static int pidchecked = 0; struct forklist *ep; + /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); @@ -278,47 +284,56 @@ fork1(p1, flags, procp) /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from nextpid+1 through pidchecked-1). + * + * If RFHIGHPID is set (used during system boot), do not allocate + * low-numbered pids. */ - nextpid++; + trypid = nextpid + 1; + if (flags & RFHIGHPID) { + if (trypid < 10) { + trypid = 10; + } + } else { if (randompid) - nextpid += arc4random() % randompid; + trypid += arc4random() % randompid; + } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. 
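The fork1() comment above describes the pid allocator: probe upward from a local trypid, remember the range already verified free of collisions (nextpid+1 through pidchecked-1), skip the very low pids when RFHIGHPID is set, and on overflow wrap back to just above the range where long-lived daemons live. A compact sketch of only the wrap/skip arithmetic, with the allproc/zombproc scan reduced to a stub in-use test (the thresholds mirror the code above):

#include <stdio.h>

#define PID_MAX 99999

static int
pid_in_use(int pid)
{
        /* stand-in for scanning the allproc and zombproc lists */
        return (pid == 100 || pid == 101);
}

static int
next_free_pid(int trypid, int highpid)
{
        if (highpid && trypid < 10)
                trypid = 10;              /* RFHIGHPID: skip the lowest pids */
        for (;;) {
                if (trypid >= PID_MAX) {
                        trypid = trypid % PID_MAX;
                        if (trypid < 100)
                                trypid += 100;    /* daemons live down here */
                }
                if (!pid_in_use(trypid))
                        return (trypid);
                trypid++;
        }
}

int
main(void)
{
        printf("%d\n", next_free_pid(PID_MAX + 5, 0));   /* wraps to 105 */
        printf("%d\n", next_free_pid(3, 1));             /* RFHIGHPID: 10 */
        return (0);
}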
*/ - if (nextpid >= PID_MAX) { - nextpid = nextpid % PID_MAX; - if (nextpid < 100) - nextpid += 100; + if (trypid >= PID_MAX) { + trypid = trypid % PID_MAX; + if (trypid < 100) + trypid += 100; pidchecked = 0; } - if (nextpid >= pidchecked) { + if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater - * than nextpid, so we can avoid checking for a while. + * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) { - while (p2->p_pid == nextpid || - p2->p_pgrp->pg_id == nextpid || - p2->p_session->s_sid == nextpid) { - nextpid++; - if (nextpid >= pidchecked) + while (p2->p_pid == trypid || + p2->p_pgrp->pg_id == trypid || + p2->p_session->s_sid == trypid) { + trypid++; + if (trypid >= pidchecked) goto retry; } - if (p2->p_pid > nextpid && pidchecked > p2->p_pid) + if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; - if (p2->p_pgrp->pg_id > nextpid && + if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; - if (p2->p_session->s_sid > nextpid && + if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } @@ -331,10 +346,18 @@ fork1(p1, flags, procp) p2 = newproc; p2->p_stat = SIDL; /* protect against others */ - p2->p_pid = nextpid; + p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + /* + * RFHIGHPID does not mess with the nextpid counter during boot. + */ + if (flags & RFHIGHPID) + pidchecked = 0; + else + nextpid = trypid; + /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, @@ -456,6 +479,8 @@ fork1(p1, flags, procp) p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_children); + LIST_INIT(&p2->p_heldmtx); + LIST_INIT(&p2->p_contested); #ifdef KTRACE /* @@ -496,14 +521,19 @@ fork1(p1, flags, procp) } /* - * Make child runnable and add to run queue. + * If RFSTOPPED not requested, make child runnable and add to + * run queue. */ microtime(&(p2->p_stats->p_start)); p2->p_acflag = AFORK; - (void) splhigh(); - p2->p_stat = SRUN; - setrunqueue(p2); - (void) spl0(); + if ((flags & RFSTOPPED) == 0) { + splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + p2->p_stat = SRUN; + setrunqueue(p2); + mtx_exit(&sched_lock, MTX_SPIN); + spl0(); + } /* * Now can be swapped. diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c new file mode 100644 index 000000000000..840c0f98fd56 --- /dev/null +++ b/sys/kern/kern_idle.c @@ -0,0 +1,108 @@ +/*- + * Copyright (c) 2000, All rights reserved. 
See /usr/src/COPYRIGHT + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#include +#endif + +#include +#include +#include +#include + +#include +#include + +#ifdef SMP_DEBUG +#include +#include +#include +#endif + +static void idle_setup(void *dummy); +SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) + +static void idle_proc(void *dummy); + +/* + * setup per-cpu idle process contexts + */ +static void +idle_setup(void *dummy) +{ + struct globaldata *gd; + int error; + + SLIST_FOREACH(gd, &cpuhead, gd_allcpu) { +#ifdef SMP + error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, + RFSTOPPED|RFHIGHPID, "idle: cpu%d", + gd->gd_cpuid); +#else + error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, + RFSTOPPED|RFHIGHPID, "idle"); +#endif + if (error) + panic("idle_setup: kthread_create error %d\n", error); + + gd->gd_idleproc->p_stat = SWAIT; + } +} + +/* + * idle process context + */ +static void +idle_proc(void *dummy) +{ + int count; + + for (;;) { + /* + * Clear switchtime, which prevents the idle process's time + * from being counted. + switchtime.tv_usec = 0; + switchtime.tv_sec = 0; + */ + + mtx_assert(&Giant, MA_NOTOWNED); + + count = 0; + + while (count >= 0 && procrunnable() == 0) { + /* + * This is a good place to put things to be done in + * the background, including sanity checks. + */ + if (count++ < 0) + CTR0(KTR_PROC, "idle_proc: timed out waiting" + " for a process"); + } + + mtx_enter(&sched_lock, MTX_SPIN); + idleproc->p_stat = SWAIT; + mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); + spl0(); + } +} diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c index 6373750e06cd..e684b78c032a 100644 --- a/sys/kern/kern_kthread.c +++ b/sys/kern/kern_kthread.c @@ -52,24 +52,33 @@ kproc_start(udata) int error; error = kthread_create((void (*)(void *))kp->func, NULL, - kp->global_procpp, kp->arg0); + kp->global_procpp, 0, kp->arg0); if (error) panic("kproc_start: %s: error %d", kp->arg0, error); } /* - * Create a kernel process/thread/whatever. It shares it's address space + * Create a kernel process/thread/whatever. It shares its address space * with proc0 - ie: kernel only. + * + * func is the function to start. + * arg is the parameter to pass to function on first startup. + * newpp is the return value pointing to the thread's struct proc. + * flags are flags to fork1 (in unistd.h) + * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). */ int kthread_create(void (*func)(void *), void *arg, - struct proc **newpp, const char *fmt, ...) + struct proc **newpp, int flags, const char *fmt, ...) { int error; va_list ap; struct proc *p2; - error = fork1(&proc0, RFMEM | RFFDG | RFPROC, &p2); + if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */) + panic("kthread_create called too soon"); + + error = fork1(&proc0, RFMEM | RFFDG | RFPROC | flags, &p2); if (error) return error; diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c new file mode 100644 index 000000000000..1ac3f584d9ef --- /dev/null +++ b/sys/kern/kern_mutex.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. + */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock hierarchy violation occurs + * - locks are held when going to sleep.
+ */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, 
line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquiring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on.
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no ancestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutexes which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep!= NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ?
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 7ec2628a187f..4800747cc861 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -73,6 +73,7 @@ u_long pgrphash; struct proclist allproc; struct proclist zombproc; vm_zone_t proc_zone; +vm_zone_t ithread_zone; /* * Initialize global process hashing structures. 
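kern_mutex.c above implements the witness checks: each process keeps the list of mutexes it currently holds (p_heldmtx), and the witness code remembers the order in which lock types have been acquired so that acquiring them in the opposite order later can be reported as a lock order reversal. The sketch below is a heavily simplified model that records pairwise order between lock names rather than the kernel's parent/child witness tree; everything in it is illustrative, not the real data structures.

#include <stdio.h>
#include <string.h>

#define MAXHELD  8
#define MAXPAIRS 64

static const char *held[MAXHELD];         /* models p->p_heldmtx */
static int nheld;
static const char *pair[MAXPAIRS][2];     /* observed "a acquired before b" */
static int npairs;

static int
seen_before(const char *a, const char *b)
{
        int i;

        for (i = 0; i < npairs; i++)
                if (strcmp(pair[i][0], a) == 0 && strcmp(pair[i][1], b) == 0)
                        return (1);
        return (0);
}

static void
witness_acquire(const char *name)
{
        int i;

        for (i = 0; i < nheld; i++) {
                if (seen_before(name, held[i]))
                        printf("lock order reversal: %s then %s\n",
                            held[i], name);
                else if (!seen_before(held[i], name) && npairs < MAXPAIRS) {
                        pair[npairs][0] = held[i];
                        pair[npairs][1] = name;
                        npairs++;
                }
        }
        held[nheld++] = name;
}

static void
witness_release(const char *name)
{
        int i;

        for (i = 0; i < nheld; i++)
                if (strcmp(held[i], name) == 0) {
                        held[i] = held[--nheld];
                        return;
                }
}

int
main(void)
{
        witness_acquire("Giant");
        witness_acquire("sched lock");
        witness_release("sched lock");
        witness_release("Giant");

        witness_acquire("sched lock");
        witness_acquire("Giant");         /* reported as a reversal */
        return (0);
}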
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index f2a8fa6b7e8b..3344f7eda554 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -530,7 +530,7 @@ calcru(p, up, sp, ip) microuptime(&tv); if (timevalcmp(&tv, &switchtime, <)) printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", - switchtime.tv_sec, switchtime.tv_usec, + switchtime.tv_sec, switchtime.tv_usec, tv.tv_sec, tv.tv_usec); else tu += (tv.tv_usec - switchtime.tv_usec) + diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index 9c744c778e3f..8a6ccd8f8e40 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -63,6 +63,7 @@ #include #include +#include #include #include /* smp_active, cpuid */ @@ -524,6 +525,11 @@ panic(const char *fmt, ...) va_list ap; static char buf[256]; +#ifdef SMP + /* Only 1 CPU can panic at a time */ + s_lock(&panic_lock); +#endif + bootopt = RB_AUTOBOOT | RB_DUMP; if (panicstr) bootopt |= RB_NOSYNC; @@ -537,8 +543,7 @@ panic(const char *fmt, ...) va_end(ap); printf("panic: %s\n", buf); #ifdef SMP - /* three seperate prints in case of an unmapped page and trap */ - printf("mp_lock = %08x; ", mp_lock); + /* two separate prints in case of an unmapped page and trap */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index a2ff2ef8913b..a39a4c805b19 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1465,6 +1466,8 @@ killproc(p, why) struct proc *p; char *why; { + CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index c0f7f6479e48..d9a599afbb89 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include + static void uio_yield __P((void)); int @@ -421,10 +424,12 @@ uio_yield() int s; p = curproc; - p->p_priority = p->p_usrpri; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + p->p_priority = p->p_usrpri; setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 3146f9e856ef..8f47dba110f9 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -29,27 +29,39 @@ #include #include #include +#include #include #include #include +#include + /* * We have NQS (32) run queues per scheduling class. For the normal * class, there are 128 priorities scaled onto these 32 queues. New * processes are added to the last entry in each queue, and processes * are selected for running by taking them from the head and maintaining - * a simple FIFO arrangement. Realtime and Idle priority processes have - * and explicit 0-31 priority which maps directly onto their class queue - * index. When a queue has something in it, the corresponding bit is - * set in the queuebits variable, allowing a single read to determine - * the state of all 32 queues and then a ffs() to find the first busy + * a simple FIFO arrangement. + * + * Interrupt, real time and idle priority processes have an explicit + * 0-31 priority which maps directly onto their class queue index.
+ * When a queue has something in it, the corresponding bit is set in + * the queuebits variable, allowing a single read to determine the + * state of all 32 queues and then a ffs() to find the first busy * queue. + * + * XXX This needs fixing. First, we only have one idle process, so we + * hardly need 32 queues for it. Secondly, the number of classes + * makes things unwieldy. We should be able to merge them into a + * single 96 or 128 entry queue. */ -struct rq queues[NQS]; -struct rq rtqueues[NQS]; -struct rq idqueues[NQS]; -u_int32_t queuebits; +struct rq itqueues[NQS]; /* interrupt threads */ +struct rq rtqueues[NQS]; /* real time processes */ +struct rq queues[NQS]; /* time sharing processes */ +struct rq idqueues[NQS]; /* idle process */ +u_int32_t itqueuebits; u_int32_t rtqueuebits; +u_int32_t queuebits; u_int32_t idqueuebits; /* @@ -61,8 +73,9 @@ rqinit(void *dummy) int i; for (i = 0; i < NQS; i++) { - TAILQ_INIT(&queues[i]); + TAILQ_INIT(&itqueues[i]); TAILQ_INIT(&rtqueues[i]); + TAILQ_INIT(&queues[i]); TAILQ_INIT(&idqueues[i]); } } @@ -81,22 +94,37 @@ setrunqueue(struct proc *p) struct rq *q; u_int8_t pri; - KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN")); - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - pri = p->p_priority >> 2; - q = &queues[pri]; - queuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \ + p->p_comm)); + + /* + * Decide which class we want to run. We now have four + * queues, and this is becoming ugly. We should be able to + * collapse the first three classes into a single contiguous + * queue. XXX FIXME. + */ + CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */ + pri = p->p_rtprio.prio; + q = &itqueues[pri]; + itqueuebits |= 1 << pri; + } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */ p->p_rtprio.type == RTP_PRIO_FIFO) { pri = p->p_rtprio.prio; q = &rtqueues[pri]; rtqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { + } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */ + pri = p->p_priority >> 2; + q = &queues[pri]; + queuebits |= 1 << pri; + } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */ pri = p->p_rtprio.prio; q = &idqueues[pri]; idqueuebits |= 1 << pri; } else { - panic("setrunqueue: invalid rtprio type"); + panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type); } p->p_rqindex = pri; /* remember the queue index */ TAILQ_INSERT_TAIL(q, p, p_procq); @@ -114,14 +142,20 @@ remrunqueue(struct proc *p) u_int32_t *which; u_int8_t pri; + CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + mtx_assert(&sched_lock, MA_OWNED); pri = p->p_rqindex; - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; + if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { + q = &itqueues[pri]; + which = &itqueuebits; } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || p->p_rtprio.type == RTP_PRIO_FIFO) { q = &rtqueues[pri]; which = &rtqueuebits; + } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + q = &queues[pri]; + which = &queuebits; } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { q = &idqueues[pri]; which = &idqueuebits; @@ -142,11 +176,17 @@ remrunqueue(struct proc *p) * loop to avoid the more expensive (and destructive) chooseproc(). 
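The run-queue layout described above pairs each class of 32 queues with a 32-bit word in which a set bit marks a non-empty queue, so the scheduler can find the best runnable process with a single ffs() rather than scanning every queue. A stripped-down model of one class, with chooseproc() falling back to an idle answer when all queues are empty (the struct layout and sizes here are illustrative):

#include <stdio.h>
#include <strings.h>                      /* ffs() */

#define NQS 32

struct rq { int pids[16]; int n; };

static struct rq queues[NQS];
static unsigned int queuebits;

static void
setrunqueue(int pid, int pri)             /* pri: 0 (best) .. 31 */
{
        queues[pri].pids[queues[pri].n++] = pid;
        queuebits |= 1u << pri;
}

static int
chooseproc(void)
{
        int pri, pid, i;

        if (queuebits == 0)
                return (-1);              /* the kernel returns idleproc here */
        pri = ffs(queuebits) - 1;         /* lowest set bit = best queue */
        pid = queues[pri].pids[0];
        for (i = 1; i < queues[pri].n; i++)       /* FIFO within a queue */
                queues[pri].pids[i - 1] = queues[pri].pids[i];
        if (--queues[pri].n == 0)
                queuebits &= ~(1u << pri);
        return (pid);
}

int
main(void)
{
        setrunqueue(100, 20);
        setrunqueue(101, 4);
        setrunqueue(102, 4);
        printf("%d %d %d %d\n", chooseproc(), chooseproc(),
            chooseproc(), chooseproc());          /* 101 102 100 -1 */
        return (0);
}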
* * MP SAFE. CALLED WITHOUT THE MP LOCK + * + * XXX I doubt this. It's possibly fail-safe, but there's obviously + * the case here where one of the bits words gets loaded, the + * processor gets preempted, and by the time it returns from this + * function, some other processor has picked the runnable process. + * What am I missing? (grog, 23 July 2000). */ u_int32_t procrunnable(void) { - return (rtqueuebits || queuebits || idqueuebits); + return (itqueuebits || rtqueuebits || queuebits || idqueuebits); } /* @@ -173,7 +213,12 @@ chooseproc(void) u_char id; #endif - if (rtqueuebits) { + mtx_assert(&sched_lock, MA_OWNED); + if (itqueuebits) { + pri = ffs(itqueuebits) - 1; + q = &itqueues[pri]; + which = &itqueuebits; + } else if (rtqueuebits) { pri = ffs(rtqueuebits) - 1; q = &rtqueues[pri]; which = &rtqueuebits; @@ -186,10 +231,12 @@ chooseproc(void) q = &idqueues[pri]; which = &idqueuebits; } else { - return NULL; + CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %x", + sched_lock.mtx_lock); + idleproc->p_stat = SRUN; + return idleproc; } p = TAILQ_FIRST(q); - KASSERT(p, ("chooseproc: no proc on busy queue")); #ifdef SMP /* wander down the current run queue for this pri level for a match */ id = cpuid; @@ -201,6 +248,9 @@ chooseproc(void) } } #endif + CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + KASSERT(p, ("chooseproc: no proc on busy queue")); TAILQ_REMOVE(q, p, p_procq); if (TAILQ_EMPTY(q)) *which &= ~(1 << pri); diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index f747759b0007..f397f4095551 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,7 @@ #include #include #include +#include static void sched_setup __P((void *dummy)); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) @@ -135,7 +137,7 @@ maybe_resched(chk) * standard process becomes runaway cpu-bound, the system can lockup * due to idle-scheduler processes in wakeup never getting any cpu. */ - if (p == NULL) { + if (p == idleproc) { #if 0 need_resched(); #endif @@ -169,7 +171,7 @@ roundrobin(arg) need_resched(); forward_roundrobin(); #else - if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type)) + if (p == idleproc || RTP_PRIO_NEED_RR(p->p_rtprio.type)) need_resched(); #endif @@ -284,6 +286,8 @@ schedcpu(arg) * Increment time in/out of memory and sleep time * (if sleeping). We ignore overflow; with 16-bit int's * (remember them?) overflow takes 45 days. + if (p->p_stat == SWAIT) + continue; */ p->p_swtime++; if (p->p_stat == SSLEEP || p->p_stat == SSTOP) @@ -295,7 +299,12 @@ schedcpu(arg) */ if (p->p_slptime > 1) continue; - s = splhigh(); /* prevent state changes and protect run queue */ + /* + * prevent state changes and protect run queue + */ + s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + /* * p_pctcpu is only for ps. */ @@ -325,6 +334,7 @@ schedcpu(arg) } else p->p_priority = p->p_usrpri; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } vmmeter(); @@ -364,6 +374,7 @@ updatepri(p) static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) +#if 0 /* * During autoconfiguration or after a panic, a sleep will simply * lower the priority briefly to allow interrupts, then return. @@ -374,6 +385,7 @@ static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; * higher to block network software interrupts after panics. 
*/ int safepri; +#endif void sleepinit(void) @@ -406,11 +418,15 @@ tsleep(ident, priority, wmesg, timo) struct proc *p = curproc; int s, sig, catch = priority & PCATCH; struct callout_handle thandle; + int rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif + mtx_assert(&Giant, MA_OWNED); + mtx_enter(&sched_lock, MTX_SPIN); + s = splhigh(); if (cold || panicstr) { /* @@ -419,10 +435,14 @@ tsleep(ident, priority, wmesg, timo) * don't run any other procs or panic below, * in case this is the idle process and already asleep. */ + mtx_exit(&sched_lock, MTX_SPIN); +#if 0 splx(safepri); +#endif splx(s); return (0); } + KASSERT(p != NULL, ("tsleep1")); KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep")); /* @@ -436,6 +456,9 @@ tsleep(ident, priority, wmesg, timo) p->p_wmesg = wmesg; p->p_slptime = 0; p->p_priority = priority & PRIMASK; + p->p_nativepri = p->p_priority; + CTR4(KTR_PROC, "tsleep: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); if (timo) thandle = timeout(endtsleep, (void *)p, timo); @@ -449,6 +472,9 @@ tsleep(ident, priority, wmesg, timo) * stopped, p->p_wchan will be 0 upon return from CURSIG. */ if (catch) { + CTR4(KTR_PROC, + "tsleep caught: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); p->p_flag |= P_SINTR; if ((sig = CURSIG(p))) { if (p->p_wchan) @@ -465,6 +491,9 @@ tsleep(ident, priority, wmesg, timo) p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + CTR4(KTR_PROC, + "tsleep resume: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); resume: curpriority = p->p_usrpri; splx(s); @@ -476,7 +505,8 @@ tsleep(ident, priority, wmesg, timo) if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (EWOULDBLOCK); + rval = EWOULDBLOCK; + goto out; } } else if (timo) untimeout(endtsleep, (void *)p, thandle); @@ -486,14 +516,19 @@ tsleep(ident, priority, wmesg, timo) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) - return (EINTR); - return (ERESTART); + rval = EINTR; + else + rval = ERESTART; + goto out; } +out: + mtx_exit(&sched_lock, MTX_SPIN); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (0); + + return (rval); } /* @@ -519,13 +554,14 @@ asleep(void *ident, int priority, const char *wmesg, int timo) int s; /* - * splhigh() while manipulating sleep structures and slpque. + * obtain sched_lock while manipulating sleep structures and slpque. * * Remove preexisting wait condition (if any) and place process * on appropriate slpque, but do not put process to sleep. 
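tsleep() and wakeup() above hash the wait-channel pointer into one of TABLESIZE sleep queues through the LOOKUP() macro, so a wakeup only has to walk the sleepers whose channels collide in that bucket instead of every sleeping process. A small stand-alone model of the hashing and the bucket scan (the struct and the bucket count are illustrative):

#include <stdint.h>
#include <stdio.h>

#define TABLESIZE 128                     /* must be a power of two */
#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))

struct sleeper {
        const void *wchan;                /* what this sleeper is waiting on */
        struct sleeper *next;
};

static struct sleeper *slpque[TABLESIZE];

static void
sleep_on(struct sleeper *s, const void *chan)
{
        s->wchan = chan;
        s->next = slpque[LOOKUP(chan)];
        slpque[LOOKUP(chan)] = s;
}

static int
wakeup_chan(const void *chan)
{
        struct sleeper **sp = &slpque[LOOKUP(chan)];
        int n = 0;

        while (*sp != NULL) {
                if ((*sp)->wchan == chan) {       /* wake: unlink from bucket */
                        (*sp)->wchan = NULL;
                        *sp = (*sp)->next;
                        n++;
                } else
                        sp = &(*sp)->next;
        }
        return (n);
}

int
main(void)
{
        struct sleeper a, b;
        int chan1, chan2;

        sleep_on(&a, &chan1);
        sleep_on(&b, &chan2);
        printf("woke %d on chan1, %d on chan2\n",
            wakeup_chan(&chan1), wakeup_chan(&chan2));    /* 1 and 1 */
        return (0);
}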
*/ s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan != NULL) unsleep(p); @@ -539,6 +575,7 @@ asleep(void *ident, int priority, const char *wmesg, int timo) TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); return(0); @@ -560,8 +597,12 @@ int await(int priority, int timo) { struct proc *p = curproc; + int rval = 0; int s; + mtx_assert(&Giant, MA_OWNED); + mtx_enter(&sched_lock, MTX_SPIN); + s = splhigh(); if (p->p_wchan != NULL) { @@ -616,7 +657,8 @@ await(int priority, int timo) if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (EWOULDBLOCK); + rval = EWOULDBLOCK; + goto out; } } else if (timo) untimeout(endtsleep, (void *)p, thandle); @@ -626,8 +668,10 @@ await(int priority, int timo) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) - return (EINTR); - return (ERESTART); + rval = EINTR; + else + rval = ERESTART; + goto out; } #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) @@ -655,7 +699,10 @@ await(int priority, int timo) */ p->p_asleep.as_priority = 0; - return (0); +out: + mtx_exit(&sched_lock, MTX_SPIN); + + return (rval); } /* @@ -673,7 +720,11 @@ endtsleep(arg) int s; p = (struct proc *)arg; + CTR4(KTR_PROC, + "endtsleep: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan) { if (p->p_stat == SSLEEP) setrunnable(p); @@ -681,6 +732,7 @@ endtsleep(arg) unsleep(p); p->p_flag |= P_TIMEOUT; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -694,10 +746,12 @@ unsleep(p) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan) { TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); p->p_wchan = 0; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -713,6 +767,7 @@ wakeup(ident) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); qp = &slpque[LOOKUP(ident)]; restart: TAILQ_FOREACH(p, qp, p_procq) { @@ -721,6 +776,9 @@ wakeup(ident) p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR4(KTR_PROC, + "wakeup: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; @@ -737,6 +795,7 @@ wakeup(ident) } } } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -754,6 +813,7 @@ wakeup_one(ident) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); qp = &slpque[LOOKUP(ident)]; TAILQ_FOREACH(p, qp, p_procq) { @@ -762,6 +822,9 @@ wakeup_one(ident) p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR4(KTR_PROC, + "wakeup1: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; @@ -778,6 +841,7 @@ wakeup_one(ident) } } } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -791,7 +855,9 @@ mi_switch() struct timeval new_switchtime; register struct proc *p = curproc; /* XXX */ register struct rlimit *rlim; + int giantreleased; int x; + WITNESS_SAVE_DECL(Giant); /* * XXX this spl is almost unnecessary. 
It is partly to allow for @@ -812,6 +878,14 @@ mi_switch() */ x = splstatclock(); + CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY); + + WITNESS_SAVE(&Giant, Giant); + for (giantreleased = 0; mtx_owned(&Giant); giantreleased++) + mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH); + #ifdef SIMPLELOCK_DEBUG if (p->p_simple_locks) printf("sleep: holding simple lock\n"); @@ -823,7 +897,7 @@ mi_switch() microuptime(&new_switchtime); if (timevalcmp(&new_switchtime, &switchtime, <)) { printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", - switchtime.tv_sec, switchtime.tv_usec, + switchtime.tv_sec, switchtime.tv_usec, new_switchtime.tv_sec, new_switchtime.tv_usec); new_switchtime = switchtime; } else { @@ -834,6 +908,8 @@ mi_switch() /* * Check if the process exceeds its cpu resource allocation. * If over max, kill it. + * + * XXX drop sched_lock, pickup Giant */ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { @@ -854,10 +930,18 @@ mi_switch() */ cnt.v_swtch++; switchtime = new_switchtime; - cpu_switch(p); + CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + cpu_switch(); + CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (switchtime.tv_sec == 0) microuptime(&switchtime); switchticks = ticks; + mtx_exit(&sched_lock, MTX_SPIN); + while (giantreleased--) + mtx_enter(&Giant, MTX_DEF); + WITNESS_RESTORE(&Giant, Giant); splx(x); } @@ -874,10 +958,12 @@ setrunnable(p) register int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); switch (p->p_stat) { case 0: case SRUN: case SZOMB: + case SWAIT: default: panic("setrunnable"); case SSTOP: @@ -891,6 +977,7 @@ setrunnable(p) p->p_stat = SRUN; if (p->p_flag & P_INMEM) setrunqueue(p); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); if (p->p_slptime > 1) updatepri(p); diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index b8d58334118a..1128c2ed5c6c 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -24,7 +24,7 @@ * Number of timecounters used to implement stable storage */ #ifndef NTIMECOUNTER -#define NTIMECOUNTER 5 +#define NTIMECOUNTER 45 #endif static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", @@ -148,6 +148,13 @@ nanotime(struct timespec *ts) nnanotime++; tc = timecounter; +#ifdef KTR + if (tc == NULL) { /* called before initialization */ + ts->tv_sec = 0; + ts->tv_nsec = 0; + return; + } +#endif ts->tv_sec = tc->tc_offset_sec; count = tco_delta(tc); delta = tc->tc_offset_nano; diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c index 3531e2c6d361..ba2b4bf634ec 100644 --- a/sys/kern/kern_threads.c +++ b/sys/kern/kern_threads.c @@ -52,10 +52,13 @@ #include #include #include +#include #include #include #include +#include + /* * Low level support for sleep/wakeup paradigm * If a timeout is specified: @@ -145,10 +148,12 @@ yield(struct proc *p, struct yield_args *uap) { p->p_retval[0] = 0; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); p->p_priority = MAXPRI; setrunqueue(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); return(0); diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 3794ccf5d930..a98915286604 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -110,7 +110,8 @@ uprintf(const char *fmt, ...) 
struct putchar_arg pca; int retval = 0; - if (p && p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + if (p && p != idleproc && p->p_flag & P_CONTROLT && + p->p_session->s_ttyvp) { va_start(ap, fmt); pca.tty = p->p_session->s_ttyp; pca.flags = TOTTY; diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c index 4fa5223acaf4..294c649b733b 100644 --- a/sys/kern/subr_prof.c +++ b/sys/kern/subr_prof.c @@ -93,6 +93,7 @@ kmstartup(dummy) int nullfunc_loop_profiled_time; uintfptr_t tmp_addr; #endif + int intrstate; /* * Round lowpc and highpc to multiples of the density we're using @@ -135,6 +136,7 @@ kmstartup(dummy) * Disable interrupts to avoid interference while we calibrate * things. */ + intrstate = save_intr(); disable_intr(); /* @@ -189,7 +191,7 @@ kmstartup(dummy) p->state = GMON_PROF_OFF; stopguprof(p); - enable_intr(); + restore_intr(intrstate); nullfunc_loop_profiled_time = 0; for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 61c5ecf73205..95b5759f9e66 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ mp_probe(void) /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,12 +1915,10 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ + + s_lock_init(&ap_boot_lock); } - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); - /* * start each AP in our list */ @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
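subr_smp.c above gates the application processors on ap_boot_lock: mp_enable() acquires it before starting the APs, each AP blocks on it at the top of ap_init(), and release_aps(), run from a SYSINIT at SI_SUB_SMP, releases it once the rest of the kernel is ready for the other CPUs. A user-space analogue of that boot gate using a pthread mutex (purely illustrative):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t ap_boot_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
ap_init(void *arg)
{
        /* APs park here until the BSP is ready to release them. */
        pthread_mutex_lock(&ap_boot_lock);
        pthread_mutex_unlock(&ap_boot_lock);      /* let the next AP through */
        printf("cpu%ld: released, entering scheduler\n", (long)arg);
        return (NULL);
}

int
main(void)
{
        pthread_t aps[2];
        long i;

        pthread_mutex_lock(&ap_boot_lock);        /* mp_enable() */
        for (i = 0; i < 2; i++)
                pthread_create(&aps[i], NULL, ap_init, (void *)i);
        sleep(1);                                 /* the rest of early boot */
        pthread_mutex_unlock(&ap_boot_lock);      /* release_aps() */
        for (i = 0; i < 2; i++)
                pthread_join(aps[i], NULL);
        return (0);
}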
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 51de1ac9e650..f32dfaeeddc0 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -49,10 +49,12 @@ #include "opt_trap.h" #include +#include #include #include #include #include +#include #include #include #include @@ -76,12 +78,14 @@ #include #include #include +#include #include #ifdef SMP #include #endif #include +#include #include #ifdef POWERFAIL_NMI @@ -96,11 +100,14 @@ #include "isa.h" #include "npx.h" +#include + int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall2 __P((struct trapframe frame)); +extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); @@ -142,7 +149,7 @@ static char *trap_msg[] = { }; static __inline int userret __P((struct proc *p, struct trapframe *frame, - u_quad_t oticks, int have_mplock)); + u_quad_t oticks, int have_giant)); #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; @@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); static __inline int -userret(p, frame, oticks, have_mplock) +userret(p, frame, oticks, have_giant) struct proc *p; struct trapframe *frame; u_quad_t oticks; - int have_mplock; + int have_giant; { int sig, s; while ((sig = CURSIG(p)) != 0) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } postsig(sig); } @@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock) * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; - } s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); - while ((sig = CURSIG(p)) != 0) + while ((sig = CURSIG(p)) != 0) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } postsig(sig); + } } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } addupc_task(p, frame->tf_eip, (u_int)(p->p_sticks - oticks) * psratio); } curpriority = p->p_priority; - return(have_mplock); + return(have_giant); } /* @@ -226,13 +236,20 @@ trap(frame) u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif - if (!(frame.tf_eflags & PSL_I)) { + atomic_add_int(&cnt.v_trap, 1); + + if ((frame.tf_eflags & PSL_I) == 0) { /* - * Buggy application or kernel code has disabled interrupts - * and then trapped. Enabling interrupts now is wrong, but - * it is better than running with interrupts disabled until - * they are accidentally enabled later. + * Buggy application or kernel code has disabled + * interrupts and then trapped. Enabling interrupts + * now is wrong, but it is better than running with + * interrupts disabled until they are accidentally + * enabled later. 
XXX Consider whether is this still + * correct. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) @@ -252,54 +269,27 @@ trap(frame) eva = 0; if (frame.tf_trapno == T_PAGEFLT) { /* - * For some Cyrix CPUs, %cr2 is clobbered by interrupts. - * This problem is worked around by using an interrupt - * gate for the pagefault handler. We are finally ready - * to read %cr2 and then must reenable interrupts. - * - * XXX this should be in the switch statement, but the - * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the - * flow of control too much for this to be obviously - * correct. + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. */ eva = rcr2(); enable_intr(); - } + } + + mtx_enter(&Giant, MTX_DEF); #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif + type = frame.tf_trapno; code = frame.tf_err; - if (in_vm86call) { - if (frame.tf_eflags & PSL_VM && - (type == T_PROTFLT || type == T_STKFLT)) { - i = vm86_emulate((struct vm86frame *)&frame); - if (i != 0) - /* - * returns to original process - */ - vm86_trap((struct vm86frame *)&frame); - return; - } - switch (type) { - /* - * these traps want either a process context, or - * assume a normal userspace trap. - */ - case T_PROTFLT: - case T_SEGNPFLT: - trap_fatal(&frame, eva); - return; - case T_TRCTRAP: - type = T_BPTFLT; /* kernel breakpoint */ - /* FALL THROUGH */ - } - goto kernel_trap; /* normal kernel trap handling */ - } - - if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + if ((ISPL(frame.tf_cs) == SEL_UPL) || + ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ sticks = p->p_sticks; @@ -322,16 +312,6 @@ trap(frame) i = SIGFPE; break; - case T_ASTFLT: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if (p->p_flag & P_OWEUPC) { - p->p_flag &= ~P_OWEUPC; - addupc_task(p, p->p_stats->p_prof.pr_addr, - p->p_stats->p_prof.pr_ticks); - } - goto out; - /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle @@ -342,7 +322,7 @@ trap(frame) if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) - goto out; + goto user; break; } /* FALL THROUGH */ @@ -357,14 +337,20 @@ trap(frame) case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); - if (i == -1) - return; #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if (i == -2) + if (i == -2) { + /* + * f00f hack workaround has triggered, treat + * as illegal instruction not page fault. 
+ */ + frame.tf_trapno = T_PRIVINFLT; goto restart; + } #endif - if (i == 0) + if (i == -1) goto out; + if (i == 0) + goto user; ucode = T_PAGEFLT; break; @@ -377,7 +363,15 @@ trap(frame) #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI - goto handle_powerfail; +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -391,7 +385,7 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; @@ -410,9 +404,9 @@ trap(frame) case T_DNA: #if NNPX > 0 - /* if a transparent fault (due to context switch "late") */ + /* transparent fault (due to context switch "late") */ if (npxdna()) - return; + goto out; #endif if (!pmath_emulate) { i = SIGFPE; @@ -422,7 +416,7 @@ trap(frame) i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) - return; + goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } @@ -435,13 +429,12 @@ trap(frame) break; } } else { -kernel_trap: /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); - return; + goto out; case T_DNA: #if NNPX > 0 @@ -451,31 +444,35 @@ trap(frame) * registered such use. */ if (npxdna()) - return; + goto out; #endif break; - case T_PROTFLT: /* general protection fault */ - case T_SEGNPFLT: /* segment not present fault */ /* - * Invalid segment selectors and out of bounds - * %eip's and %esp's can be set up in user mode. - * This causes a fault in kernel mode when the - * kernel tries to return to user mode. We want - * to get this fault so that we can fix the - * problem here and not have to check all the - * selectors and pointers when the user changes - * them. + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. */ -#define MAYBE_DORETI_FAULT(where, whereto) \ - do { \ - if (frame.tf_eip == (int)where) { \ - frame.tf_eip = (int)whereto; \ - return; \ - } \ - } while (0) + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (in_vm86call) + break; + + if (intr_nesting_level != 0) + break; - if (intr_nesting_level == 0) { /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the @@ -488,20 +485,38 @@ trap(frame) if (frame.tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; psignal(p, SIGBUS); - return; + goto out; + } + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; } - MAYBE_DORETI_FAULT(doreti_iret, - doreti_iret_fault); - MAYBE_DORETI_FAULT(doreti_popl_ds, - doreti_popl_ds_fault); - MAYBE_DORETI_FAULT(doreti_popl_es, - doreti_popl_es_fault); - MAYBE_DORETI_FAULT(doreti_popl_fs, - doreti_popl_fs_fault); + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; - return; - } + goto out; } break; @@ -517,7 +532,7 @@ trap(frame) */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; - return; + goto out; } break; @@ -529,7 +544,7 @@ trap(frame) * silently until the syscall handler has * saved the flags. */ - return; + goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* @@ -537,7 +552,7 @@ trap(frame) * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; - return; + goto out; } /* * Ignore debug register trace traps due to @@ -549,13 +564,13 @@ trap(frame) * in kernel space because that is useful when * debugging the kernel. */ - if (user_dbreg_trap()) { + if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); - return; + goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) @@ -567,28 +582,19 @@ trap(frame) */ #ifdef DDB if (kdb_trap (type, 0, &frame)) - return; + goto out; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI -#ifndef TIMER_FREQ -# define TIMER_FREQ 1193182 -#endif - handle_powerfail: - { - static unsigned lastalert = 0; - - if(time_second - lastalert > 10) - { + if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; - } - return; } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -602,16 +608,16 @@ trap(frame) kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi == 0) - return; + goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame, eva); - return; + goto out; } /* Translate fault for emulators (e.g. Linux) */ @@ -630,8 +636,10 @@ trap(frame) } #endif -out: +user: userret(p, &frame, sticks, 1); +out: + mtx_exit(&Giant, MTX_DEF); } #ifdef notyet @@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva) * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { - frame->tf_trapno = T_PRIVINFLT; + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; - } #endif if (usermode) goto nogo; @@ -869,8 +875,7 @@ trap_fatal(frame, eva) frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? 
"user" : "kernel"); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -917,26 +922,6 @@ trap_fatal(frame, eva) } else { printf("Idle\n"); } - printf("interrupt mask = "); - if ((cpl & net_imask) == net_imask) - printf("net "); - if ((cpl & tty_imask) == tty_imask) - printf("tty "); - if ((cpl & bio_imask) == bio_imask) - printf("bio "); - if ((cpl & cam_imask) == cam_imask) - printf("cam "); - if (cpl == 0) - printf("none"); -#ifdef SMP -/** - * XXX FIXME: - * we probably SHOULD have stopped the other CPUs before now! - * another CPU COULD have been touching cpl at this moment... - */ - printf(" <- SMP: XXX"); -#endif - printf("\n"); #ifdef KDB if (kdb_trap(&psl)) @@ -973,8 +958,7 @@ dblfault_handler() printf("esp = 0x%x\n", common_tss.tss_esp); printf("ebp = 0x%x\n", common_tss.tss_ebp); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -1048,12 +1032,14 @@ syscall2(frame) int error; int narg; int args[8]; - int have_mplock = 0; + int have_giant = 0; u_int code; + atomic_add_int(&cnt.v_syscall, 1); + #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { - get_mplock(); + mtx_enter(&Giant, MTX_DEF); panic("syscall"); /* NOT REACHED */ } @@ -1075,9 +1061,9 @@ syscall2(frame) /* * The prep code is not MP aware. */ - get_mplock(); + mtx_enter(&Giant, MTX_DEF); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. @@ -1114,8 +1100,8 @@ syscall2(frame) */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); @@ -1129,15 +1115,15 @@ syscall2(frame) * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsyscall(p->p_tracep, code, narg, args); } @@ -1192,9 +1178,9 @@ syscall2(frame) * Traced syscall. trapsignal() is not MP aware. 
*/ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); @@ -1203,13 +1189,13 @@ syscall2(frame) /* * Handle reschedule and other end-of-syscall issues */ - have_mplock = userret(p, &frame, sticks, have_mplock); + have_giant = userret(p, &frame, sticks, have_giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } @@ -1225,27 +1211,66 @@ syscall2(frame) /* * Release the MP lock if we had to get it */ - if (have_mplock) - rel_mplock(); + if (have_giant) + mtx_exit(&Giant, MTX_DEF); + + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + +void +ast(frame) + struct trapframe frame; +{ + struct proc *p = CURPROC; + u_quad_t sticks; + + /* + * handle atomicy by looping since interrupts are enabled and the + * MP lock is not held. + */ + sticks = ((volatile struct proc *)p)->p_sticks; + while (sticks != ((volatile struct proc *)p)->p_sticks) + sticks = ((volatile struct proc *)p)->p_sticks; + + astoff(); + atomic_add_int(&cnt.v_soft, 1); + if (p->p_flag & P_OWEUPC) { + mtx_enter(&Giant, MTX_DEF); + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); +} + if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0) + mtx_exit(&Giant, MTX_DEF); } /* * Simplified back end of syscall(), used when returning from fork() - * directly into user mode. MP lock is held on entry and should be - * held on return. + * directly into user mode. Giant is not held on entry, and must not + * be held on return. */ void fork_return(p, frame) struct proc *p; struct trapframe frame; { + int have_giant; + frame.tf_eax = 0; /* Child returns zero */ frame.tf_eflags &= ~PSL_C; /* success */ frame.tf_edx = 1; - userret(p, &frame, 0, 1); + have_giant = userret(p, &frame, 0, mtx_owned(&Giant)); #ifdef KTRACE - if (KTRPOINT(p, KTR_SYSRET)) + if (KTRPOINT(p, KTR_SYSRET)) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } ktrsysret(p->p_tracep, SYS_fork, 0, 0); + } #endif + if (have_giant) + mtx_exit(&Giant, MTX_DEF); } diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c new file mode 100644 index 000000000000..1ac3f584d9ef --- /dev/null +++ b/sys/kern/subr_turnstile.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. + */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock heirarchy violation occurs + * - locks are held when going to sleep. 
+ */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, 
line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on. 
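witness_enter() above keeps a per-process list of held mutexes and uses it to learn the order in which default mutexes are acquired; taking two such mutexes in opposite orders on different paths produces the "lock order reversal" report. A minimal sketch of usage that would trigger it, assuming two ordinary MTX_DEF mutexes a and b that have already been set up with mtx_init(); a and b are placeholders for illustration only:

	/* thread 1 */
	mtx_enter(&a, MTX_DEF);
	mtx_enter(&b, MTX_DEF);		/* witness records the order a -> b */
	mtx_exit(&b, MTX_DEF);
	mtx_exit(&a, MTX_DEF);

	/* thread 2, later */
	mtx_enter(&b, MTX_DEF);
	mtx_enter(&a, MTX_DEF);		/* reversal: b -> a is reported here */
	mtx_exit(&a, MTX_DEF);
	mtx_exit(&b, MTX_DEF);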
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no anscestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutex which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep!= NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? 
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c new file mode 100644 index 000000000000..1ac3f584d9ef --- /dev/null +++ b/sys/kern/subr_witness.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. + */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock heirarchy violation occurs + * - locks are held when going to sleep. 
+ */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, 
line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on. 
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no anscestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutex which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep!= NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? 
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 29b62881145b..87fb9806c81c 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -2266,7 +2266,8 @@ ttyinfo(tp) tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "%d%% %ldk\n", tmp / 100, - pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : + pick->p_stat == SIDL || pick->p_stat == SWAIT || + pick->p_stat == SZOMB ? 
0 : (long)pgtok(vmspace_resident_count(pick->p_vmspace))); } tp->t_rocount = 0; /* so pending input will be retyped if BS */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 99c0754ea8e4..34cff175b983 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include + static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ @@ -461,7 +464,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_iocmd = BIO_READ; @@ -498,7 +501,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; @@ -519,7 +522,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; @@ -640,7 +643,7 @@ bwrite(struct buf * bp) bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) @@ -1420,7 +1423,8 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) int isspecial; static int flushingbufs; - if (curproc && (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) + if (curproc != idleproc && + (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) isspecial = 0; else isspecial = 1; @@ -1745,6 +1749,8 @@ buf_daemon() { int s; + mtx_enter(&Giant, MTX_DEF); + /* * This process needs to be suspended prior to shutdown sync. */ @@ -2070,9 +2076,9 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) * move it into the else, when gbincore() fails. At the moment * it isn't a problem. 
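The vfs_bio.c hunks above replace the old "no process context" tests: with a per-CPU idle process always installed as curproc, curproc is never NULL, so the checks compare against idleproc instead. In sketch form, mirroring the bread()/breadn()/bwrite() changes and introducing no new names:

	if (curproc != idleproc)
		curproc->p_stats->p_ru.ru_inblock++;	/* charge the I/O to a real process only */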
*/ - if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) { + if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) { if (numfreebuffers == 0) { - if (!curproc) + if (curproc == idleproc) return NULL; needsbuffer |= VFS_BIO_NEED_ANY; tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 3e4b17f2373c..52ad0ef2434f 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ #include #include +#include #include #include @@ -960,6 +962,8 @@ sched_sync(void) int s; struct proc *p = updateproc; + mtx_enter(&Giant, MTX_DEF); + EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p, SHUTDOWN_PRI_LAST); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 3e4b17f2373c..52ad0ef2434f 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ #include #include +#include #include #include @@ -960,6 +962,8 @@ sched_sync(void) int s; struct proc *p = updateproc; + mtx_enter(&Giant, MTX_DEF); + EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p, SHUTDOWN_PRI_LAST); diff --git a/sys/modules/if_ppp/Makefile b/sys/modules/if_ppp/Makefile index 3e6c9a54e5da..c165a4955218 100644 --- a/sys/modules/if_ppp/Makefile +++ b/sys/modules/if_ppp/Makefile @@ -2,8 +2,8 @@ .PATH: ${.CURDIR}/../../net KMOD= if_ppp -SRCS= if_ppp.c ppp_tty.c slcompress.c \ - ppp.h opt_inet.h opt_ipx.h opt_ppp.h vnode_if.h +SRCS= bus_if.h device_if.h if_ppp.c ppp_tty.c slcompress.c \ + ppp.h opt_bus.h opt_inet.h opt_ipx.h opt_ppp.h vnode_if.h NOMAN= NPPP?= 2 diff --git a/sys/modules/netgraph/tty/Makefile b/sys/modules/netgraph/tty/Makefile index 824b08285bf1..3ee6198eff83 100644 --- a/sys/modules/netgraph/tty/Makefile +++ b/sys/modules/netgraph/tty/Makefile @@ -2,7 +2,8 @@ # $Whistle: Makefile,v 1.2 1999/01/19 19:39:22 archie Exp $ KMOD= ng_tty -SRCS= ng_tty.c +SRCS= ng_tty.c device_if.h bus_if.h pci_if.h +MFILES= kern/device_if.m kern/bus_if.m pci/agp_if.m pci/pci_if.m NOMAN= .include diff --git a/sys/net/ppp_tty.c b/sys/net/ppp_tty.c index 2c4a1cfa9628..906de00194a8 100644 --- a/sys/net/ppp_tty.c +++ b/sys/net/ppp_tty.c @@ -89,6 +89,8 @@ #include #ifdef __i386__ +#include +#include #include #endif @@ -160,27 +162,6 @@ void pppasyncattach(dummy) void *dummy; { -#ifdef __i386__ - int s; - - s = splhigh(); - - /* - * Make sure that the soft net "engine" cannot run while spltty code is - * active. The if_ppp.c code can walk down into b_to_q etc, and it is - * bad if the tty system was in the middle of another b_to_q... 
- */ - tty_imask |= softnet_imask; /* spltty() block spl[soft]net() */ - net_imask |= softtty_imask; /* splimp() block splsofttty() */ - net_imask |= tty_imask; /* splimp() block spltty() */ - update_intr_masks(); - - splx(s); - if ( bootverbose ) - printf("new masks: bio %x, tty %x, net %x\n", - bio_imask, tty_imask, net_imask); -#endif - /* register line discipline */ linesw[PPPDISC] = pppdisc; } diff --git a/sys/netgraph/ng_tty.c b/sys/netgraph/ng_tty.c index ef2cc5d6ff96..70f9fb33a9b2 100644 --- a/sys/netgraph/ng_tty.c +++ b/sys/netgraph/ng_tty.c @@ -77,6 +77,7 @@ #include #ifdef __i386__ /* fiddle with the spl locking */ +#include #include #include #endif @@ -660,19 +661,6 @@ ngt_mod_event(module_t mod, int event, void *data) switch (event) { case MOD_LOAD: -#ifdef __i386__ - /* Insure the soft net "engine" can't run during spltty code */ - s = splhigh(); - tty_imask |= softnet_imask; /* spltty() block spl[soft]net() */ - net_imask |= softtty_imask; /* splimp() block splsofttty() */ - net_imask |= tty_imask; /* splimp() block spltty() */ - update_intr_masks(); - splx(s); - - if (bootverbose) - log(LOG_DEBUG, "new masks: bio %x, tty %x, net %x\n", - bio_imask, tty_imask, net_imask); -#endif /* Register line discipline */ s = spltty(); diff --git a/sys/nfs/nfs_srvcache.c b/sys/nfs/nfs_srvcache.c index 9eb168f6086e..6c4af8eeab36 100644 --- a/sys/nfs/nfs_srvcache.c +++ b/sys/nfs/nfs_srvcache.c @@ -44,6 +44,7 @@ */ #include #include +#include #include #include #include diff --git a/sys/nfsserver/nfs_srvcache.c b/sys/nfsserver/nfs_srvcache.c index 9eb168f6086e..6c4af8eeab36 100644 --- a/sys/nfsserver/nfs_srvcache.c +++ b/sys/nfsserver/nfs_srvcache.c @@ -44,6 +44,7 @@ */ #include #include +#include #include #include #include diff --git a/sys/pci/pci_compat.c b/sys/pci/pci_compat.c index bf833b16abb6..2e7eba56df54 100644 --- a/sys/pci/pci_compat.c +++ b/sys/pci/pci_compat.c @@ -54,6 +54,8 @@ #endif #ifdef __i386__ +#include +#include #include #endif @@ -141,7 +143,7 @@ pci_map_int_right(pcici_t cfg, pci_inthand_t *handler, void *arg, #ifdef INTR_FAST if (intflags & INTR_FAST) - flags |= INTR_TYPE_FAST; + flags |= INTR_FAST; if (intflags & INTR_EXCL) resflags &= ~RF_SHAREABLE; #endif diff --git a/sys/powerpc/aim/vm_machdep.c b/sys/powerpc/aim/vm_machdep.c index 8baea02b8494..3831d67658c6 100644 --- a/sys/powerpc/aim/vm_machdep.c +++ b/sys/powerpc/aim/vm_machdep.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -246,8 +247,10 @@ cpu_exit(p) alpha_fpstate_drop(p); (void) splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -358,7 +361,7 @@ vunmapbuf(bp) } /* - * Force reset the processor by invalidating the entire address space! + * Reset back to firmware. */ void cpu_reset() @@ -416,7 +419,7 @@ vm_page_zero_idle() return(0); #ifdef SMP - if (try_mplock()) { + if (KLOCK_ENTER(M_TRY)) { #endif s = splvm(); m = vm_page_list_find(PQ_FREE, free_rover, FALSE); @@ -447,7 +450,7 @@ vm_page_zero_idle() free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); #ifdef SMP - rel_mplock(); + KLOCK_EXIT; #endif return (1); #ifdef SMP diff --git a/sys/powerpc/include/globaldata.h b/sys/powerpc/include/globaldata.h new file mode 100644 index 000000000000..b246bb1fb707 --- /dev/null +++ b/sys/powerpc/include/globaldata.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#ifdef _KERNEL + +#include + +/* + * This structure maps out the global data that needs to be kept on a + * per-cpu basis. genassym uses this to generate offsets for the assembler + * code, which also provides external symbols so that C can get at them as + * though they were really globals. This structure is pointed to by + * the per-cpu system value (see alpha_pal_rdval() and alpha_pal_wrval()). + * Inside the kernel, the globally reserved register t7 is used to + * point at the globaldata structure. + */ +struct globaldata { + struct alpha_pcb gd_idlepcb; /* pcb for idling */ + struct proc *gd_curproc; /* current process */ + struct proc *gd_idleproc; /* idle process */ + struct proc *gd_fpcurproc; /* fp state owner */ + struct pcb *gd_curpcb; /* current pcb */ + struct timeval gd_switchtime; + int gd_switchticks; + u_int gd_cpuno; /* this cpu number */ + u_int gd_other_cpus; /* all other cpus */ + int gd_inside_intr; + u_int64_t gd_idlepcbphys; /* pa of gd_idlepcb */ + u_int64_t gd_pending_ipis; /* pending IPI events */ + u_int32_t gd_next_asn; /* next ASN to allocate */ + u_int32_t gd_current_asngen; /* ASN rollover check */ + u_int32_t gd_intr_nesting_level; /* interrupt recursion */ + + u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; +#ifdef KTR_PERCPU + volatile int gd_ktr_idx; /* Index into trace table */ + char *gd_ktr_buf; + char gd_ktr_buf_data[0]; +#endif +}; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + +void globaldata_init(struct globaldata *pcpu, int cpuno, size_t sz); +struct globaldata *globaldata_find(int cpuno); + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALDATA_H_ */ diff --git a/sys/powerpc/include/globals.h b/sys/powerpc/include/globals.h new file mode 100644 index 000000000000..303efdfe9f6a --- /dev/null +++ b/sys/powerpc/include/globals.h @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALS_H_ +#define _MACHINE_GLOBALS_H_ + +#ifdef _KERNEL + +register struct globaldata *globalp __asm__("$8"); + +#if 1 +#define GLOBALP globalp +#else +#define GLOBALP ((struct globaldata *) alpha_pal_rdval()) +#endif + +#define PCPU_GET(name) (GLOBALP->gd_##name) +#define PCPU_SET(name,value) (GLOBALP->gd_##name = (value)) + +/* + * The following set of macros works for UP kernel as well, but for maximum + * performance we allow the global variables to be accessed directly. On the + * other hand, kernel modules should always use these macros to maintain + * portability between UP and SMP kernels. + */ +#define CURPROC PCPU_GET(curproc) +#define curproc PCPU_GET(curproc) +#define idleproc PCPU_GET(idleproc) +#define curpcb PCPU_GET(curpcb) +#define fpcurproc PCPU_GET(fpcurproc) +#define switchtime PCPU_GET(switchtime) +#define switchticks PCPU_GET(switchticks) +#define cpuid PCPU_GET(cpuno) +#define prevproc PCPU_GET(curproc) /* XXX - until ithreads */ + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALS_H_ */ diff --git a/sys/powerpc/include/mutex.h b/sys/powerpc/include/mutex.h new file mode 100644 index 000000000000..ac13b8cbde0e --- /dev/null +++ b/sys/powerpc/include/mutex.h @@ -0,0 +1,563 @@ +/*- + * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp Exp $ + * $FreeBSD$ + */ + + +#ifndef _MACHINE_MUTEX_H_ +#define _MACHINE_MUTEX_H_ + +#ifndef LOCORE + +#include +#include +#include +#include + +/* + * Mutex flags + * + * Types + */ +#define MTX_DEF 0x1 /* Default (spin/sleep) */ +#define MTX_SPIN 0x2 /* Spin only lock */ + +/* Options */ +#define MTX_RLIKELY 0x4 /* (opt) Recursion likely */ +#define MTX_NORECURSE 0x8 /* No recursion possible */ +#define MTX_NOSPIN 0x10 /* Don't spin before sleeping */ +#define MTX_NOSWITCH 0x20 /* Do not switch on release */ +#define MTX_FIRST 0x40 /* First spin lock holder */ +#define MTX_TOPHALF 0x80 /* Interrupts not disabled on spin */ + +/* options that should be passed on to mtx_enter_hard, mtx_exit_hard */ +#define MTX_HARDOPTS (MTX_DEF | MTX_SPIN | MTX_FIRST | MTX_TOPHALF | MTX_NOSWITCH) + +/* Flags/value used in mtx_lock */ +#define MTX_RECURSE 0x01 /* (non-spin) lock held recursively */ +#define MTX_CONTESTED 0x02 /* (non-spin) lock contested */ +#define MTX_FLAGMASK ~(MTX_RECURSE | MTX_CONTESTED) +#define MTX_UNOWNED 0x8 /* Cookie for free mutex */ + +struct proc; /* XXX */ + +/* + * Sleep/spin mutex + */ +struct mtx { + volatile u_int64_t mtx_lock; /* lock owner/gate/flags */ + volatile u_int32_t mtx_recurse; /* number of recursive holds */ + u_int32_t mtx_saveipl; /* saved ipl (for spin locks) */ + char *mtx_description; + TAILQ_HEAD(, proc) mtx_blocked; + LIST_ENTRY(mtx) mtx_contested; + struct mtx *mtx_next; /* all locks in system */ + struct mtx *mtx_prev; +#ifdef SMP_DEBUG + /* If you add anything here, adjust the mtxf_t definition below */ + struct witness *mtx_witness; + LIST_ENTRY(mtx) mtx_held; + char *mtx_file; + int mtx_line; +#endif /* SMP_DEBUG */ +}; + +typedef struct mtx mtx_t; + +/* + * Filler for structs which need to remain the same size + * whether or not SMP_DEBUG is turned on. + */ +typedef struct mtxf { +#ifdef SMP_DEBUG + char mtxf_data[0]; +#else + char mtxf_data[4*sizeof(void *) + sizeof(int)]; +#endif +} mtxf_t; + +#define mp_fixme(string) + +#ifdef _KERNEL +/* Misc */ +#define CURTHD ((u_int64_t)CURPROC) /* Current thread ID */ + +/* Prototypes */ +void mtx_init(mtx_t *m, char *description, int flag); +void mtx_enter_hard(mtx_t *, int type, int ipl); +void mtx_exit_hard(mtx_t *, int type); +void mtx_destroy(mtx_t *m); + +/* Global locks */ +extern mtx_t sched_lock; +extern mtx_t Giant; + +/* + * Used to replace return with an exit Giant and return. 
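+ *
+ * Editorial sketch, not part of this change (the error value and sleep
+ * channel below are made up): EGAR/VEGAR stand in for a plain return in
+ * code that already holds Giant, while DROP_GIANT/PICKUP_GIANT bracket a
+ * region that must give Giant up, e.g. around a sleep:
+ *
+ *	mtx_enter(&Giant, MTX_DEF);
+ *	if (error)
+ *		EGAR(EINVAL);		-- mtx_exit(&Giant) + return (EINVAL)
+ *	DROP_GIANT();			-- recursively release Giant
+ *	tsleep(&chan, PZERO, "xmpl", 0);
+ *	PICKUP_GIANT();			-- re-acquire what was dropped
+ *	mtx_exit(&Giant, MTX_DEF);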
+ */ + +#define EGAR(a) \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return (a); \ +} while (0) + +#define VEGAR \ +do { \ + mtx_exit(&Giant, MTX_DEF); \ + return; \ +} while (0) + +#define DROP_GIANT() \ +do { \ + int _giantcnt; \ + WITNESS_SAVE_DECL(Giant); \ + \ + WITNESS_SAVE(&Giant, Giant); \ + for (_giantcnt = 0; mtx_owned(&Giant); _giantcnt++) \ + mtx_exit(&Giant, MTX_DEF) + +#define PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant); \ +} while (0) + +#define PARTIAL_PICKUP_GIANT() \ + mtx_assert(&Giant, MA_NOTOWNED); \ + while (_giantcnt--) \ + mtx_enter(&Giant, MTX_DEF); \ + WITNESS_RESTORE(&Giant, Giant) + + +/* + * Debugging + */ +#ifndef SMP_DEBUG +#define mtx_assert(m, what) +#else /* SMP_DEBUG */ + +#define MA_OWNED 1 +#define MA_NOTOWNED 2 +#define mtx_assert(m, what) { \ + switch ((what)) { \ + case MA_OWNED: \ + ASS(mtx_owned((m))); \ + break; \ + case MA_NOTOWNED: \ + ASS(!mtx_owned((m))); \ + break; \ + default: \ + panic("unknown mtx_assert at %s:%d", __FILE__, __LINE__); \ + } \ +} + +#ifdef INVARIANTS +#define ASS(ex) MPASS(ex) +#define MPASS(ex) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + #ex, __FILE__, __LINE__) +#define MPASS2(ex, what) if (!(ex)) panic("Assertion %s failed at %s:%d", \ + what, __FILE__, __LINE__) + +#ifdef MTX_STRS +char STR_IEN[] = "fl & 0x200"; +char STR_IDIS[] = "!(fl & 0x200)"; +#else /* MTX_STRS */ +extern char STR_IEN[]; +extern char STR_IDIS[]; +#endif /* MTX_STRS */ +#define ASS_IEN MPASS2((alpha_pal_rdps & ALPHA_PSL_IPL_MASK) + == ALPHA_PSL_IPL_HIGH, STR_IEN) +#define ASS_IDIS MPASS2((alpha_pal_rdps & ALPHA_PSL_IPL_MASK) + != ALPHA_PSL_IPL_HIGH, STR_IDIS) +#endif /* INVARIANTS */ + +#endif /* SMP_DEBUG */ + +#if !defined(SMP_DEBUG) || !defined(INVARIANTS) +#define ASS(ex) +#define MPASS(ex) +#define MPASS2(ex, where) +#define ASS_IEN +#define ASS_IDIS +#endif /* !defined(SMP_DEBUG) || !defined(INVARIANTS) */ + +#ifdef WITNESS +#ifndef SMP_DEBUG +#error WITNESS requires SMP_DEBUG +#endif /* SMP_DEBUG */ +#define WITNESS_ENTER(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_enter((m), (f), __FILE__, __LINE__) +#define WITNESS_EXIT(m, f) \ + if ((m)->mtx_witness != NULL) \ + witness_exit((m), (f), __FILE__, __LINE__) + +#define WITNESS_SLEEP(check, m) witness_sleep(check, (m), __FILE__, __LINE__) +#define WITNESS_SAVE_DECL(n) \ + char * __CONCAT(n, __wf); \ + int __CONCAT(n, __wl) + +#define WITNESS_SAVE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_save(m, &__CONCAT(n, __wf), &__CONCAT(n, __wl)); \ +} while (0) + +#define WITNESS_RESTORE(m, n) \ +do { \ + if ((m)->mtx_witness != NULL) \ + witness_restore(m, __CONCAT(n, __wf), __CONCAT(n, __wl)); \ +} while (0) + +void witness_init(mtx_t *, int flag); +void witness_destroy(mtx_t *); +void witness_enter(mtx_t *, int, char *, int); +void witness_try_enter(mtx_t *, int, char *, int); +void witness_exit(mtx_t *, int, char *, int); +void witness_display(void(*)(const char *fmt, ...)); +void witness_list(struct proc *); +int witness_sleep(int, mtx_t *, char *, int); +void witness_save(mtx_t *, char **, int *); +void witness_restore(mtx_t *, char *, int); +#else /* WITNESS */ +#define WITNESS_ENTER(m, flag) +#define WITNESS_EXIT(m, flag) +#define WITNESS_SLEEP(check, m) +#define WITNESS_SAVE_DECL(n) +#define WITNESS_SAVE(m, n) +#define WITNESS_RESTORE(m, n) + +/* + * flag++ is slezoid way of shutting up unused parameter warning + * in mtx_init() + */ +#define witness_init(m, flag) 
flag++ +#define witness_destroy(m) +#define witness_enter(m, flag, f, l) +#define witness_try_enter(m, flag, f, l ) +#define witness_exit(m, flag, f, l) +#endif /* WITNESS */ + +/* + * Assembly macros (for internal use only) + *-------------------------------------------------------------------------- + */ + +/* + * Get a sleep lock, deal with recursion inline + */ + +#define _V(x) __STRING(x) + +#define _getlock_sleep(mp, tid, type) do { \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) { \ + if (((mp)->mtx_lock & MTX_FLAGMASK) != (tid)) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, 0); \ + else { \ + if (((mp)->mtx_lock & MTX_RECURSE) == 0) \ + atomic_set_64(&(mp)->mtx_lock, MTX_RECURSE); \ + (mp)->mtx_recurse++; \ + } \ + } else { \ + alpha_mb(); \ + } \ +} while (0) + +/* + * Get a spin lock, handle recursion inline (as the less common case) + */ + +#define _getlock_spin_block(mp, tid, type) do { \ + u_int _ipl = alpha_pal_rdps() & ALPHA_PSL_IPL_MASK; \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard(mp, (type) & MTX_HARDOPTS, _ipl); \ + else { \ + alpha_mb(); \ + (mp)->mtx_saveipl = _ipl; \ + } \ +} while (0) + +/* + * Get a lock without any recursion handling. Calls the hard enter + * function if we can't get it inline. + */ + +#define _getlock_norecurse(mp, tid, type) do { \ + if (atomic_cmpset_64(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) == 0) \ + mtx_enter_hard((mp), (type) & MTX_HARDOPTS, 0); \ + else \ + alpha_mb(); \ +} while (0) + +/* + * Release a sleep lock assuming we haven't recursed on it, recursion is + * handled in the hard function. + */ + +#define _exitlock_norecurse(mp, tid, type) do { \ + alpha_mb(); \ + if (atomic_cmpset_64(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ +} while (0) + +/* + * Release a sleep lock when it's likely we recursed (the code to + * deal with simple recursion is inline).
+ */ + +#define _exitlock(mp, tid, type) do { \ + alpha_mb(); \ + if (atomic_cmpset_64(&(mp)->mtx_lock, (tid), MTX_UNOWNED) == 0) {\ + if (((mp)->mtx_lock & MTX_RECURSE) && \ + (--(mp)->mtx_recurse == 0)) \ + atomic_clear_64(&(mp)->mtx_lock, MTX_RECURSE); \ + else \ + mtx_exit_hard((mp), (type) & MTX_HARDOPTS); \ + } \ +} while (0) + +/* + * Release a spin lock (with possible recursion) + */ + +#define _exitlock_spin(mp) do { \ + int _ipl = (mp)->mtx_saveipl; \ + alpha_mb(); \ + if ((mp)->mtx_recurse == 0 || (--(mp)->mtx_recurse) == 0) \ + atomic_cmpset_64(&(mp)->mtx_lock, (mp)->mtx_lock, \ + MTX_UNOWNED); \ + alpha_pal_swpipl(_ipl); \ +} while (0) + +/* + * Externally visible mutex functions + *------------------------------------------------------------------------ + */ + +/* + * Return non-zero if a mutex is already owned by the current thread + */ +#define mtx_owned(m) (((m)->mtx_lock & MTX_FLAGMASK) == CURTHD) + +/* Common strings */ +#ifdef MTX_STRS +char STR_mtx_enter_fmt[] = "GOT %s [%p] at %s:%d r=%d"; +char STR_mtx_bad_type[] = "((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0"; +char STR_mtx_exit_fmt[] = "REL %s [%p] at %s:%d r=%d"; +char STR_mtx_owned[] = "mtx_owned(_mpp)"; +char STR_mtx_recurse[] = "_mpp->mtx_recurse == 0"; +char STR_mtx_try_enter_fmt[] = "TRY_ENTER %s [%p] at %s:%d result=%d"; +#else /* MTX_STRS */ +extern char STR_mtx_enter_fmt[]; +extern char STR_mtx_bad_type[]; +extern char STR_mtx_exit_fmt[]; +extern char STR_mtx_owned[]; +extern char STR_mtx_recurse[]; +extern char STR_mtx_try_enter_fmt[]; +#endif /* MTX_STRS */ + +/* + * Get lock 'm', the macro handles the easy (and most common cases) and + * leaves the slow stuff to the mtx_enter_hard() function. + * + * Note: since type is usually a constant much of this code is optimized out + */ +#define mtx_enter(mtxp, type) do { \ + mtx_t * _mpp = mtxp; \ + \ + /* bits only valid on mtx_exit() */ \ + MPASS2(((type) & (MTX_NORECURSE | MTX_NOSWITCH)) == 0, STR_mtx_bad_type); \ + \ + do { \ + if ((type) & MTX_SPIN) { \ + /* \ + * Easy cases of spin locks: \ + * \ + * 1) We already own the lock and will simply \ + * recurse on it (if RLIKELY) \ + * \ + * 2) The lock is free, we just get it \ + */ \ + if ((type) & MTX_RLIKELY) { \ + /* \ + * Check for recursion, if we already \ + * have this lock we just bump the \ + * recursion count. \ + */ \ + if (_mpp->mtx_lock == CURTHD) { \ + _mpp->mtx_recurse++; \ + break; /* Done */ \ + } \ + } \ + \ + if (((type) & MTX_TOPHALF) == 0) \ + /* \ + * If an interrupt thread uses this \ + * we must block interrupts here. 
\ + */ \ + _getlock_spin_block(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _getlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } else { \ + /* Sleep locks */ \ + if ((type) & MTX_RLIKELY) \ + _getlock_sleep(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _getlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } \ + } while (0); \ + WITNESS_ENTER(_mpp, type); \ + CTR5(KTR_LOCK, STR_mtx_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + (_mpp)->mtx_recurse); \ +} while (0) + +/* + * Attempt to get MTX_DEF lock, return non-zero if lock acquired + * + * XXX DOES NOT HANDLE RECURSION + */ +#ifdef SMP_DEBUG +#define mtx_try_enter(mtxp, type) ({ \ + mtx_t *const _mpp = mtxp; \ + int _rval; \ + \ + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD);\ + if (_rval && (_mpp)->mtx_witness != NULL) { \ + ASS((_mpp)->mtx_recurse == 0); \ + witness_try_enter(_mpp, type, __FILE__, __LINE__); \ + } \ + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + _rval); \ + _rval; \ +}) + +#else /* SMP_DEBUG */ + +#define mtx_try_enter(mtxp, type) ({ \ + mtx_t *const _mpp = mtxp; \ + int _rval; \ + \ + _rval = atomic_cmpset_int(&_mpp->mtx_lock, MTX_UNOWNED, CURTHD);\ + CTR5(KTR_LOCK, STR_mtx_try_enter_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + _rval); \ + _rval; \ +}) + +#endif /* SMP_DEBUG */ + +#if 0 +#define mtx_legal2block() ({ \ + register int _l2b; \ + __asm __volatile ( \ +" pushfl;" \ +" popl %%eax;" \ +" andl $0x200, %%eax;" \ + : "=a" (_l2b) \ + : \ + : "cc"); \ + _l2b; \ +}) +#endif + +#define mtx_legal2block() (read_eflags() & 0x200) + +/* + * Release lock m + */ +#define mtx_exit(mtxp, type) do { \ + mtx_t *const _mpp = mtxp; \ + \ + MPASS2(mtx_owned(_mpp), STR_mtx_owned); \ + WITNESS_EXIT(_mpp, type); \ + CTR5(KTR_LOCK, STR_mtx_exit_fmt, \ + (_mpp)->mtx_description, (_mpp), __FILE__, __LINE__, \ + (_mpp)->mtx_recurse); \ + if ((type) & MTX_SPIN) { \ + if ((type) & MTX_NORECURSE) { \ + MPASS2(_mpp->mtx_recurse == 0, STR_mtx_recurse); \ + atomic_cmpset_64(&_mpp->mtx_lock, _mpp->mtx_lock, \ + MTX_UNOWNED); \ + if (((type) & MTX_TOPHALF) == 0) { \ + splx(_mpp->mtx_saveipl); \ + } \ + } else \ + if ((type) & MTX_TOPHALF) \ + _exitlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + else \ + _exitlock_spin(_mpp); \ + } else { \ + /* Handle sleep locks */ \ + if ((type) & MTX_RLIKELY) \ + _exitlock(_mpp, CURTHD, (type) & MTX_HARDOPTS); \ + else \ + _exitlock_norecurse(_mpp, CURTHD, \ + (type) & MTX_HARDOPTS); \ + } \ +} while (0) +#endif /* _KERNEL */ + +#else /* !LOCORE */ + +/* + * Simple assembly macros to get and release non-recursive spin locks + */ +#define MTX_ENTER(lck) \ + call_pal PAL_OSF1_rdps; \ + and v0, ALPHA_PSL_IPL_MASK, v0; \ +1: ldq_l a0, lck+MTX_LOCK; \ + cmpeq a0, MTX_UNOWNED, a1; \ + beq a1, 1b; \ + ldq a0, PC_CURPROC(globalp); \ + stq_c a0, lck+MTX_LOCK; \ + beq a0, 1b; \ + mb; \ + stl v0, lck+MTX_SAVEIPL; \ + ldq a0, ALPHA_PSL_IPL_HIGH; \ + call_pal PSL_OSF1_swpipl + +#define MTX_EXIT(lck) \ + mb; \ + ldiq a0, MTX_UNOWNED; \ + stq a0, lck+MTX_LOCK; \ + ldl a0, lck+MTX_SAVEIPL; \ + call_pal PAL_OSF1_swpipl + +#endif /* !LOCORE */ + +#endif /* __MACHINE_MUTEX_H */ diff --git a/sys/powerpc/include/pcpu.h b/sys/powerpc/include/pcpu.h new file mode 100644 index 000000000000..b246bb1fb707 --- /dev/null +++ b/sys/powerpc/include/pcpu.h @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1999 Luoqi Chen + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_GLOBALDATA_H_ +#define _MACHINE_GLOBALDATA_H_ + +#ifdef _KERNEL + +#include + +/* + * This structure maps out the global data that needs to be kept on a + * per-cpu basis. genassym uses this to generate offsets for the assembler + * code, which also provides external symbols so that C can get at them as + * though they were really globals. This structure is pointed to by + * the per-cpu system value (see alpha_pal_rdval() and alpha_pal_wrval()). + * Inside the kernel, the globally reserved register t7 is used to + * point at the globaldata structure. 
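+ *
+ * Editorial sketch, not part of this change: fields here are normally
+ * reached through the PCPU_GET()/PCPU_SET() accessors and the curproc,
+ * idleproc and curpcb convenience macros from <machine/globals.h> rather
+ * than by dereferencing the structure directly, e.g.
+ *
+ *	struct proc *p = PCPU_GET(curproc);	-- GLOBALP->gd_curproc
+ *	PCPU_SET(astpending, 1);		-- GLOBALP->gd_astpending = 1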
+ */ +struct globaldata { + struct alpha_pcb gd_idlepcb; /* pcb for idling */ + struct proc *gd_curproc; /* current process */ + struct proc *gd_idleproc; /* idle process */ + struct proc *gd_fpcurproc; /* fp state owner */ + struct pcb *gd_curpcb; /* current pcb */ + struct timeval gd_switchtime; + int gd_switchticks; + u_int gd_cpuno; /* this cpu number */ + u_int gd_other_cpus; /* all other cpus */ + int gd_inside_intr; + u_int64_t gd_idlepcbphys; /* pa of gd_idlepcb */ + u_int64_t gd_pending_ipis; /* pending IPI events */ + u_int32_t gd_next_asn; /* next ASN to allocate */ + u_int32_t gd_current_asngen; /* ASN rollover check */ + u_int32_t gd_intr_nesting_level; /* interrupt recursion */ + + u_int gd_astpending; + SLIST_ENTRY(globaldata) gd_allcpu; +#ifdef KTR_PERCPU + volatile int gd_ktr_idx; /* Index into trace table */ + char *gd_ktr_buf; + char gd_ktr_buf_data[0]; +#endif +}; + +SLIST_HEAD(cpuhead, globaldata); +extern struct cpuhead cpuhead; + +void globaldata_init(struct globaldata *pcpu, int cpuno, size_t sz); +struct globaldata *globaldata_find(int cpuno); + +#endif /* _KERNEL */ + +#endif /* !_MACHINE_GLOBALDATA_H_ */ diff --git a/sys/powerpc/powerpc/genassym.c b/sys/powerpc/powerpc/genassym.c index a67f2d11275c..066d87b4fee3 100644 --- a/sys/powerpc/powerpc/genassym.c +++ b/sys/powerpc/powerpc/genassym.c @@ -51,8 +51,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -66,6 +69,21 @@ #include #include +#include "opt_smp.h" + +ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); +ASSYM(GD_FPCURPROC, offsetof(struct globaldata, gd_fpcurproc)); +ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); +ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); +ASSYM(GD_CPUNO, offsetof(struct globaldata, gd_cpuno)); +ASSYM(GD_IDLEPCBPHYS, offsetof(struct globaldata, gd_idlepcbphys)); +ASSYM(GD_ASTPENDING, offsetof(struct globaldata, gd_astpending)); + +ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); +ASSYM(MTX_RECURSE, offsetof(struct mtx, mtx_recurse)); +ASSYM(MTX_SAVEIPL, offsetof(struct mtx, mtx_saveipl)); +ASSYM(MTX_UNOWNED, MTX_UNOWNED); + ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_MD_FLAGS, offsetof(struct proc, p_md.md_flags)); ASSYM(P_MD_PCBPADDR, offsetof(struct proc, p_md.md_pcbpaddr)); @@ -81,6 +99,7 @@ ASSYM(PTESIZE, PTESIZE); ASSYM(U_PCB_ONFAULT, offsetof(struct user, u_pcb.pcb_onfault)); ASSYM(U_PCB_HWPCB_KSP, offsetof(struct user, u_pcb.pcb_hw.apcb_ksp)); ASSYM(U_PCB_CONTEXT, offsetof(struct user, u_pcb.pcb_context)); +ASSYM(U_PCB_SCHEDNEST, offsetof(struct user, u_pcb.pcb_schednest)); ASSYM(PCB_HW, offsetof(struct pcb, pcb_hw)); diff --git a/sys/powerpc/powerpc/vm_machdep.c b/sys/powerpc/powerpc/vm_machdep.c index 8baea02b8494..3831d67658c6 100644 --- a/sys/powerpc/powerpc/vm_machdep.c +++ b/sys/powerpc/powerpc/vm_machdep.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -246,8 +247,10 @@ cpu_exit(p) alpha_fpstate_drop(p); (void) splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + mtx_exit(&Giant, MTX_DEF); cnt.v_swtch++; - cpu_switch(p); + cpu_switch(); panic("cpu_exit"); } @@ -358,7 +361,7 @@ vunmapbuf(bp) } /* - * Force reset the processor by invalidating the entire address space! + * Reset back to firmware. 
*/ void cpu_reset() @@ -416,7 +419,7 @@ vm_page_zero_idle() return(0); #ifdef SMP - if (try_mplock()) { + if (KLOCK_ENTER(M_TRY)) { #endif s = splvm(); m = vm_page_list_find(PQ_FREE, free_rover, FALSE); @@ -447,7 +450,7 @@ vm_page_zero_idle() free_rover = (free_rover + PQ_PRIME2) & PQ_L2_MASK; splx(s); #ifdef SMP - rel_mplock(); + KLOCK_EXIT; #endif return (1); #ifdef SMP diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 42424d645228..d469a0493cbc 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -310,7 +310,7 @@ BUF_KERNPROC(struct buf *bp) { struct proc *p = curproc; - if (p != NULL && bp->b_lock.lk_lockholder == p->p_pid) + if (p != idleproc && bp->b_lock.lk_lockholder == p->p_pid) p->p_locks--; bp->b_lock.lk_lockholder = LK_KERNPROC; } diff --git a/sys/sys/bus.h b/sys/sys/bus.h index 2eff10cdfd4b..171728deca75 100644 --- a/sys/sys/bus.h +++ b/sys/sys/bus.h @@ -45,6 +45,14 @@ typedef struct devclass *devclass_t; typedef void driver_intr_t(void*); /* + * Interrupt type bits. These flags are used both by newbus interrupt + * registration (nexus.c) and also in struct intrec, which defines + * interrupt properties. + * + * XXX We should probably revisit this and remove the vestiges of the + * spls implicit in names like INTR_TYPE_TTY. In the meantime, don't + * confuse things by renaming them (Grog, 18 July 2000). + * * We define this in terms of bits because some devices may belong * to multiple classes (and therefore need to be included in * multiple interrupt masks, which is what this really serves to @@ -57,7 +65,12 @@ enum intr_type { INTR_TYPE_NET = 4, INTR_TYPE_CAM = 8, INTR_TYPE_MISC = 16, - INTR_TYPE_FAST = 128 + INTR_HEAVY = 32, /* heavyweight interrupt process */ + INTR_LIGHT = 64, /* light weight interrupt thread */ + INTR_THREADED = INTR_LIGHT | INTR_HEAVY, /* any kind of interrupt thread */ + INTR_FAST = 128, + INTR_EXCL = 256, /* exclusive interrupt */ + INTR_MPSAFE = 512 /* this interrupt is SMP safe */ }; typedef int (*devop_t)(void); diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h index f87630d87691..2f54efebf591 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -119,6 +119,8 @@ enum sysinit_sub_id { SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_RUN_QUEUE = 0x2400000, /* set up run queue*/ SI_SUB_CREATE_INIT = 0x2500000, /* create init process*/ + SI_SUB_SCHED_IDLE = 0x2600000, /* required idle procs */ + SI_SUB_SOFTINTR = 0x2700000, /* start soft interrupt thread */ SI_SUB_DRIVERS = 0x3100000, /* Let Drivers initialize */ SI_SUB_CONFIGURE = 0x3800000, /* Configure devices */ SI_SUB_VFS = 0x4000000, /* virtual file system*/ @@ -150,7 +152,7 @@ enum sysinit_sub_id { SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ - SI_SUB_SMP = 0xf000000, /* idle procs*/ + SI_SUB_SMP = 0xf000000, /* start the APs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler*/ }; diff --git a/sys/sys/kthread.h b/sys/sys/kthread.h index fb0d3f995cbe..5ca3736c64f7 100644 --- a/sys/sys/kthread.h +++ b/sys/sys/kthread.h @@ -44,7 +44,7 @@ struct kproc_desc { void kproc_start __P((const void *)); int kthread_create __P((void (*)(void *), void *, struct proc **, - const char *, ...)) __printflike(4, 5); + int flags, const char *, ...)) __printflike(5, 6); void kthread_exit __P((int)) __dead2; int suspend_kproc __P((struct proc *, int)); diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 4173fea63a58..900ac5d45dd0 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -122,6 +122,10 @@ 
struct pargs { struct jail; +struct mtx; + +struct ithd; + struct proc { TAILQ_ENTRY(proc) p_procq; /* run/sleep queue. */ LIST_ENTRY(proc) p_list; /* List of all processes. */ @@ -207,6 +211,9 @@ struct proc { int p_sig; /* for core dump/debugger XXX */ u_long p_code; /* for core dump/debugger XXX */ struct klist p_klist; /* knotes attached to this process */ + LIST_HEAD(, mtx) p_heldmtx; /* for debugging code */ + struct mtx *p_blocked; /* Mutex process is blocked on */ + LIST_HEAD(, mtx) p_contested; /* contested locks */ /* End area that is zeroed on creation. */ #define p_endzero p_startcopy @@ -216,8 +223,11 @@ struct proc { sigset_t p_sigmask; /* Current signal mask. */ stack_t p_sigstk; /* sp & on stack state variable */ + + int p_magic; /* Magic number. */ u_char p_priority; /* Process priority. */ u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */ + u_char p_nativepri; /* Priority before propogation. */ char p_nice; /* Process "nice" value. */ char p_comm[MAXCOMLEN+1]; @@ -244,17 +254,20 @@ struct proc { struct proc *p_leader; struct pasleep p_asleep; /* Used by asleep()/await(). */ void *p_emuldata; /* process-specific emulator state data */ + struct ithd *p_ithd; /* for interrupt threads only */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id -/* Status values. */ +/* Status values (p_stat) */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ +#define SWAIT 6 /* Waiting for interrupt or CPU. */ +#define SMTX 7 /* Blocked on a mutex. */ /* These flags are kept in p_flags. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -293,6 +306,8 @@ struct proc { #define P_OLDMASK 0x2000000 /* need to restore mask before pause */ #define P_ALTSTACK 0x4000000 /* have alternate signal stack */ +#define P_MAGIC 0xbeefface + #define P_CAN_SEE 1 #define P_CAN_KILL 2 #define P_CAN_SCHED 3 @@ -315,6 +330,56 @@ struct pcred { struct uidinfo *p_uidinfo; /* Per uid resource consumption */ }; +/* + * Describe an interrupt thread. There is one of these per irq. BSD/OS makes + * this a superset of struct proc, i.e. it_proc is the struct itself and not a + * pointer. We point in both directions, because it feels good that way. + */ +typedef struct ithd { + struct proc *it_proc; /* interrupt process */ + + LIST_HEAD(ihhead, intrhand) it_ihhead; + LIST_HEAD(srchead, isrc) it_isrchead; + + /* Fields used by all interrupt threads */ + LIST_ENTRY(ithd) it_list; /* All interrupt threads */ + int it_need; /* Needs service */ + int irq; /* irq */ + struct intrec *it_ih; /* head of handler queue */ + struct ithd *it_interrupted; /* Who we interrupted */ + + /* Fields used only for hard interrupt threads */ + int it_stray; /* Stray interrupts */ + +#ifdef APIC_IO + /* Used by APIC interrupt sources */ + int it_needeoi; /* An EOI is needed */ + int it_blocked; /* at least 1 blocked apic src */ +#endif + + /* stats */ +#ifdef SMP_DEBUG + int it_busy; /* failed attempts on runlock */ + int it_lostneeded; /* Number of it_need races lost */ + int it_invprio; /* Startup priority inversions */ +#endif +#ifdef NEEDED + /* + * These are in the BSD/OS i386 sources only, not in SPARC. + * I'm not yet sure we need them. 
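+ *
+ * (Editorial note drawn from the fields above rather than this change as
+ * such: "point in both directions" means an interrupt thread's proc and
+ * its ithd reference each other, so for an interrupt thread p one expects
+ *	p->p_ithd->it_proc == p
+ * with p_ithd being the member added to struct proc earlier in this file.)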
+ */ + LIST_HEAD(ihhead, intrhand) it_ihhead; + LIST_HEAD(srchead, isrc) it_isrchead; + + /* Fields used by all interrupt threads */ + LIST_ENTRY(ithd) it_list; /* All interrupt threads */ + + /* Fields user only for soft interrupt threads */ + sifunc_t it_service; /* service routine */ + int it_cnt; /* number of schedule events */ + +#endif +} ithd; #ifdef _KERNEL @@ -351,13 +416,13 @@ MALLOC_DECLARE(M_PARGS); * STOPEVENT is MP SAFE. */ extern void stopevent(struct proc*, unsigned int, unsigned int); -#define STOPEVENT(p,e,v) \ - do { \ - if ((p)->p_stops & (e)) { \ - get_mplock(); \ - stopevent(p,e,v); \ - rel_mplock(); \ - } \ +#define STOPEVENT(p,e,v) \ + do { \ + if ((p)->p_stops & (e)) { \ + mtx_enter(&Giant, MTX_DEF); \ + stopevent(p,e,v); \ + mtx_exit(&Giant, MTX_DEF); \ + } \ } while (0) /* hold process U-area in memory, normally for ptrace/procfs work */ @@ -381,6 +446,8 @@ extern u_long pgrphash; #ifndef curproc extern struct proc *curproc; /* Current running proc. */ +extern struct proc *prevproc; /* Previously running proc. */ +extern struct proc *idleproc; /* Current idle proc. */ extern u_int astpending; /* software interrupt pending */ extern int switchticks; /* `ticks' at last context switch. */ extern struct timeval switchtime; /* Uptime at last context switch */ @@ -398,12 +465,10 @@ extern struct proc *initproc, *pageproc, *updateproc; /* Process slots for init, #define NQS 32 /* 32 run queues. */ TAILQ_HEAD(rq, proc); -extern struct rq queues[]; +extern struct rq itqueues[]; extern struct rq rtqueues[]; +extern struct rq queues[]; extern struct rq idqueues[]; -extern int whichqs; /* Bit mask summary of non-empty Q's. */ -extern int whichrtqs; /* Bit mask summary of non-empty Q's. */ -extern int whichidqs; /* Bit mask summary of non-empty Q's. */ /* * XXX macros for scheduler. Shouldn't be here, but currently needed for @@ -447,7 +512,8 @@ int suser __P((const struct proc *)); int suser_xxx __P((const struct ucred *cred, const struct proc *proc, int flag)); void remrunqueue __P((struct proc *)); -void cpu_switch __P((struct proc *)); +void cpu_switch __P((void)); +void cpu_throw __P((void)) __dead2; void unsleep __P((struct proc *)); void cpu_exit __P((struct proc *)) __dead2; diff --git a/sys/sys/rtprio.h b/sys/sys/rtprio.h index 5178b0e4ea69..578afc515f63 100644 --- a/sys/sys/rtprio.h +++ b/sys/sys/rtprio.h @@ -38,11 +38,12 @@ * Process realtime-priority specifications to rtprio. */ -/* priority types */ +/* priority types. Start at 1 to catch uninitialized fields. */ -#define RTP_PRIO_REALTIME 0 -#define RTP_PRIO_NORMAL 1 -#define RTP_PRIO_IDLE 2 +#define RTP_PRIO_ITHREAD 1 /* interrupt thread */ +#define RTP_PRIO_REALTIME 2 /* real time process */ +#define RTP_PRIO_NORMAL 3 /* time sharing process */ +#define RTP_PRIO_IDLE 4 /* idle process */ /* RTP_PRIO_FIFO is POSIX.1B SCHED_FIFO. */ @@ -64,12 +65,34 @@ #define RTP_SET 1 #ifndef LOCORE +/* + * Scheduling class information. This is strictly speaking not only + * for real-time processes. We should replace it with two variables: + * class and priority. At the moment we use prio here for real-time + * and interrupt processes, and for others we use proc.p_pri. FIXME. + */ struct rtprio { - u_short type; + u_short type; /* scheduling class */ u_short prio; }; #endif +/* + * Interrupt thread priorities, after BSD/OS. 
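+ *
+ * Editorial sketch of the intended use (an assumption, not shown in this
+ * change): an interrupt thread serving a network device would carry a
+ * scheduling class built from these values, roughly
+ *
+ *	struct rtprio rtp;
+ *
+ *	rtp.type = RTP_PRIO_ITHREAD;
+ *	rtp.prio = PI_NET;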
+ */ +#define PI_REALTIME 1 /* very high priority (clock) */ +#define PI_AV 2 /* Audio/video devices */ +#define PI_TTYHIGH 3 /* High priority tty's (small FIFOs) */ +#define PI_TAPE 4 /* Tape devices (high for streaming) */ +#define PI_NET 5 /* Network interfaces */ +#define PI_DISK 6 /* Disks and SCSI */ +#define PI_TTYLOW 7 /* Ttys with big buffers */ +#define PI_DISKLOW 8 /* Disks that do programmed I/O */ +#define PI_DULL 9 /* We don't know or care */ + +/* Soft interrupt threads */ +#define PI_SOFT 15 /* All soft interrupts */ + #ifndef _KERNEL #include diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index 85ad07b209e2..0d1757fa4279 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -189,6 +189,10 @@ __sigseteq(sigset_t *set1, sigset_t *set2) #ifdef _KERNEL +#include +#include +#include + struct pgrp; struct proc; struct sigio; @@ -240,9 +244,9 @@ static __inline int __cursig(struct proc *p) (!(p->p_flag & P_TRACED) && SIGISEMPTY(tmpset))) { return(0); } - get_mplock(); + mtx_enter(&Giant, MTX_DEF); r = issignal(p); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); return(r); } diff --git a/sys/sys/smp.h b/sys/sys/smp.h index 69b716ba8579..20d4fa3a8873 100644 --- a/sys/sys/smp.h +++ b/sys/sys/smp.h @@ -15,6 +15,9 @@ #ifdef _KERNEL +#ifdef I386_CPU +#error SMP not supported with I386_CPU +#endif #if defined(SMP) && !defined(APIC_IO) # error APIC_IO required for SMP, add "options APIC_IO" to your config file. #endif /* SMP && !APIC_IO */ @@ -57,23 +60,6 @@ extern int bootMP_size; /* functions in mpboot.s */ void bootMP __P((void)); -/* global data in mplock.s */ -extern u_int mp_lock; -extern u_int isr_lock; -#ifdef RECURSIVE_MPINTRLOCK -extern u_int mpintr_lock; -#endif /* RECURSIVE_MPINTRLOCK */ - -/* functions in mplock.s */ -void get_mplock __P((void)); -void rel_mplock __P((void)); -int try_mplock __P((void)); -#ifdef RECURSIVE_MPINTRLOCK -void get_mpintrlock __P((void)); -void rel_mpintrlock __P((void)); -int try_mpintrlock __P((void)); -#endif /* RECURSIVE_MPINTRLOCK */ - /* global data in apic_vector.s */ extern volatile u_int stopped_cpus; extern volatile u_int started_cpus; @@ -185,23 +171,7 @@ extern int smp_started; extern volatile int smp_idle_loops; #endif /* !LOCORE */ -#else /* !SMP && !APIC_IO */ - -/* - * Create dummy MP lock empties - */ - -static __inline void -get_mplock(void) -{ -} - -static __inline void -rel_mplock(void) -{ -} - -#endif +#endif /* SMP && !APIC_IO */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h index 515c69f14090..83b60d61b9b9 100644 --- a/sys/sys/unistd.h +++ b/sys/sys/unistd.h @@ -202,7 +202,7 @@ /* * rfork() options. * - * XXX currently, operations without RFPROC set are not supported. + * XXX currently, some operations without RFPROC set are not supported. 
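+ *
+ * Editorial sketch (assumed usage, not part of this change): RFSTOPPED is
+ * flagged kernel-only via RFKERNELONLY below; together with the new flags
+ * argument of kthread_create() (sys/sys/kthread.h above) it lets a caller
+ * build a thread that stays off the run queue until explicitly started:
+ *
+ *	error = kthread_create(thread_main, arg, &p,
+ *	    RFSTOPPED | RFHIGHPID, "example thread");
+ *
+ * where thread_main, arg and p are hypothetical.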
*/ #define RFNAMEG (1<<0) /* UNIMPL new plan9 `name space' */ #define RFENVG (1<<1) /* UNIMPL copy plan9 `env space' */ @@ -210,14 +210,17 @@ #define RFNOTEG (1<<3) /* UNIMPL create new plan9 `note group' */ #define RFPROC (1<<4) /* change child (else changes curproc) */ #define RFMEM (1<<5) /* share `address space' */ -#define RFNOWAIT (1<<6) /* parent need not wait() on child */ +#define RFNOWAIT (1<<6) /* give child to init */ #define RFCNAMEG (1<<10) /* UNIMPL zero plan9 `name space' */ #define RFCENVG (1<<11) /* UNIMPL zero plan9 `env space' */ -#define RFCFDG (1<<12) /* zero fd table */ +#define RFCFDG (1<<12) /* close all fds, zero fd table */ #define RFTHREAD (1<<13) /* enable kernel thread support */ #define RFSIGSHARE (1<<14) /* share signal handlers */ #define RFLINUXTHPN (1<<16) /* do linux clone exit parent notification */ +#define RFSTOPPED (1<<17) /* leave child in a stopped state */ +#define RFHIGHPID (1<<18) /* use a pid higher then 10 (idleproc) */ #define RFPPWAIT (1<<31) /* parent sleeps until child exits (vfork) */ +#define RFKERNELONLY RFSTOPPED #endif /* !_POSIX_SOURCE */ diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index ba74b31de114..bbe5f2f3e1e7 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -57,7 +57,6 @@ #include #define KERNCRED proc0.p_ucred -#define CURPROC curproc #define DEBUG static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index cbc37ad4052c..ad30011a8554 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -133,7 +133,6 @@ static struct malloc_type *memtype[] = { */ #define TYPENAME(type) \ ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") -#define CURPROC curproc /* * End system adaptaion definitions. */ diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c index 906a4021a388..30ce635e5332 100644 --- a/sys/ufs/ufs/ufs_vfsops.c +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -43,8 +43,8 @@ #include #include -#include #include +#include #include #include diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 849a30affd60..ea39d7f1305c 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -74,9 +74,11 @@ #include #include +#include #include #include +#include #include #include @@ -316,8 +318,11 @@ faultin(p) s = splhigh(); - if (p->p_stat == SRUN) + if (p->p_stat == SRUN) { + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); + mtx_exit(&sched_lock, MTX_SPIN); + } p->p_flag |= P_INMEM; @@ -332,6 +337,8 @@ faultin(p) * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. + * + * Giant is still held at this point, to be released in tsleep. 
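+ *
+ * Editorial sketch, not part of this change: the kernel daemons converted
+ * in this patch (buf_daemon, sched_sync, vm_pageout, vm_daemon) take Giant
+ * explicitly on startup and, like this scheduler, rely on tsleep to drop
+ * and re-take it across each wait:
+ *
+ *	static void
+ *	some_daemon(void)		-- hypothetical daemon
+ *	{
+ *		mtx_enter(&Giant, MTX_DEF);
+ *		for (;;) {
+ *			tsleep(&some_wchan, PPAUSE, "dwait", 0);
+ *			-- Giant is held again here; do one round of work
+ *		}
+ *	}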
*/ /* ARGSUSED*/ static void @@ -343,6 +350,8 @@ scheduler(dummy) struct proc *pp; int ppri; + mtx_assert(&Giant, MA_OWNED); + loop: if (vm_page_count_min()) { VM_WAIT; diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 66829bb54a64..0f584c86ae29 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -153,6 +153,7 @@ vmtotal(SYSCTL_HANDLER_ARGS) case 0: continue; + case SMTX: case SSLEEP: case SSTOP: if (p->p_flag & P_INMEM) { @@ -166,6 +167,10 @@ vmtotal(SYSCTL_HANDLER_ARGS) continue; break; + case SWAIT: + totalp->t_sl++; + continue; + case SRUN: case SIDL: if (p->p_flag & P_INMEM) diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 97b221e7b9f0..d12ecacc4203 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,8 @@ #include #include +#include + /* * System initialization */ @@ -1280,6 +1283,9 @@ vm_size_t count; static void vm_pageout() { + + mtx_enter(&Giant, MTX_DEF); + /* * Initialize some paging parameters. */ @@ -1399,6 +1405,8 @@ vm_daemon() { struct proc *p; + mtx_enter(&Giant, MTX_DEF); + while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { diff --git a/usr.bin/top/machine.c b/usr.bin/top/machine.c index 0b52a70229d9..06b316850b92 100644 --- a/usr.bin/top/machine.c +++ b/usr.bin/top/machine.c @@ -130,7 +130,7 @@ static char up_header[] = char *state_abbrev[] = { - "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", + "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", "WAIT", "MUTEX" }; @@ -162,10 +162,10 @@ static long cp_diff[CPUSTATES]; /* these are for detailing the process states */ -int process_states[6]; +int process_states[8]; char *procstatenames[] = { "", " starting, ", " running, ", " sleeping, ", " stopped, ", - " zombie, ", + " zombie, ", " waiting, ", " mutex, ", NULL };
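Editorial aside, not part of the patch: a minimal user-level sketch of how the new states surface in top(1). p_stat is used directly as an index into state_abbrev[], so the entries added above must sit at the positions of SWAIT (6) and SMTX (7) from sys/sys/proc.h; the defines and the trimmed table below are copied here only for illustration.

	#include <stdio.h>

	/* Mirrors sys/sys/proc.h as patched above. */
	#define	SWAIT	6	/* Waiting for interrupt or CPU. */
	#define	SMTX	7	/* Blocked on a mutex. */

	/* Trimmed copy of top's table, indexed by p_stat. */
	static const char *state_abbrev[] = {
		"", "START", "RUN", "SLEEP", "STOP", "ZOMB", "WAIT", "MUTEX"
	};

	int
	main(void)
	{
		printf("%s %s\n", state_abbrev[SWAIT], state_abbrev[SMTX]);
		return (0);	/* prints "WAIT MUTEX" */
	}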