From 46969da5f8b7d09f4fb1368d6be669af63110127 Mon Sep 17 00:00:00 2001 From: peter Date: Wed, 2 Apr 2003 23:53:30 +0000 Subject: [PATCH] Commit a partial lazy thread switch mechanism for i386. It isn't as lazy as it could be and can do with some more cleanup. Currently it's under options LAZY_SWITCH. What this does is avoid %cr3 reloads for short context switches that do not involve another user process. i.e.: we can take an interrupt, switch to a kthread and return to the user without explicitly flushing the tlb. However, this isn't as exciting as it could be, the interrupt overhead is still high and too much blocks on Giant still. There are some debug sysctls, for stats and for an on/off switch. The main problem with doing this has been "what if the process that you're running on exits while we're borrowing its address space?" - in this case we use an IPI to give it a kick when we're about to reclaim the pmap. It's not compiled in unless you add the LAZY_SWITCH option. I want to fix a few more things and get some more feedback before turning it on by default. This is NOT a replacement for Bosko's lazy interrupt stuff. This was more meant for the kthread case, while his was for interrupts. Mine helps a little for interrupts, but his helps a lot more. The stats are enabled with options SWTCH_OPTIM_STATS - this has been a pseudo-option for years, I just added a bunch of stuff to it. One non-trivial change was to select a new thread before calling cpu_switch() in the first place. This allows us to catch the silly case of doing a cpu_switch() to the current process. This happens uncomfortably often. This simplifies a bit of the asm code in cpu_switch (no longer have to call choosethread() in the middle). This has been implemented on i386 and (thanks to jake) sparc64. The others will come soon. This is actually separate from the lazy switch stuff. 
Glanced at by: jake, jhb --- sys/amd64/amd64/apic_vector.S | 22 ++++ sys/amd64/amd64/cpu_switch.S | 198 ++++++++++++++++++------------- sys/amd64/amd64/machdep.c | 36 +++++- sys/amd64/amd64/mp_machdep.c | 9 +- sys/amd64/amd64/mptable.c | 9 +- sys/amd64/amd64/pmap.c | 129 +++++++++++++++++++- sys/amd64/amd64/swtch.s | 198 ++++++++++++++++++------------- sys/amd64/include/md_var.h | 16 +++ sys/amd64/include/mptable.h | 9 +- sys/amd64/include/pmap.h | 2 +- sys/amd64/include/smp.h | 1 + sys/amd64/isa/intr_machdep.h | 6 +- sys/conf/options.i386 | 2 + sys/i386/i386/apic_vector.s | 22 ++++ sys/i386/i386/machdep.c | 36 +++++- sys/i386/i386/mp_machdep.c | 9 +- sys/i386/i386/mptable.c | 9 +- sys/i386/i386/pmap.c | 129 +++++++++++++++++++- sys/i386/i386/swtch.s | 198 ++++++++++++++++++------------- sys/i386/include/md_var.h | 16 +++ sys/i386/include/mptable.h | 9 +- sys/i386/include/pmap.h | 2 +- sys/i386/include/smp.h | 1 + sys/i386/isa/apic_vector.s | 22 ++++ sys/i386/isa/intr_machdep.h | 6 +- sys/kern/kern_kse.c | 6 + sys/kern/kern_switch.c | 18 ++- sys/kern/kern_synch.c | 26 +++- sys/kern/kern_thr.c | 4 + sys/kern/kern_thread.c | 6 + sys/kern/subr_witness.c | 6 + sys/sparc64/sparc64/mp_machdep.c | 2 +- sys/sparc64/sparc64/swtch.S | 26 ++-- sys/sys/proc.h | 5 + 34 files changed, 912 insertions(+), 283 deletions(-) diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. 
+ */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count -swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? 
*/ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. 
choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? */ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. 
+ */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. @@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index b389828b596f..baab5cb6dccb 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -49,6 +49,7 @@ #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_swtch.h" #include "opt_kstack_pages.h" #include @@ -151,11 +152,40 @@ int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) -extern int swtch_optim_stats; +int stupid_switch; +SYSCTL_INT(_debug, OID_AUTO, stupid_switch, + CTLFLAG_RW, &stupid_switch, 0, ""); +int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, - CTLFLAG_RD, &swtch_optim_stats, 0, ""); + CTLFLAG_RW, &swtch_optim_stats, 0, ""); +int tlb_flush_count; SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, - CTLFLAG_RD, &tlb_flush_count, 0, ""); + CTLFLAG_RW, &tlb_flush_count, 0, 
""); +int lazy_flush_count; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_count, + CTLFLAG_RW, &lazy_flush_count, 0, ""); +int lazy_flush_fixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_fixup, + CTLFLAG_RW, &lazy_flush_fixup, 0, ""); +#ifdef SMP +int lazy_flush_smpfixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpfixup, + CTLFLAG_RW, &lazy_flush_smpfixup, 0, ""); +int lazy_flush_smpipi; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpipi, + CTLFLAG_RW, &lazy_flush_smpipi, 0, ""); +int lazy_flush_smpbadcr3; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpbadcr3, + CTLFLAG_RW, &lazy_flush_smpbadcr3, 0, ""); +int lazy_flush_smpmiss; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpmiss, + CTLFLAG_RW, &lazy_flush_smpmiss, 0, ""); +#endif +#endif +#ifdef LAZY_SWITCH +int lazy_flush_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_enable, + CTLFLAG_RW, &lazy_flush_enable, 0, ""); #endif int cold = 1; diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 954a7fdfdb8f..4cc8bf545791 100644 
--- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 76b45b4219f6..77c53e0904ec 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -102,6 +102,7 @@ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #include #include @@ -184,6 +185,9 @@ struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; +#if defined(SMP) && defined(LAZY_SWITCH) +static struct mtx lazypmap_lock; +#endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ @@ -336,6 +340,9 @@ pmap_bootstrap(firstaddr, loadaddr) kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); +#if defined(SMP) && defined(LAZY_SWITCH) + mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); +#endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); @@ -1486,6 +1493,121 @@ pmap_allocpte(pmap_t pmap, 
vm_offset_t va) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef LAZY_SWITCH +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. + */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + atomic_add_int(&lazy_flush_smpfixup, 1); + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&lazypmap_lock); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } +#ifdef SWTCH_OPTIM_STATS + lazy_flush_smpipi++; +#endif + } + mtx_unlock_spin(&lazypmap_lock); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up 
on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; +#endif + } +} +#endif /* SMP */ +#endif /* LAZY_SWITCH */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. @@ -1507,6 +1629,9 @@ pmap_release(pmap_t pmap) ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); +#ifdef LAZY_SWITCH + pmap_lazyfix(pmap); +#endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); @@ -3321,9 +3446,10 @@ pmap_activate(struct thread *td) pmap_t pmap; u_int32_t cr3; + critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) - pmap->pm_active |= PCPU_GET(cpumask); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif @@ -3348,6 +3474,7 @@ pmap_activate(struct thread *td) #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif + critical_exit(); } vm_offset_t diff --git a/sys/amd64/amd64/swtch.s b/sys/amd64/amd64/swtch.s index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/amd64/amd64/swtch.s +++ b/sys/amd64/amd64/swtch.s @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count 
-swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? */ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? 
*/ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? 
*/ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. 
@@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 9143d19cbbc8..6bcecff4001d 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -65,6 +65,22 @@ extern int szfreebsd4_sigcode; #ifdef COMPAT_43 extern int szosigcode; #endif +#ifdef SWTCH_OPTIM_STATS +extern int stupid_switch; +extern int swtch_optim_stats; +extern int tlb_flush_count; +extern int lazy_flush_count; +extern int lazy_flush_fixup; +#ifdef SMP +extern int lazy_flush_smpfixup; +extern int lazy_flush_smpipi; +extern int lazy_flush_smpbadcr3; +extern int lazy_flush_smpmiss; +#endif +#endif +#ifdef LAZY_SWITCH +extern int lazy_flush_enable; +#endif typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/amd64/include/mptable.h +++ b/sys/amd64/include/mptable.h @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); 
PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 8fcf2cbb3c97..f044d453abd9 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -247,7 +247,7 @@ struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ - int pm_active; /* active on cpus */ + u_int pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ #ifdef PAE diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index b503c2ca37a2..6467365cc8e0 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -56,6 +56,7 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ #define IPI_INVLTLB XINVLTLB_OFFSET #define IPI_INVLPG XINVLPG_OFFSET #define IPI_INVLRNG XINVLRNG_OFFSET +#define IPI_LAZYPMAP XLAZYPMAP_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h index 7179268ba6a4..4cfecb5d24d6 100644 --- a/sys/amd64/isa/intr_machdep.h +++ b/sys/amd64/isa/intr_machdep.h @@ -116,6 +116,9 @@ /* inter-CPU rendezvous */ #define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ +/* lazy pmap release */ +#define XLAZYPMAP_OFFSET (ICU_OFFSET + 123) /* 0x9B */ + /* IPI to generate an additional software trap at the target CPU */ /* XXX in the middle of the interrupt range, overlapping IRQ48 */ #define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ @@ -206,7 +209,8 @@ inthand_t Xcpuast, /* Additional software trap on other cpu */ Xcpustop, /* CPU stops & waits for another CPU to restart it */ Xspuriousint, /* handle APIC "spurious INTs" */ - Xrendezvous; /* 
handle CPU rendezvous */ + Xrendezvous, /* handle CPU rendezvous */ + Xlazypmap; /* handle lazy pmap release */ #ifdef TEST_TEST1 inthand_t diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index 868cfbdb6546..de784be34666 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -6,6 +6,8 @@ GPL_MATH_EMULATE opt_math_emulate.h DISABLE_PSE opt_pmap.h PMAP_SHPGPERPROC opt_pmap.h DISABLE_PG_G opt_pmap.h +LAZY_SWITCH opt_swtch.h +SWTCH_OPTIM_STATS opt_swtch.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h MAXMEM diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. 
+ */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index b389828b596f..baab5cb6dccb 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -49,6 +49,7 @@ #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_swtch.h" #include "opt_kstack_pages.h" #include @@ -151,11 +152,40 @@ int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) -extern int swtch_optim_stats; +int stupid_switch; +SYSCTL_INT(_debug, OID_AUTO, stupid_switch, + CTLFLAG_RW, &stupid_switch, 0, ""); +int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, - CTLFLAG_RD, &swtch_optim_stats, 0, ""); + CTLFLAG_RW, &swtch_optim_stats, 0, ""); +int tlb_flush_count; SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, - CTLFLAG_RD, &tlb_flush_count, 0, ""); + CTLFLAG_RW, &tlb_flush_count, 0, ""); +int lazy_flush_count; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_count, + CTLFLAG_RW, &lazy_flush_count, 0, ""); +int lazy_flush_fixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_fixup, + CTLFLAG_RW, &lazy_flush_fixup, 0, ""); +#ifdef SMP +int lazy_flush_smpfixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpfixup, + CTLFLAG_RW, &lazy_flush_smpfixup, 0, ""); +int lazy_flush_smpipi; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpipi, + CTLFLAG_RW, &lazy_flush_smpipi, 0, ""); +int lazy_flush_smpbadcr3; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpbadcr3, + CTLFLAG_RW, &lazy_flush_smpbadcr3, 0, ""); +int lazy_flush_smpmiss; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpmiss, + CTLFLAG_RW, &lazy_flush_smpmiss, 0, ""); +#endif +#endif +#ifdef LAZY_SWITCH +int lazy_flush_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_enable, 
+ CTLFLAG_RW, &lazy_flush_enable, 0, ""); #endif int cold = 1; diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/i386/mptable.c +++ b/sys/i386/i386/mptable.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler 
returned us to %s", __func__); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 76b45b4219f6..77c53e0904ec 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -102,6 +102,7 @@ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #include #include @@ -184,6 +185,9 @@ struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; +#if defined(SMP) && defined(LAZY_SWITCH) +static struct mtx lazypmap_lock; +#endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ @@ -336,6 +340,9 @@ pmap_bootstrap(firstaddr, loadaddr) kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); +#if defined(SMP) && defined(LAZY_SWITCH) + mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); +#endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); @@ -1486,6 +1493,121 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef LAZY_SWITCH +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
+ */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + atomic_add_int(&lazy_flush_smpfixup, 1); + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&lazypmap_lock); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } +#ifdef SWTCH_OPTIM_STATS + lazy_flush_smpipi++; +#endif + } + mtx_unlock_spin(&lazypmap_lock); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. 
+ */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; +#endif + } +} +#endif /* SMP */ +#endif /* LAZY_SWITCH */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. @@ -1507,6 +1629,9 @@ pmap_release(pmap_t pmap) ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); +#ifdef LAZY_SWITCH + pmap_lazyfix(pmap); +#endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); @@ -3321,9 +3446,10 @@ pmap_activate(struct thread *td) pmap_t pmap; u_int32_t cr3; + critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) - pmap->pm_active |= PCPU_GET(cpumask); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif @@ -3348,6 +3474,7 @@ pmap_activate(struct thread *td) #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif + critical_exit(); } vm_offset_t diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count -swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). 
It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? */ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? 
*/ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? 
*/ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. 
@@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h index 9143d19cbbc8..6bcecff4001d 100644 --- a/sys/i386/include/md_var.h +++ b/sys/i386/include/md_var.h @@ -65,6 +65,22 @@ extern int szfreebsd4_sigcode; #ifdef COMPAT_43 extern int szosigcode; #endif +#ifdef SWTCH_OPTIM_STATS +extern int stupid_switch; +extern int swtch_optim_stats; +extern int tlb_flush_count; +extern int lazy_flush_count; +extern int lazy_flush_fixup; +#ifdef SMP +extern int lazy_flush_smpfixup; +extern int lazy_flush_smpipi; +extern int lazy_flush_smpbadcr3; +extern int lazy_flush_smpmiss; +#endif +#endif +#ifdef LAZY_SWITCH +extern int lazy_flush_enable; +#endif typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/include/mptable.h +++ b/sys/i386/include/mptable.h @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); 
PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 8fcf2cbb3c97..f044d453abd9 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -247,7 +247,7 @@ struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ - int pm_active; /* active on cpus */ + u_int pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ #ifdef PAE diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index b503c2ca37a2..6467365cc8e0 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -56,6 +56,7 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ #define IPI_INVLTLB XINVLTLB_OFFSET #define IPI_INVLPG XINVLPG_OFFSET #define IPI_INVLRNG XINVLRNG_OFFSET +#define IPI_LAZYPMAP XLAZYPMAP_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET diff --git a/sys/i386/isa/apic_vector.s b/sys/i386/isa/apic_vector.s index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/i386/isa/apic_vector.s +++ b/sys/i386/isa/apic_vector.s @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. 
+ */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/i386/isa/intr_machdep.h b/sys/i386/isa/intr_machdep.h index 7179268ba6a4..4cfecb5d24d6 100644 --- a/sys/i386/isa/intr_machdep.h +++ b/sys/i386/isa/intr_machdep.h @@ -116,6 +116,9 @@ /* inter-CPU rendezvous */ #define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ +/* lazy pmap release */ +#define XLAZYPMAP_OFFSET (ICU_OFFSET + 123) /* 0x9B */ + /* IPI to generate an additional software trap at the target CPU */ /* XXX in the middle of the interrupt range, overlapping IRQ48 */ #define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ @@ -206,7 +209,8 @@ inthand_t Xcpuast, /* Additional software trap on other cpu */ Xcpustop, /* CPU stops & waits for another CPU to restart it */ Xspuriousint, /* handle APIC "spurious INTs" */ - Xrendezvous; /* handle CPU rendezvous */ + Xrendezvous, /* handle CPU rendezvous */ + Xlazypmap; /* handle lazy pmap release */ #ifdef TEST_TEST1 inthand_t diff --git a/sys/kern/kern_kse.c b/sys/kern/kern_kse.c index 36994610d513..d3ceb096cefc 100644 --- a/sys/kern/kern_kse.c +++ b/sys/kern/kern_kse.c @@ -1250,7 +1250,13 @@ thread_exit(void) PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ + mtx_assert(&sched_lock, MA_OWNED); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif + panic("I'm a teapot!"); /* NOTREACHED */ } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 8c3924344df3..377ad429ff45 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -98,6 +98,9 @@ reassigned to keep this true. 
#include #include #include +#if defined(SMP) && defined(__i386__) +#include +#endif #include CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); @@ -122,8 +125,21 @@ choosethread(void) struct thread *td; struct ksegrp *kg; +#if defined(SMP) && defined(__i386__) + if (smp_active == 0 && PCPU_GET(cpuid) != 0) { + /* Shutting down, run idlethread on AP's */ + td = PCPU_GET(idlethread); + ke = td->td_kse; + CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); + ke->ke_flags |= KEF_DIDRUN; + TD_SET_RUNNING(td); + return (td); + } +#endif + retry: - if ((ke = sched_choose())) { + ke = sched_choose(); + if (ke) { td = ke->ke_thread; KASSERT((td->td_kse == ke), ("kse/thread mismatch")); kg = ke->ke_ksegrp; diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index cf6591f6a701..b582e2027441 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -41,6 +41,9 @@ #include "opt_ddb.h" #include "opt_ktrace.h" +#ifdef __i386__ +#include "opt_swtch.h" +#endif #include #include @@ -67,6 +70,9 @@ #endif #include +#ifdef SWTCH_OPTIM_STATS +#include +#endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) @@ -449,12 +455,16 @@ void mi_switch(void) { struct bintime new_switchtime; - struct thread *td = curthread; /* XXX */ - struct proc *p = td->td_proc; /* XXX */ + struct thread *td; +#if defined(__i386__) || defined(__sparc64__) + struct thread *newtd; +#endif + struct proc *p; u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); - + td = curthread; /* XXX */ + p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && @@ -506,7 +516,17 @@ mi_switch(void) thread_switchout(td); sched_switchout(td); +#if defined(__i386__) || defined(__sparc64__) + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); /* SHAZAM!! 
*/ +#ifdef SWTCH_OPTIM_STATS + else + stupid_switch++; +#endif +#else cpu_switch(); /* SHAZAM!!*/ +#endif sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 31c638ecb434..5f23f6dfcaef 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -106,7 +106,11 @@ thr_exit1(void) td->td_last_kse = NULL; thread_stash(td); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif } #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 36994610d513..d3ceb096cefc 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1250,7 +1250,13 @@ thread_exit(void) PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ + mtx_assert(&sched_lock, MA_OWNED); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif + panic("I'm a teapot!"); /* NOTREACHED */ } diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 94c93902c6e8..3b7526a536c5 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -84,6 +84,9 @@ #include "opt_ddb.h" #include "opt_witness.h" +#ifdef __i386__ +#include "opt_swtch.h" +#endif #include #include @@ -295,6 +298,9 @@ static struct witness_order_list_entry order_lists[] = { #if defined(__i386__) && defined(APIC_IO) { "tlb", &lock_class_mtx_spin }, #endif +#if defined(__i386__) && defined(LAZY_SWITCH) + { "lazypmap", &lock_class_mtx_spin }, +#endif #ifdef __sparc64__ { "ipi", &lock_class_mtx_spin }, #endif diff --git a/sys/sparc64/sparc64/mp_machdep.c b/sys/sparc64/sparc64/mp_machdep.c index 35a5474097f8..3f7e2d7ee158 100644 --- a/sys/sparc64/sparc64/mp_machdep.c +++ b/sys/sparc64/sparc64/mp_machdep.c @@ -357,7 +357,7 @@ cpu_mp_bootstrap(struct pcpu *pc) /* ok, now grab sched_lock and enter the scheduler */ mtx_lock_spin(&sched_lock); - cpu_throw(); 
/* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ } void diff --git a/sys/sparc64/sparc64/swtch.S b/sys/sparc64/sparc64/swtch.S index d506a07e33b0..4e2128b7edbe 100644 --- a/sys/sparc64/sparc64/swtch.S +++ b/sys/sparc64/sparc64/swtch.S @@ -36,31 +36,29 @@ #include "assym.s" +/* + * void cpu_throw(struct thread *old, struct thread *new) + */ ENTRY(cpu_throw) save %sp, -CCFSZ, %sp - call choosethread - ldx [PCPU(CURTHREAD)], %l0 flushw - b,a %xcc, .Lsw1 - nop + mov %i0, %l0 + ba %xcc, .Lsw1 + mov %i1, %o0 END(cpu_throw) +/* + * void cpu_switch(struct thread *old, struct thread *new) + */ ENTRY(cpu_switch) - /* - * Choose a new thread. If its the same as the current one, do - * nothing. - */ save %sp, -CCFSZ, %sp - call choosethread - ldx [PCPU(CURTHREAD)], %l0 - cmp %l0, %o0 - be,a,pn %xcc, 4f - nop - ldx [%l0 + TD_PCB], %l1 + mov %i0, %l0 + mov %i1, %o0 /* * If the current thread was using floating point, save its context. */ + ldx [%l0 + TD_PCB], %l1 ldx [%l0 + TD_FRAME], %l2 ldx [%l2 + TF_FPRS], %l3 andcc %l3, FPRS_FEF, %g0 diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 42eda069c172..5d2bb04f7aaa 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -889,8 +889,13 @@ void setsugid(struct proc *p); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void cpu_idle(void); +#if defined(__i386__) || defined(__sparc64__) +void cpu_switch(struct thread *old, struct thread *new); +void cpu_throw(struct thread *old, struct thread *new) __dead2; +#else void cpu_switch(void); void cpu_throw(void) __dead2; +#endif void unsleep(struct thread *); void userret(struct thread *, struct trapframe *, u_int);