diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. + */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count -swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? */ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? 
*/ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? */ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. 
@@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index b389828b596f..baab5cb6dccb 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -49,6 +49,7 @@ #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_swtch.h" #include "opt_kstack_pages.h" #include @@ -151,11 +152,40 @@ int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) -extern int swtch_optim_stats; +int stupid_switch; +SYSCTL_INT(_debug, OID_AUTO, stupid_switch, + CTLFLAG_RW, &stupid_switch, 0, ""); +int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, - CTLFLAG_RD, &swtch_optim_stats, 0, ""); + CTLFLAG_RW, &swtch_optim_stats, 0, ""); +int tlb_flush_count; SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, - CTLFLAG_RD, &tlb_flush_count, 0, ""); + CTLFLAG_RW, &tlb_flush_count, 0, ""); +int lazy_flush_count; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_count, + CTLFLAG_RW, &lazy_flush_count, 0, ""); +int lazy_flush_fixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_fixup, + CTLFLAG_RW, &lazy_flush_fixup, 0, ""); +#ifdef SMP +int lazy_flush_smpfixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpfixup, + CTLFLAG_RW, &lazy_flush_smpfixup, 0, ""); +int lazy_flush_smpipi; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpipi, + CTLFLAG_RW, &lazy_flush_smpipi, 0, ""); +int lazy_flush_smpbadcr3; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpbadcr3, + CTLFLAG_RW, &lazy_flush_smpbadcr3, 0, ""); +int lazy_flush_smpmiss; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpmiss, + CTLFLAG_RW, &lazy_flush_smpmiss, 0, ""); +#endif +#endif +#ifdef LAZY_SWITCH +int lazy_flush_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_enable, + CTLFLAG_RW, &lazy_flush_enable, 0, ""); #endif int cold = 1; diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/amd64/mptable.c b/sys/amd64/amd64/mptable.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/amd64/amd64/mptable.c +++ b/sys/amd64/amd64/mptable.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, 
GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index 76b45b4219f6..77c53e0904ec 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -102,6 +102,7 @@ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #include #include @@ -184,6 +185,9 @@ struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; +#if defined(SMP) && defined(LAZY_SWITCH) +static struct mtx lazypmap_lock; +#endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ @@ -336,6 +340,9 @@ pmap_bootstrap(firstaddr, loadaddr) kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); +#if defined(SMP) && defined(LAZY_SWITCH) + mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); +#endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); @@ -1486,6 +1493,121 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef LAZY_SWITCH +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
+ */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + atomic_add_int(&lazy_flush_smpfixup, 1); + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&lazypmap_lock); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } +#ifdef SWTCH_OPTIM_STATS + lazy_flush_smpipi++; +#endif + } + mtx_unlock_spin(&lazypmap_lock); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; +#endif + } +} +#endif /* SMP */ +#endif /* LAZY_SWITCH */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
@@ -1507,6 +1629,9 @@ pmap_release(pmap_t pmap) ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); +#ifdef LAZY_SWITCH + pmap_lazyfix(pmap); +#endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); @@ -3321,9 +3446,10 @@ pmap_activate(struct thread *td) pmap_t pmap; u_int32_t cr3; + critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) - pmap->pm_active |= PCPU_GET(cpumask); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif @@ -3348,6 +3474,7 @@ pmap_activate(struct thread *td) #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif + critical_exit(); } vm_offset_t diff --git a/sys/amd64/amd64/swtch.s b/sys/amd64/amd64/swtch.s index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/amd64/amd64/swtch.s +++ b/sys/amd64/amd64/swtch.s @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count -swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? */ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? 
*/ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? */ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. 
@@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 9143d19cbbc8..6bcecff4001d 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -65,6 +65,22 @@ extern int szfreebsd4_sigcode; #ifdef COMPAT_43 extern int szosigcode; #endif +#ifdef SWTCH_OPTIM_STATS +extern int stupid_switch; +extern int swtch_optim_stats; +extern int tlb_flush_count; +extern int lazy_flush_count; +extern int lazy_flush_fixup; +#ifdef SMP +extern int lazy_flush_smpfixup; +extern int lazy_flush_smpipi; +extern int lazy_flush_smpbadcr3; +extern int lazy_flush_smpmiss; +#endif +#endif +#ifdef LAZY_SWITCH +extern int lazy_flush_enable; +#endif typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; diff --git a/sys/amd64/include/mptable.h b/sys/amd64/include/mptable.h index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/amd64/include/mptable.h +++ b/sys/amd64/include/mptable.h @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index 8fcf2cbb3c97..f044d453abd9 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -247,7 +247,7 @@ struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ - int pm_active; /* active on cpus */ + u_int pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ #ifdef PAE diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index b503c2ca37a2..6467365cc8e0 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -56,6 +56,7 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ #define IPI_INVLTLB XINVLTLB_OFFSET #define IPI_INVLPG XINVLPG_OFFSET #define IPI_INVLRNG XINVLRNG_OFFSET +#define IPI_LAZYPMAP XLAZYPMAP_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET diff --git a/sys/amd64/isa/intr_machdep.h b/sys/amd64/isa/intr_machdep.h index 7179268ba6a4..4cfecb5d24d6 100644 --- a/sys/amd64/isa/intr_machdep.h +++ b/sys/amd64/isa/intr_machdep.h @@ -116,6 +116,9 @@ /* inter-CPU rendezvous */ #define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ +/* lazy pmap release */ +#define XLAZYPMAP_OFFSET 
(ICU_OFFSET + 123) /* 0x9B */ + /* IPI to generate an additional software trap at the target CPU */ /* XXX in the middle of the interrupt range, overlapping IRQ48 */ #define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ @@ -206,7 +209,8 @@ inthand_t Xcpuast, /* Additional software trap on other cpu */ Xcpustop, /* CPU stops & waits for another CPU to restart it */ Xspuriousint, /* handle APIC "spurious INTs" */ - Xrendezvous; /* handle CPU rendezvous */ + Xrendezvous, /* handle CPU rendezvous */ + Xlazypmap; /* handle lazy pmap release */ #ifdef TEST_TEST1 inthand_t diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index 868cfbdb6546..de784be34666 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -6,6 +6,8 @@ GPL_MATH_EMULATE opt_math_emulate.h DISABLE_PSE opt_pmap.h PMAP_SHPGPERPROC opt_pmap.h DISABLE_PG_G opt_pmap.h +LAZY_SWITCH opt_swtch.h +SWTCH_OPTIM_STATS opt_swtch.h PPC_PROBE_CHIPSET opt_ppc.h PPC_DEBUG opt_ppc.h MAXMEM diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. + */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index b389828b596f..baab5cb6dccb 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -49,6 +49,7 @@ #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" +#include "opt_swtch.h" #include "opt_kstack_pages.h" #include @@ -151,11 +152,40 @@ int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) -extern int swtch_optim_stats; +int stupid_switch; +SYSCTL_INT(_debug, OID_AUTO, stupid_switch, + CTLFLAG_RW, &stupid_switch, 0, ""); +int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, - CTLFLAG_RD, &swtch_optim_stats, 0, ""); + CTLFLAG_RW, &swtch_optim_stats, 0, ""); +int tlb_flush_count; SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, - CTLFLAG_RD, &tlb_flush_count, 0, ""); + CTLFLAG_RW, &tlb_flush_count, 0, ""); +int lazy_flush_count; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_count, + CTLFLAG_RW, &lazy_flush_count, 0, ""); +int lazy_flush_fixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_fixup, + CTLFLAG_RW, &lazy_flush_fixup, 0, ""); +#ifdef SMP +int lazy_flush_smpfixup; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpfixup, + CTLFLAG_RW, &lazy_flush_smpfixup, 0, ""); +int lazy_flush_smpipi; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpipi, + CTLFLAG_RW, &lazy_flush_smpipi, 0, ""); +int lazy_flush_smpbadcr3; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpbadcr3, + CTLFLAG_RW, &lazy_flush_smpbadcr3, 0, ""); +int lazy_flush_smpmiss; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpmiss, + CTLFLAG_RW, &lazy_flush_smpmiss, 0, ""); +#endif +#endif +#ifdef LAZY_SWITCH +int lazy_flush_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, lazy_flush_enable, + CTLFLAG_RW, &lazy_flush_enable, 0, ""); #endif int cold = 1; diff --git a/sys/i386/i386/mp_machdep.c 
b/sys/i386/i386/mp_machdep.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/i386/i386/mptable.c b/sys/i386/i386/mptable.c index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/i386/mptable.c +++ b/sys/i386/i386/mptable.c @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index 76b45b4219f6..77c53e0904ec 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -102,6 +102,7 @@ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #include #include @@ -184,6 +185,9 @@ struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; +#if defined(SMP) && defined(LAZY_SWITCH) +static struct mtx lazypmap_lock; +#endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ @@ -336,6 +340,9 @@ pmap_bootstrap(firstaddr, loadaddr) kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); +#if defined(SMP) && defined(LAZY_SWITCH) + mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); +#endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); @@ -1486,6 +1493,121 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef LAZY_SWITCH +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
+ */ +static u_int *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + u_int mymask = PCPU_GET(cpumask); + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + atomic_add_int(&lazy_flush_smpfixup, 1); + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(u_int mymask) +{ + + if (rcr3() == lazyptd) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; + } else { + if (*lazymask & mymask) + lazy_flush_smpbadcr3++; + else + lazy_flush_smpmiss++; +#endif + } + atomic_clear_int(lazymask, mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int mymask = PCPU_GET(cpumask); + u_int mask; + register u_int spins; + + while ((mask = pmap->pm_active) != 0) { + spins = 50000000; + mask = mask & -mask; /* Find least significant set bit */ + mtx_lock_spin(&lazypmap_lock); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } +#ifdef SWTCH_OPTIM_STATS + lazy_flush_smpipi++; +#endif + } + mtx_unlock_spin(&lazypmap_lock); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + pmap->pm_active &= ~(PCPU_GET(cpumask)); +#ifdef SWTCH_OPTIM_STATS + lazy_flush_fixup++; +#endif + } +} +#endif /* SMP */ +#endif /* LAZY_SWITCH */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
@@ -1507,6 +1629,9 @@ pmap_release(pmap_t pmap) ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); +#ifdef LAZY_SWITCH + pmap_lazyfix(pmap); +#endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); @@ -3321,9 +3446,10 @@ pmap_activate(struct thread *td) pmap_t pmap; u_int32_t cr3; + critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) - pmap->pm_active |= PCPU_GET(cpumask); + atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif @@ -3348,6 +3474,7 @@ pmap_activate(struct thread *td) #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif + critical_exit(); } vm_offset_t diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index aaf0ec2c2073..793e63431f4a 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -37,30 +37,16 @@ */ #include "opt_npx.h" +#include "opt_swtch.h" #include -#ifdef SMP -#include -#include /* CHEAP_TPR, GRAB_LOPRIO */ -#endif - #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ - .data - - .globl panic - -#ifdef SWTCH_OPTIM_STATS - .globl swtch_optim_stats, tlb_flush_count -swtch_optim_stats: .long 0 /* number of _swtch_optims */ -tlb_flush_count: .long 0 -#endif - .text /* @@ -68,30 +54,60 @@ tlb_flush_count: .long 0 * * This is the second half of cpu_swtch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care - * about its state. + * about its state. This is only a slight optimization and is probably + * not worth it anymore. Note that we need to clear the pm_active bits so + * we do need the old proc if it still exists. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_throw) + movl PCPU(CPUID), %esi + movl 4(%esp),%ecx /* Old thread */ + testl %ecx,%ecx /* no thread? */ + jz 1f + /* release bit from old pm_active */ + movl TD_PROC(%ecx), %eax /* thread->td_proc */ + movl P_VMSPACE(%eax), %ebx /* proc->p_vmspace */ +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ +1: + movl 8(%esp),%ecx /* New thread */ + movl TD_PCB(%ecx),%edx +#ifdef SWTCH_OPTIM_STATS + incl tlb_flush_count +#endif + movl PCB_CR3(%edx),%eax + movl %eax,%cr3 /* new address space */ + /* set bit in new pm_active */ + movl TD_PROC(%ecx),%eax + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ jmp sw1 /* - * cpu_switch() + * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. + * 0(%esp) = ret + * 4(%esp) = oldtd + * 8(%esp) = newtd */ ENTRY(cpu_switch) - /* Switch to new thread. First, save context as needed. */ - movl PCPU(CURTHREAD),%ecx + /* Switch to new thread. First, save context. */ + movl 4(%esp),%ecx - /* If no thread to save, don't save it (XXX shouldn't happen). */ - testl %ecx,%ecx - jz sw1 - - movl TD_PROC(%ecx), %eax - movl P_VMSPACE(%eax), %edx - movl PCPU(CPUID), %eax - btrl %eax, VM_PMAP+PM_ACTIVE(%edx) +#ifdef INVARIANTS + testl %ecx,%ecx /* no thread? */ + jz badsw2 /* no, panic */ +#endif movl TD_PCB(%ecx),%edx @@ -125,10 +141,6 @@ ENTRY(cpu_switch) movl %eax,PCB_DR0(%edx) 1: -#ifdef SMP - /* XXX FIXME: we should be saving the local APIC TPR */ -#endif - #ifdef DEV_NPX /* have we used fp, and need a save? 
*/ cmpl %ecx,PCPU(FPCURTHREAD) @@ -140,56 +152,76 @@ ENTRY(cpu_switch) 1: #endif - /* Save is done. Now choose a new thread. */ - /* XXX still trashing space above the old "Top Of Stack". */ -sw1: - -#ifdef SMP - /* - * Stop scheduling if smp_active has become zero (for rebooting) and - * we are not the BSP. - */ - cmpl $0,smp_active - jne 1f - cmpl $0,PCPU(CPUID) - je 1f - movl PCPU(IDLETHREAD), %eax - jmp sw1b -1: -#endif - - /* - * Choose a new thread to schedule. choosethread() returns idlethread - * if it cannot find another thread to run. - */ - call choosethread /* Trash ecx, edx; ret eax. */ - + /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl %ecx,%edi + movl 8(%esp),%ecx /* New thread */ #ifdef INVARIANTS - testl %eax,%eax /* no thread? */ + testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif - -sw1b: - movl %eax,%ecx movl TD_PCB(%ecx),%edx - -#ifdef SWTCH_OPTIM_STATS - incl swtch_optim_stats -#endif + movl PCPU(CPUID), %esi /* switch address space */ - movl %cr3,%ebx /* The same address space? */ - cmpl PCB_CR3(%edx),%ebx - je 4f /* Yes, skip all that cruft */ + movl PCB_CR3(%edx),%eax +#ifdef LAZY_SWITCH + cmpl $0,lazy_flush_enable + je 1f + cmpl %eax,IdlePTD /* Kernel address space? */ +#ifdef SWTCH_OPTIM_STATS + je 3f +#else + je sw1 +#endif +1: + movl %cr3,%ebx /* The same address space? */ + cmpl %ebx,%eax +#ifdef SWTCH_OPTIM_STATS + je 2f /* Yes, skip all that cruft */ +#else + je sw1 +#endif +#endif + #ifdef SWTCH_OPTIM_STATS - decl swtch_optim_stats incl tlb_flush_count #endif - movl PCB_CR3(%edx),%ebx /* Tell the CPU about the */ - movl %ebx,%cr3 /* new address space */ -4: + movl %eax,%cr3 /* new address space */ - movl PCPU(CPUID), %esi + /* Release bit from old pmap->pm_active */ + movl TD_PROC(%edi), %eax /* oldproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btrl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* clear old */ + + /* Set bit in new pmap->pm_active */ + movl TD_PROC(%ecx),%eax /* newproc */ + movl P_VMSPACE(%eax), %ebx +#ifdef SMP + lock +#endif + btsl %esi, VM_PMAP+PM_ACTIVE(%ebx) /* set new */ + +#ifdef LAZY_SWITCH +#ifdef SWTCH_OPTIM_STATS + jmp sw1 + +2: /* same address space */ + incl swtch_optim_stats + jmp sw1 + +3: /* kernel address space */ + incl lazy_flush_count +#endif +#endif + +sw1: + /* + * At this point, we've switched address spaces and are ready + * to load up the rest of the next context. + */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ btsl %esi, private_tss /* mark use of private tss */ @@ -221,11 +253,6 @@ sw1b: movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: - /* Note in vmspace that this cpu is using it. */ - movl TD_PROC(%ecx),%eax - movl P_VMSPACE(%eax), %ebx - movl PCPU(CPUID), %eax - btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* Restore context. */ movl PCB_EBX(%edx),%ebx @@ -241,10 +268,6 @@ sw1b: movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURTHREAD) /* into next thread */ -#ifdef SMP - /* XXX FIXME: we should be restoring the local APIC TPR */ -#endif - /* * Determine the LDT to use and load it if is the default one and * that is not the current one. 
@@ -301,12 +324,23 @@ cpu_switch_load_gs: ret #ifdef INVARIANTS +badsw1: + pushal + pushl $sw0_1 + call panic +sw0_1: .asciz "cpu_throw: no newthread supplied" + +badsw2: + pushal + pushl $sw0_2 + call panic +sw0_2: .asciz "cpu_switch: no curthread supplied" + badsw3: pushal pushl $sw0_3 call panic - -sw0_3: .asciz "cpu_switch: choosethread returned NULL" +sw0_3: .asciz "cpu_switch: no newthread supplied" #endif /* diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h index 9143d19cbbc8..6bcecff4001d 100644 --- a/sys/i386/include/md_var.h +++ b/sys/i386/include/md_var.h @@ -65,6 +65,22 @@ extern int szfreebsd4_sigcode; #ifdef COMPAT_43 extern int szosigcode; #endif +#ifdef SWTCH_OPTIM_STATS +extern int stupid_switch; +extern int swtch_optim_stats; +extern int tlb_flush_count; +extern int lazy_flush_count; +extern int lazy_flush_fixup; +#ifdef SMP +extern int lazy_flush_smpfixup; +extern int lazy_flush_smpipi; +extern int lazy_flush_smpbadcr3; +extern int lazy_flush_smpmiss; +#endif +#endif +#ifdef LAZY_SWITCH +extern int lazy_flush_enable; +#endif typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; diff --git a/sys/i386/include/mptable.h b/sys/i386/include/mptable.h index 954a7fdfdb8f..4cc8bf545791 100644 --- a/sys/i386/include/mptable.h +++ b/sys/i386/include/mptable.h @@ -27,6 +27,7 @@ #include "opt_cpu.h" #include "opt_kstack_pages.h" +#include "opt_swtch.h" #ifdef SMP #include @@ -634,6 +635,12 @@ mp_enable(u_int boot_addr) setidt(XSTATCLOCK_OFFSET, Xstatclock, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef LAZY_SWITCH + /* install an inter-CPU IPI for lazy pmap release */ + setidt(XLAZYPMAP_OFFSET, Xlazypmap, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + /* install an inter-CPU IPI for all-CPU rendezvous */ setidt(XRENDEZVOUS_OFFSET, Xrendezvous, SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -2598,7 +2605,7 @@ ap_init(void) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ panic("scheduler returned us to %s", __func__); } diff --git a/sys/i386/include/pmap.h b/sys/i386/include/pmap.h index 8fcf2cbb3c97..f044d453abd9 100644 --- a/sys/i386/include/pmap.h +++ b/sys/i386/include/pmap.h @@ -247,7 +247,7 @@ struct pmap { pd_entry_t *pm_pdir; /* KVA of page directory */ vm_object_t pm_pteobj; /* Container for pte's */ TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */ - int pm_active; /* active on cpus */ + u_int pm_active; /* active on cpus */ struct pmap_statistics pm_stats; /* pmap statistics */ LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ #ifdef PAE diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index b503c2ca37a2..6467365cc8e0 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -56,6 +56,7 @@ extern int current_postcode; /** XXX currently in mp_machdep.c */ #define IPI_INVLTLB XINVLTLB_OFFSET #define IPI_INVLPG XINVLPG_OFFSET #define IPI_INVLRNG XINVLRNG_OFFSET +#define IPI_LAZYPMAP XLAZYPMAP_OFFSET #define IPI_RENDEZVOUS XRENDEZVOUS_OFFSET #define IPI_AST XCPUAST_OFFSET #define IPI_STOP XCPUSTOP_OFFSET diff --git a/sys/i386/isa/apic_vector.s b/sys/i386/isa/apic_vector.s index e4b6ea5fd65f..d91ff1ce4cee 100644 --- a/sys/i386/isa/apic_vector.s +++ b/sys/i386/isa/apic_vector.s @@ -3,6 +3,7 @@ * $FreeBSD$ */ +#include "opt_swtch.h" #include #include @@ -648,7 +649,28 @@ Xrendezvous: POP_FRAME iret +#ifdef LAZY_SWITCH +/* + * Clean up when we lose out on 
the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. + */ + SUPERALIGN_TEXT + .globl Xlazypmap +Xlazypmap: + PUSH_FRAME + movl $KDSEL, %eax + mov %ax, %ds /* use KERNEL data segment */ + mov %ax, %es + movl $KPSEL, %eax + mov %ax, %fs + + call pmap_lazyfix_action + movl $0, lapic+LA_EOI /* End Of Interrupt to APIC */ + POP_FRAME + iret +#endif + .data .globl apic_pin_trigger diff --git a/sys/i386/isa/intr_machdep.h b/sys/i386/isa/intr_machdep.h index 7179268ba6a4..4cfecb5d24d6 100644 --- a/sys/i386/isa/intr_machdep.h +++ b/sys/i386/isa/intr_machdep.h @@ -116,6 +116,9 @@ /* inter-CPU rendezvous */ #define XRENDEZVOUS_OFFSET (ICU_OFFSET + 122) /* 0x9A */ +/* lazy pmap release */ +#define XLAZYPMAP_OFFSET (ICU_OFFSET + 123) /* 0x9B */ + /* IPI to generate an additional software trap at the target CPU */ /* XXX in the middle of the interrupt range, overlapping IRQ48 */ #define XCPUAST_OFFSET (ICU_OFFSET + 48) /* 0x50 */ @@ -206,7 +209,8 @@ inthand_t Xcpuast, /* Additional software trap on other cpu */ Xcpustop, /* CPU stops & waits for another CPU to restart it */ Xspuriousint, /* handle APIC "spurious INTs" */ - Xrendezvous; /* handle CPU rendezvous */ + Xrendezvous, /* handle CPU rendezvous */ + Xlazypmap; /* handle lazy pmap release */ #ifdef TEST_TEST1 inthand_t diff --git a/sys/kern/kern_kse.c b/sys/kern/kern_kse.c index 36994610d513..d3ceb096cefc 100644 --- a/sys/kern/kern_kse.c +++ b/sys/kern/kern_kse.c @@ -1250,7 +1250,13 @@ thread_exit(void) PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ + mtx_assert(&sched_lock, MA_OWNED); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif + panic("I'm a teapot!"); /* NOTREACHED */ } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 8c3924344df3..377ad429ff45 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -98,6 +98,9 @@ reassigned to keep this true. 
#include #include #include +#if defined(SMP) && defined(__i386__) +#include +#endif #include CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); @@ -122,8 +125,21 @@ choosethread(void) struct thread *td; struct ksegrp *kg; +#if defined(SMP) && defined(__i386__) + if (smp_active == 0 && PCPU_GET(cpuid) != 0) { + /* Shutting down, run idlethread on AP's */ + td = PCPU_GET(idlethread); + ke = td->td_kse; + CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); + ke->ke_flags |= KEF_DIDRUN; + TD_SET_RUNNING(td); + return (td); + } +#endif + retry: - if ((ke = sched_choose())) { + ke = sched_choose(); + if (ke) { td = ke->ke_thread; KASSERT((td->td_kse == ke), ("kse/thread mismatch")); kg = ke->ke_ksegrp; diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index cf6591f6a701..b582e2027441 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -41,6 +41,9 @@ #include "opt_ddb.h" #include "opt_ktrace.h" +#ifdef __i386__ +#include "opt_swtch.h" +#endif #include #include @@ -67,6 +70,9 @@ #endif #include +#ifdef SWTCH_OPTIM_STATS +#include +#endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) @@ -449,12 +455,16 @@ void mi_switch(void) { struct bintime new_switchtime; - struct thread *td = curthread; /* XXX */ - struct proc *p = td->td_proc; /* XXX */ + struct thread *td; +#if defined(__i386__) || defined(__sparc64__) + struct thread *newtd; +#endif + struct proc *p; u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); - + td = curthread; /* XXX */ + p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && @@ -506,7 +516,17 @@ mi_switch(void) thread_switchout(td); sched_switchout(td); +#if defined(__i386__) || defined(__sparc64__) + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); /* SHAZAM!! */ +#ifdef SWTCH_OPTIM_STATS + else + stupid_switch++; +#endif +#else cpu_switch(); /* SHAZAM!!*/ +#endif sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c index 31c638ecb434..5f23f6dfcaef 100644 --- a/sys/kern/kern_thr.c +++ b/sys/kern/kern_thr.c @@ -106,7 +106,11 @@ thr_exit1(void) td->td_last_kse = NULL; thread_stash(td); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif } #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 36994610d513..d3ceb096cefc 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1250,7 +1250,13 @@ thread_exit(void) PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. 
*/ + mtx_assert(&sched_lock, MA_OWNED); +#if defined(__i386__) || defined(__sparc64__) + cpu_throw(td, choosethread()); +#else cpu_throw(); +#endif + panic("I'm a teapot!"); /* NOTREACHED */ } diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 94c93902c6e8..3b7526a536c5 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -84,6 +84,9 @@ #include "opt_ddb.h" #include "opt_witness.h" +#ifdef __i386__ +#include "opt_swtch.h" +#endif #include #include @@ -295,6 +298,9 @@ static struct witness_order_list_entry order_lists[] = { #if defined(__i386__) && defined(APIC_IO) { "tlb", &lock_class_mtx_spin }, #endif +#if defined(__i386__) && defined(LAZY_SWITCH) + { "lazypmap", &lock_class_mtx_spin }, +#endif #ifdef __sparc64__ { "ipi", &lock_class_mtx_spin }, #endif diff --git a/sys/sparc64/sparc64/mp_machdep.c b/sys/sparc64/sparc64/mp_machdep.c index 35a5474097f8..3f7e2d7ee158 100644 --- a/sys/sparc64/sparc64/mp_machdep.c +++ b/sys/sparc64/sparc64/mp_machdep.c @@ -357,7 +357,7 @@ cpu_mp_bootstrap(struct pcpu *pc) /* ok, now grab sched_lock and enter the scheduler */ mtx_lock_spin(&sched_lock); - cpu_throw(); /* doesn't return */ + cpu_throw(NULL, choosethread()); /* doesn't return */ } void diff --git a/sys/sparc64/sparc64/swtch.S b/sys/sparc64/sparc64/swtch.S index d506a07e33b0..4e2128b7edbe 100644 --- a/sys/sparc64/sparc64/swtch.S +++ b/sys/sparc64/sparc64/swtch.S @@ -36,31 +36,29 @@ #include "assym.s" +/* + * void cpu_throw(struct thread *old, struct thread *new) + */ ENTRY(cpu_throw) save %sp, -CCFSZ, %sp - call choosethread - ldx [PCPU(CURTHREAD)], %l0 flushw - b,a %xcc, .Lsw1 - nop + mov %i0, %l0 + ba %xcc, .Lsw1 + mov %i1, %o0 END(cpu_throw) +/* + * void cpu_switch(struct thread *old, struct thread *new) + */ ENTRY(cpu_switch) - /* - * Choose a new thread. If its the same as the current one, do - * nothing. - */ save %sp, -CCFSZ, %sp - call choosethread - ldx [PCPU(CURTHREAD)], %l0 - cmp %l0, %o0 - be,a,pn %xcc, 4f - nop - ldx [%l0 + TD_PCB], %l1 + mov %i0, %l0 + mov %i1, %o0 /* * If the current thread was using floating point, save its context. */ + ldx [%l0 + TD_PCB], %l1 ldx [%l0 + TD_FRAME], %l2 ldx [%l2 + TF_FPRS], %l3 andcc %l3, FPRS_FEF, %g0 diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 42eda069c172..5d2bb04f7aaa 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -889,8 +889,13 @@ void setsugid(struct proc *p); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void cpu_idle(void); +#if defined(__i386__) || defined(__sparc64__) +void cpu_switch(struct thread *old, struct thread *new); +void cpu_throw(struct thread *old, struct thread *new) __dead2; +#else void cpu_switch(void); void cpu_throw(void) __dead2; +#endif void unsleep(struct thread *); void userret(struct thread *, struct trapframe *, u_int);
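
For readers who do not speak i386 assembler, the following is an editor's
minimal C sketch of the address-space fast path that the rewritten
cpu_switch() performs before falling into "sw1".  Every field, counter,
and helper used here (pcb_cr3, pm_active, IdlePTD, lazy_flush_enable,
rcr3()/load_cr3(), vmspace_pmap(), and the SWTCH_OPTIM_STATS counters)
appears in the diff above; the wrapper function itself and its cpumask
parameter are hypothetical, introduced only for illustration.

/*
 * Sketch only -- not part of the patch.  Mirrors the #ifdef LAZY_SWITCH
 * block between "movl PCB_CR3(%edx),%eax" and "sw1:" in cpu_switch.
 */
static void
switch_address_space(struct thread *oldtd, struct thread *newtd, u_int cpumask)
{
	u_int new_cr3 = newtd->td_pcb->pcb_cr3;

#ifdef LAZY_SWITCH
	if (lazy_flush_enable && new_cr3 == (u_int)IdlePTD) {
		/*
		 * The incoming thread runs purely in the kernel, and every
		 * page directory maps the kernel, so keep borrowing the
		 * outgoing pmap: no %cr3 reload, no TLB flush.  pm_active
		 * deliberately keeps this CPU's bit set, which is exactly
		 * what pmap_lazyfix()/Xlazypmap must undo later.
		 */
#ifdef SWTCH_OPTIM_STATS
		lazy_flush_count++;
#endif
		return;
	}
	if (new_cr3 == rcr3()) {
		/* Same address space: skip the reload and the flush. */
#ifdef SWTCH_OPTIM_STATS
		swtch_optim_stats++;
#endif
		return;
	}
#endif
	/* Real switch: load the new page tables and move our active bit. */
#ifdef SWTCH_OPTIM_STATS
	tlb_flush_count++;
#endif
	load_cr3(new_cr3);
	atomic_clear_int(&vmspace_pmap(oldtd->td_proc->p_vmspace)->pm_active,
	    cpumask);
	atomic_set_int(&vmspace_pmap(newtd->td_proc->p_vmspace)->pm_active,
	    cpumask);
}

Note the design choice this sketch makes visible: on the two early-return
paths the old pmap's pm_active bit is left set, so a dying process's page
directory can still be "in use" by an idle or kernel-only CPU.  That is
the situation pmap_lazyfix() resolves at pmap_release() time, locally via
pmap_lazyfix_self() or remotely via the new IPI_LAZYPMAP interrupt.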
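
Both knobs are i386 kernel config options (the options.i386 hunk wires
LAZY_SWITCH and SWTCH_OPTIM_STATS into opt_swtch.h), and machdep.c
additionally exposes the lazy path through a read-write sysctl.  A
plausible way to experiment with this change, assuming an otherwise
stock kernel configuration:

	options	LAZY_SWITCH		# take the lazy %cr3 path
	options	SWTCH_OPTIM_STATS	# expose the debug.* switch counters

Then, at run time, the optimization can be disabled without a rebuild,
and the counters compared before and after a workload:

	sysctl debug.lazy_flush_enable=0
	sysctl debug.swtch_optim_stats debug.tlb_flush_count \
	    debug.lazy_flush_count debug.lazy_flush_smpipi

(debug.lazy_flush_smpipi and the other lazy_flush_smp* counters exist
only in SMP kernels, per the #ifdef SMP block in machdep.c.)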