diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index a78b60107e05..86e78c4beb2e 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -357,4 +357,20 @@ IDTVEC(rendezvous) POP_FRAME iret +/* + * Clean up when we lose out on the lazy context switch optimization. + * ie: when we are about to release a PTD but a cpu is still borrowing it. + */ + SUPERALIGN_TEXT +IDTVEC(lazypmap) + PUSH_FRAME + SET_KERNEL_SREGS + cld + + call pmap_lazyfix_action + + movl lapic, %eax + movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ + POP_FRAME + iret #endif /* SMP */ diff --git a/sys/i386/i386/db_trace.c b/sys/i386/i386/db_trace.c index 79da4dc576dc..445d9c5fe0df 100644 --- a/sys/i386/i386/db_trace.c +++ b/sys/i386/i386/db_trace.c @@ -312,7 +312,8 @@ db_nextframe(struct i386_frame **fp, db_addr_t *ip, struct thread *td) frame_type = TRAP_TIMERINT; else if (strcmp(name, "Xcpustop") == 0 || strcmp(name, "Xrendezvous") == 0 || - strcmp(name, "Xipi_intr_bitmap_handler") == 0) + strcmp(name, "Xipi_intr_bitmap_handler") == 0 || + strcmp(name, "Xlazypmap") == 0) frame_type = TRAP_INTERRUPT; } diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 966e46ea0729..fbaae89408a4 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -166,6 +166,7 @@ u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; +u_long *ipi_lazypmap_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif @@ -575,6 +576,10 @@ cpu_mp_start(void) setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + /* Install an inter-CPU IPI for lazy pmap release */ + setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); @@ -1718,6 +1723,8 @@ mp_ipi_intrcnt(void *dummy) intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); + snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); + intrcnt_add(buf, &ipi_lazypmap_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index c85f1387030e..d0699b5311a9 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -1900,6 +1900,104 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. + */ +static cpuset_t *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + +#ifdef COUNT_IPIS + (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; +#endif + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(cpuset_t mymask) +{ + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_NAND_ATOMIC(lazymask, &mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + cpuset_t mymask, mask; + u_int spins; + int lbs; + + mask = pmap->pm_active; + while (!CPU_EMPTY(&mask)) { + spins = 50000000; + + /* Find least significant set bit. */ + lsb = cpusetobj_ffs(&mask); + lsb--; + CPU_SETOF(lsb, &mask); + mtx_lock_spin(&smp_ipi_mtx); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + mymask = PCPU_GET(cpumask); + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&smp_ipi_mtx); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + mask = pmap->pm_active; + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); + } +} +#endif /* SMP */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. @@ -1917,6 +2015,7 @@ pmap_release(pmap_t pmap) KASSERT(pmap->pm_root == NULL, ("pmap_release: pmap has reserved page table page(s)")); + pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index 654756910ea3..680b032557c9 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -174,6 +174,12 @@ ENTRY(cpu_switch) /* switch address space */ movl PCB_CR3(%edx),%eax +#ifdef PAE + cmpl %eax,IdlePDPT /* Kernel address space? */ +#else + cmpl %eax,IdlePTD /* Kernel address space? */ +#endif + je sw0 READ_CR3(%ebx) /* The same address space? */ cmpl %ebx,%eax je sw0 diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index cdec9c87bace..33b2578d3a59 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -42,6 +42,7 @@ extern u_long *ipi_invlrng_counts[MAXCPU]; extern u_long *ipi_invlpg_counts[MAXCPU]; extern u_long *ipi_invlcache_counts[MAXCPU]; extern u_long *ipi_rendezvous_counts[MAXCPU]; +extern u_long *ipi_lazypmap_counts[MAXCPU]; #endif /* IPI handlers */ @@ -52,7 +53,8 @@ inthand_t IDTVEC(invlcache), /* Write back and invalidate cache */ IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ IDTVEC(cpustop), /* CPU stops & waits to be restarted */ - IDTVEC(rendezvous); /* handle CPU rendezvous */ + IDTVEC(rendezvous), /* handle CPU rendezvous */ + IDTVEC(lazypmap); /* handle lazy pmap release */ /* functions in mp_machdep.c */ void cpu_add(u_int apic_id, char boot_cpu); diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c index 1565859cf0b5..2d05596b89d2 100644 --- a/sys/i386/xen/mp_machdep.c +++ b/sys/i386/xen/mp_machdep.c @@ -154,6 +154,7 @@ static cpuset_t hyperthreading_cpus_mask; extern void Xhypervisor_callback(void); extern void failsafe_callback(void); +extern void pmap_lazyfix_action(void); struct cpu_group * cpu_topo(void) @@ -341,16 +342,24 @@ iv_invlcache(uintptr_t a, uintptr_t b) atomic_add_int(&smp_tlb_wait, 1); } +static void +iv_lazypmap(uintptr_t a, uintptr_t b) +{ + pmap_lazyfix_action(); + atomic_add_int(&smp_tlb_wait, 1); +} + /* * These start from "IPI offset" APIC_IPI_INTS */ -static call_data_func_t *ipi_vectors[5] = +static call_data_func_t *ipi_vectors[6] = { iv_rendezvous, iv_invltlb, iv_invlpg, iv_invlrng, iv_invlcache, + iv_lazypmap, }; /* diff --git a/sys/i386/xen/pmap.c b/sys/i386/xen/pmap.c index 74ace613d133..1149b6f516c4 100644 --- a/sys/i386/xen/pmap.c +++ b/sys/i386/xen/pmap.c @@ -1683,6 +1683,104 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) * Pmap allocation/deallocation routines. ***************************************************/ +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. + */ +static cpuset_t *lazymask; +static u_int lazyptd; +static volatile u_int lazywait; + +void pmap_lazyfix_action(void); + +void +pmap_lazyfix_action(void) +{ + +#ifdef COUNT_IPIS + (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; +#endif + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); + atomic_store_rel_int(&lazywait, 1); +} + +static void +pmap_lazyfix_self(cpuset_t mymask) +{ + + if (rcr3() == lazyptd) + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_NAND_ATOMIC(lazymask, &mymask); +} + + +static void +pmap_lazyfix(pmap_t pmap) +{ + cpuset_t mymask, mask; + u_int spins; + int lsb; + + mask = pmap->pm_active; + while (!CPU_EMPTY(&mask)) { + spins = 50000000; + + /* Find least significant set bit. */ + lsb = cpusetobj_ffs(&mask); + lsb--; + CPU_SETOF(lsb, &mask); + mtx_lock_spin(&smp_ipi_mtx); +#ifdef PAE + lazyptd = vtophys(pmap->pm_pdpt); +#else + lazyptd = vtophys(pmap->pm_pdir); +#endif + mymask = PCPU_GET(cpumask); + if (mask == mymask) { + lazymask = &pmap->pm_active; + pmap_lazyfix_self(mymask); + } else { + atomic_store_rel_int((u_int *)&lazymask, + (u_int)&pmap->pm_active); + atomic_store_rel_int(&lazywait, 0); + ipi_selected(mask, IPI_LAZYPMAP); + while (lazywait == 0) { + ia32_pause(); + if (--spins == 0) + break; + } + } + mtx_unlock_spin(&smp_ipi_mtx); + if (spins == 0) + printf("pmap_lazyfix: spun for 50000000\n"); + mask = pmap->pm_active; + } +} + +#else /* SMP */ + +/* + * Cleaning up on uniprocessor is easy. For various reasons, we're + * unlikely to have to even execute this code, including the fact + * that the cleanup is deferred until the parent does a wait(2), which + * means that another userland process has run. + */ +static void +pmap_lazyfix(pmap_t pmap) +{ + u_int cr3; + + cr3 = vtophys(pmap->pm_pdir); + if (cr3 == rcr3()) { + load_cr3(PCPU_GET(curpcb)->pcb_cr3); + CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); + } +} +#endif /* SMP */ + /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. @@ -1708,6 +1806,7 @@ pmap_release(pmap_t pmap) mtx_lock(&createdelete_lock); #endif + pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock);