From a2622e5dc24f5a23ea124d49b3bd6b1cec647f46 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Thu, 9 Jul 2009 09:34:11 +0000 Subject: [PATCH] Restore the segment registers and segment base MSRs for amd64 syscall return path only when neither thread was context switched while executing syscall code nor syscall explicitely modified LDT or MSRs. Save segment registers in trap handlers before interrupts are enabled, to not allow context switches to happen before registers are saved. Use separated byte in pcb for indication of fast/full return, since pcb_flags are not synchronized with context switches. The change puts back syscall microbenchmark numbers that were slowed down after commit of the support for LDT on amd64. Reviewed by: jeff Tested (and tested, and tested ...) by: pho Approved by: re (kensmith) --- sys/amd64/amd64/cpu_switch.S | 1 + sys/amd64/amd64/exception.S | 28 +++++++++++++++++++--------- sys/amd64/amd64/genassym.c | 1 + sys/amd64/amd64/machdep.c | 4 ++++ sys/amd64/amd64/sys_machdep.c | 8 ++++++++ sys/amd64/amd64/vm_machdep.c | 5 +++++ sys/amd64/ia32/ia32_exception.S | 12 +++++++----- sys/amd64/ia32/ia32_reg.c | 1 + sys/amd64/ia32/ia32_signal.c | 7 +++++++ sys/amd64/include/pcb.h | 3 ++- sys/amd64/linux32/linux32_sysvec.c | 5 +++++ 11 files changed, 60 insertions(+), 15 deletions(-) diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index 6fc829099b4a..364875ea6b93 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -97,6 +97,7 @@ END(cpu_throw) ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movq TD_PCB(%rdi),%r8 + movb $1,PCB_FULL_IRET(%r8) movq (%rsp),%rax /* Hardware registers */ movq %r15,PCB_R15(%r8) diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index daa5c2592988..d78e2341fa36 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -162,19 +162,20 @@ IDTVEC(align) .globl alltraps .type alltraps,@function alltraps: + movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz alltraps_testi /* already running with kernel GS.base */ swapgs + movq PCPU(CURPCB),%rdi + movb $0,PCB_FULL_IRET(%rdi) movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) alltraps_testi: testl $PSL_I,TF_RFLAGS(%rsp) - jz alltraps_pushregs + jz alltraps_pushregs_no_rdi sti -alltraps_pushregs: - movq %rdi,TF_RDI(%rsp) alltraps_pushregs_no_rdi: movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) @@ -233,14 +234,17 @@ calltrap: .globl alltraps_noen .type alltraps_noen,@function alltraps_noen: + movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs + movq PCPU(CURPCB),%rdi + movb $0,PCB_FULL_IRET(%rdi) 1: movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) - jmp alltraps_pushregs + jmp alltraps_pushregs_no_rdi IDTVEC(dblfault) subq $TF_ERR,%rsp @@ -278,12 +282,13 @@ IDTVEC(dblfault) IDTVEC(page) subq $TF_ERR,%rsp movl $T_PAGEFLT,TF_TRAPNO(%rsp) + movq %rdi,TF_RDI(%rsp) /* free up a GP register */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs -1: - movq %rdi,TF_RDI(%rsp) /* free up a GP register */ - movq %cr2,%rdi /* preserve %cr2 before .. */ + movq PCPU(CURPCB),%rdi + movb $0,PCB_FULL_IRET(%rdi) +1: movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) @@ -311,7 +316,9 @@ IDTVEC(prot) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 2f /* already running with kernel GS.base */ 1: swapgs -2: movw %fs,TF_FS(%rsp) +2: movq PCPU(CURPCB),%rdi + movb $1,PCB_FULL_IRET(%rdi) /* always full iret from GPF */ + movw %fs,TF_FS(%rsp) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) @@ -341,6 +348,8 @@ IDTVEC(fast_syscall) movw %gs,TF_GS(%rsp) movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) + movq PCPU(CURPCB),%r11 + movb $0,PCB_FULL_IRET(%r11) sti movq $KUDSEL,TF_SS(%rsp) movq $KUCSEL,TF_CS(%rsp) @@ -644,7 +653,8 @@ doreti_exit: */ testb $SEL_RPL_MASK,TF_CS(%rsp) jz ld_regs - + cmpb $0,PCB_FULL_IRET(%r8) + je ld_regs testl $TF_HASSEGS,TF_FLAGS(%rsp) je set_segs diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index e25f2f948e0b..9d783095b38b 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -141,6 +141,7 @@ ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); +ASSYM(PCB_FULL_IRET, offsetof(struct pcb, pcb_full_iret)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); ASSYM(PCB_GS32BIT, PCB_GS32BIT); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 654e12b4551b..41e7a03cd41d 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -382,6 +382,7 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; + td->td_pcb->pcb_full_iret = 1; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -483,6 +484,7 @@ sigreturn(td, uap) signotify(td); PROC_UNLOCK(p); td->td_pcb->pcb_flags |= PCB_FULLCTX; + td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } @@ -853,6 +855,7 @@ exec_setregs(td, entry, stack, ps_strings) pcb->pcb_gsbase = 0; pcb->pcb_flags &= ~(PCB_32BIT | PCB_GS32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; + pcb->pcb_full_iret = 1; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = entry; @@ -2031,6 +2034,7 @@ set_mcontext(struct thread *td, const mcontext_t *mcp) td->td_pcb->pcb_gsbase = mcp->mc_gsbase; } td->td_pcb->pcb_flags |= PCB_FULLCTX; + td->td_pcb->pcb_full_iret = 1; return (0); } diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index c01ead2aca8b..1cba8a21c21f 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -103,6 +103,7 @@ sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space) error = amd64_get_ldt(td, largs); break; case I386_SET_LDT: + td->td_pcb->pcb_full_iret = 1; if (largs->descs != NULL) { lp = (struct user_segment_descriptor *) kmem_alloc(kernel_map, largs->num * @@ -132,6 +133,7 @@ update_gdt_gsbase(struct thread *td, uint32_t base) if (td != curthread) return; + td->td_pcb->pcb_full_iret = 1; critical_enter(); sd = PCPU_GET(gs32p); sd->sd_lobase = base & 0xffffff; @@ -146,6 +148,7 @@ update_gdt_fsbase(struct thread *td, uint32_t base) if (td != curthread) return; + td->td_pcb->pcb_full_iret = 1; critical_enter(); sd = PCPU_GET(fs32p); sd->sd_lobase = base & 0xffffff; @@ -201,6 +204,7 @@ sysarch(td, uap) if (!error) { pcb->pcb_fsbase = i386base; td->td_frame->tf_fs = _ufssel; + pcb->pcb_full_iret = 1; update_gdt_fsbase(td, i386base); } break; @@ -212,6 +216,7 @@ sysarch(td, uap) error = copyin(uap->parms, &i386base, sizeof(i386base)); if (!error) { pcb->pcb_gsbase = i386base; + pcb->pcb_full_iret = 1; td->td_frame->tf_gs = _ugssel; update_gdt_gsbase(td, i386base); } @@ -225,6 +230,7 @@ sysarch(td, uap) if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { pcb->pcb_fsbase = a64base; + pcb->pcb_full_iret = 1; td->td_frame->tf_fs = _ufssel; } else error = EINVAL; @@ -240,6 +246,7 @@ sysarch(td, uap) if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { pcb->pcb_gsbase = a64base; + pcb->pcb_full_iret = 1; td->td_frame->tf_gs = _ugssel; } else error = EINVAL; @@ -525,6 +532,7 @@ amd64_set_ldt(td, uap, descs) uap->start, uap->num, (void *)uap->descs); #endif + td->td_pcb->pcb_full_iret = 1; p = td->td_proc; if (descs == NULL) { /* Free descriptors */ diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 928be345d2c7..51d1d6218b10 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -186,6 +186,9 @@ cpu_fork(td1, p2, td2, flags) /* As an i386, do not copy io permission bitmap. */ pcb2->pcb_tssp = NULL; + /* New segment registers. */ + pcb2->pcb_full_iret = 1; + /* Copy the LDT, if necessary. */ mdp1 = &td1->td_proc->p_md; mdp2 = &p2->p_md; @@ -336,6 +339,7 @@ cpu_set_upcall(struct thread *td, struct thread *td0) */ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); pcb2->pcb_flags &= ~PCB_FPUINITDONE; + pcb2->pcb_full_iret = 1; /* * Create a new fresh stack for the new thread. @@ -450,6 +454,7 @@ cpu_set_user_tls(struct thread *td, void *tls_base) } #endif td->td_pcb->pcb_fsbase = (register_t)tls_base; + td->td_pcb->pcb_full_iret = 1; return (0); } diff --git a/sys/amd64/ia32/ia32_exception.S b/sys/amd64/ia32/ia32_exception.S index 76c5d5ac5873..341f00e5a83b 100644 --- a/sys/amd64/ia32/ia32_exception.S +++ b/sys/amd64/ia32/ia32_exception.S @@ -42,10 +42,16 @@ SUPERALIGN_TEXT IDTVEC(int0x80_syscall) swapgs - sti pushq $2 /* sizeof "int 0x80" */ subq $TF_ERR,%rsp /* skip over tf_trapno */ movq %rdi,TF_RDI(%rsp) + movq PCPU(CURPCB),%rdi + movb $0,PCB_FULL_IRET(%rdi) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + sti movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) @@ -60,10 +66,6 @@ IDTVEC(int0x80_syscall) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi diff --git a/sys/amd64/ia32/ia32_reg.c b/sys/amd64/ia32/ia32_reg.c index 49dd4e26409a..83f6783e55c8 100644 --- a/sys/amd64/ia32/ia32_reg.c +++ b/sys/amd64/ia32/ia32_reg.c @@ -125,6 +125,7 @@ set_regs32(struct thread *td, struct reg32 *regs) tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; + td->td_pcb->pcb_full_iret = 1; tp->tf_flags = TF_HASSEGS; tp->tf_rdi = regs->r_edi; tp->tf_rsi = regs->r_esi; diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 37e8013a416e..d7c1dd5c6f90 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -159,6 +159,7 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) ia32_get_fpcontext(td, mcp); mcp->mc_fsbase = td->td_pcb->pcb_fsbase; mcp->mc_gsbase = td->td_pcb->pcb_gsbase; + td->td_pcb->pcb_full_iret = 1; return (0); } @@ -201,6 +202,7 @@ ia32_set_mcontext(struct thread *td, const struct ia32_mcontext *mcp) tp->tf_rsp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_flags |= PCB_FULLCTX; + td->td_pcb->pcb_full_iret = 1; return (0); } @@ -394,6 +396,7 @@ freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; + td->td_pcb->pcb_full_iret = 1; /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); @@ -514,6 +517,7 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; + td->td_pcb->pcb_full_iret = 1; /* XXXKIB leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); @@ -611,6 +615,7 @@ freebsd4_freebsd32_sigreturn(td, uap) SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); + td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ @@ -702,6 +707,7 @@ freebsd32_sigreturn(td, uap) SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); + td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } @@ -747,5 +753,6 @@ ia32_setregs(td, entry, stack, ps_strings) /* Return via doreti so that we can change to a different %cs */ pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT; pcb->pcb_flags &= ~PCB_GS32BIT; + td->td_pcb->pcb_full_iret = 1; td->td_retval[1] = 0; } diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h index 7361049b28ed..b26188a5bf70 100644 --- a/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -72,12 +72,13 @@ struct pcb { struct savefpu pcb_save; uint16_t pcb_initial_fpucw; - caddr_t pcb_onfault; /* copyin/out fault recovery */ + caddr_t pcb_onfault; /* copyin/out fault recovery */ /* 32-bit segment descriptor */ struct user_segment_descriptor pcb_gs32sd; /* local tss, with i/o bitmap; NULL for common */ struct amd64tss *pcb_tssp; + char pcb_full_iret; }; struct xpcb { diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 2626ccfec91f..77186a1d37d3 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -423,6 +423,7 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; + td->td_pcb->pcb_full_iret = 1; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -545,6 +546,7 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; + td->td_pcb->pcb_full_iret = 1; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -645,6 +647,7 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; + td->td_pcb->pcb_full_iret = 1; return (EJUSTRETURN); } @@ -746,6 +749,7 @@ linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) regs->tf_rflags = eflags; regs->tf_rsp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; + td->td_pcb->pcb_full_iret = 1; /* * call sigaltstack & ignore results.. @@ -864,6 +868,7 @@ exec_linux_setregs(td, entry, stack, ps_strings) regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = ps_strings; + td->td_pcb->pcb_full_iret = 1; load_cr0(rcr0() | CR0_MP | CR0_TS); fpstate_drop(td);