From 2c66cccab7ceceb3eed086da3b2dedfc77ce72de Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 1 Apr 2009 13:09:26 +0000 Subject: [PATCH] Save and restore segment registers on amd64 when entering and leaving the kernel. Fill and read segment registers for mcontext and signals. Handle traps caused by restoration of the invalidated selectors. Implement user-mode creation and manipulation of the process-specific LDT descriptors for amd64, see sysarch(2). Implement support for TSS i/o port access permission bitmap for amd64. Context-switch LDT and TSS. Do not save and restore segment registers on the context switch; that is handled by kernel enter/leave trampolines now. Remove segment restore code from the signal trampolines for freebsd/amd64, freebsd/ia32 and linux/i386 for the same reason. Implement amd64-specific compat shims for sysarch. Linuxolator (temporarily?) switched to use gsbase for thread_area pointer. TODO: Currently, gdb is not adapted to show segment registers from struct reg. Also, no machine-dependent ptrace command is added to set segment registers for the debugged process. 
In collaboration with: pho Discussed with: peter Reviewed by: jhb Linuxolator tested by: dchagin --- sys/amd64/acpica/acpi_switch.S | 17 +- sys/amd64/amd64/apic_vector.S | 7 +- sys/amd64/amd64/cpu_switch.S | 155 +++----- sys/amd64/amd64/db_interface.c | 10 +- sys/amd64/amd64/db_trace.c | 6 +- sys/amd64/amd64/exception.S | 197 +++++++--- sys/amd64/amd64/genassym.c | 23 +- sys/amd64/amd64/machdep.c | 113 ++++-- sys/amd64/amd64/mp_machdep.c | 20 +- sys/amd64/amd64/sys_machdep.c | 581 +++++++++++++++++++++++++++- sys/amd64/amd64/trap.c | 95 +++++ sys/amd64/amd64/vm_machdep.c | 96 ++++- sys/amd64/ia32/ia32_exception.S | 5 + sys/amd64/ia32/ia32_misc.c | 71 ++++ sys/amd64/ia32/ia32_reg.c | 35 +- sys/amd64/ia32/ia32_signal.c | 87 +++-- sys/amd64/ia32/ia32_sigtramp.S | 4 - sys/amd64/include/asmacros.h | 7 +- sys/amd64/include/frame.h | 11 +- sys/amd64/include/md_var.h | 13 + sys/amd64/include/pcb.h | 6 +- sys/amd64/include/pcpu.h | 12 +- sys/amd64/include/proc.h | 21 + sys/amd64/include/segments.h | 3 + sys/amd64/include/sysarch.h | 9 + sys/amd64/linux32/linux32_locore.s | 4 - sys/amd64/linux32/linux32_machdep.c | 9 +- sys/amd64/linux32/linux32_sysvec.c | 70 ++-- sys/conf/files.amd64 | 1 + 29 files changed, 1336 insertions(+), 352 deletions(-) create mode 100644 sys/amd64/ia32/ia32_misc.c diff --git a/sys/amd64/acpica/acpi_switch.S b/sys/amd64/acpica/acpi_switch.S index d4f732a6eb2b..0b262925d2f6 100644 --- a/sys/amd64/acpica/acpi_switch.S +++ b/sys/amd64/acpica/acpi_switch.S @@ -64,12 +64,17 @@ ENTRY(acpi_restorecpu) /* Fetch PCB. */ movq WAKEUP_CTX(xpcb), %r11 - /* Restore segment registers. */ - mov WAKEUP_PCB(DS), %ds - mov WAKEUP_PCB(ES), %es - mov WAKEUP_XPCB(SS), %ss - mov WAKEUP_PCB(FS), %fs - mov WAKEUP_PCB(GS), %gs + /* Force kernel segment registers. 
*/ + movl $KDSEL, %eax + movw %ax, %ds + movl $KDSEL, %eax + movw %ax, %es + movl $KDSEL, %eax + movw %ax, %ss + movl $KUF32SEL, %eax + movw %ax, %fs + movl $KUG32SEL, %eax + movw %ax, %gs movl $MSR_FSBASE, %ecx movl WAKEUP_PCB(FSBASE), %eax diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 0306bb37ea42..cebafc8370e8 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -219,9 +219,7 @@ IDTVEC(cpustop) movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ call cpustop_handler - - POP_FRAME - iretq + jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. @@ -251,6 +249,5 @@ IDTVEC(rendezvous) call smp_rendezvous_action movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ - POP_FRAME /* Why not doreti? */ - iretq + jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index 0c59703f921b..6fc829099b4a 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -75,8 +75,6 @@ ENTRY(cpu_throw) 1: movq TD_PCB(%rdi),%r8 /* Old pcb */ movl PCPU(CPUID), %eax - movq PCB_FSBASE(%r8),%r9 - movq PCB_GSBASE(%r8),%r10 /* release bit from old pm_active */ movq TD_PROC(%rdi), %rdx /* oldtd->td_proc */ movq P_VMSPACE(%rdx), %rdx /* proc->p_vmspace */ @@ -110,28 +108,6 @@ ENTRY(cpu_switch) movq %rbx,PCB_RBX(%r8) movq %rax,PCB_RIP(%r8) - /* - * Reread fs and gs bases. Explicit fs segment register load - * by the usermode code may change actual fs base without - * updating pcb_{fs,gs}base. - * - * %rdx still contains the mtx, save %rdx around rdmsr. 
- */ - movq %rdx,%r11 - movl $MSR_FSBASE,%ecx - rdmsr - shlq $32,%rdx - leaq (%rax,%rdx),%r9 - movl $MSR_KGSBASE,%ecx - rdmsr - shlq $32,%rdx - leaq (%rax,%rdx),%r10 - movq %r11,%rdx - - testl $PCB_32BIT,PCB_FLAGS(%r8) - jnz store_seg -done_store_seg: - testl $PCB_DBREGS,PCB_FLAGS(%r8) jnz store_dr /* static predict not taken */ done_store_dr: @@ -192,36 +168,47 @@ sw1: testl $TDP_KTHREAD,TD_PFLAGS(%rsi) jnz do_kthread - testl $PCB_32BIT,PCB_FLAGS(%r8) - jnz load_seg -done_load_seg: + /* + * Load ldt register + */ + movq TD_PROC(%rsi),%rcx + cmpq $0, P_MD+MD_LDT(%rcx) + jne do_ldt + xorl %eax,%eax +ld_ldt: lldt %ax - cmpq PCB_FSBASE(%r8),%r9 - jz 1f - /* Restore userland %fs */ -restore_fsbase: - movl $MSR_FSBASE,%ecx + /* Restore fs base in GDT */ movl PCB_FSBASE(%r8),%eax - movl PCB_FSBASE+4(%r8),%edx - wrmsr -1: - cmpq PCB_GSBASE(%r8),%r10 - jz 2f - /* Restore userland %gs */ - movl $MSR_KGSBASE,%ecx - movl PCB_GSBASE(%r8),%eax - movl PCB_GSBASE+4(%r8),%edx - wrmsr -2: + movq PCPU(FS32P),%rdx + movw %ax,2(%rdx) + shrl $16,%eax + movb %al,4(%rdx) + shrl $8,%eax + movb %al,7(%rdx) -do_tss: + /* Restore gs base in GDT */ + movl PCB_GSBASE(%r8),%eax + movq PCPU(GS32P),%rdx + movw %ax,2(%rdx) + shrl $16,%eax + movb %al,4(%rdx) + shrl $8,%eax + movb %al,7(%rdx) + +do_kthread: + /* Do we need to reload tss ? */ + movq PCPU(TSSP),%rax + movq PCB_TSSP(%r8),%rdx + testq %rdx,%rdx + cmovzq PCPU(COMMONTSSP),%rdx + cmpq %rax,%rdx + jne do_tss +done_tss: + movq %r8,PCPU(RSP0) + movq %r8,PCPU(CURPCB) /* Update the TSS_RSP0 pointer for the next interrupt */ - movq PCPU(TSSP), %rax - movq %r8, PCPU(RSP0) - movq %r8, PCPU(CURPCB) - addq $COMMON_TSS_RSP0, %rax - movq %rsi, PCPU(CURTHREAD) /* into next thread */ - movq %r8, (%rax) + movq %r8,COMMON_TSS_RSP0(%rdx) + movq %rsi,PCPU(CURTHREAD) /* into next thread */ /* Test if debug registers should be restored. 
*/ testl $PCB_DBREGS,PCB_FLAGS(%r8) @@ -250,45 +237,6 @@ done_load_dr: * We use jumps rather than call in order to avoid the stack. */ -do_kthread: - /* - * Copy old fs/gsbase to new kthread pcb for future switches - * This maintains curpcb->pcb_[fg]sbase as caches of the MSR - */ - movq %r9,PCB_FSBASE(%r8) - movq %r10,PCB_GSBASE(%r8) - jmp do_tss - -store_seg: - mov %gs,PCB_GS(%r8) - testl $PCB_GS32BIT,PCB_FLAGS(%r8) - jnz 2f -1: mov %ds,PCB_DS(%r8) - mov %es,PCB_ES(%r8) - mov %fs,PCB_FS(%r8) - jmp done_store_seg -2: movq PCPU(GS32P),%rax - movq (%rax),%rax - movq %rax,PCB_GS32SD(%r8) - jmp 1b - -load_seg: - testl $PCB_GS32BIT,PCB_FLAGS(%r8) - jnz 2f -1: movl $MSR_GSBASE,%ecx - rdmsr - mov PCB_GS(%r8),%gs - wrmsr - mov PCB_DS(%r8),%ds - mov PCB_ES(%r8),%es - mov PCB_FS(%r8),%fs - jmp restore_fsbase - /* Restore userland %gs while preserving kernel gsbase */ -2: movq PCPU(GS32P),%rax - movq PCB_GS32SD(%r8),%rcx - movq %rcx,(%rax) - jmp 1b - store_dr: movq %dr7,%rax /* yes, do the save */ movq %dr0,%r15 @@ -325,6 +273,29 @@ load_dr: movq %r11,%dr6 movq %rax,%dr7 jmp done_load_dr + +do_tss: movq %rdx,PCPU(TSSP) + movq %rdx,%rcx + movq PCPU(TSS),%rax + movw %rcx,2(%rax) + shrq $16,%rcx + movb %cl,4(%rax) + shrq $8,%rcx + movb %cl,7(%rax) + shrq $8,%rcx + movl %ecx,8(%rax) + movb $0x89,5(%rax) /* unset busy */ + movl $TSSSEL,%eax + ltr %ax + jmp done_tss + +do_ldt: movq PCPU(LDT),%rax + movq P_MD+MD_LDT_SD(%rcx),%rdx + movq %rdx,(%rax) + movq P_MD+MD_LDT_SD+8(%rcx),%rdx + movq %rdx,8(%rax) + movl $LDTSEL,%eax + jmp ld_ldt END(cpu_switch) /* @@ -398,12 +369,6 @@ ENTRY(savectx2) movq (%rsp),%rax movq %rax,PCB_RIP(%r8) - mov %ds,PCB_DS(%r8) - mov %es,PCB_ES(%r8) - mov %ss,XPCB_SS(%r8) - mov %fs,PCB_FS(%r8) - mov %gs,PCB_GS(%r8) - movq %rbx,PCB_RBX(%r8) movq %rsp,PCB_RSP(%r8) movq %rbp,PCB_RBP(%r8) diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c index b2976167bd6b..287c236231f3 100644 --- a/sys/amd64/amd64/db_interface.c +++ 
b/sys/amd64/amd64/db_interface.c @@ -139,7 +139,11 @@ void db_show_mdpcpu(struct pcpu *pc) { -#if 0 - db_printf("currentldt = 0x%x\n", pc->pc_currentldt); -#endif + db_printf("curpmap = %p\n", pc->pc_curpmap); + db_printf("tssp = %p\n", pc->pc_tssp); + db_printf("commontssp = %p\n", pc->pc_commontssp); + db_printf("rsp0 = 0x%lx\n", pc->pc_rsp0); + db_printf("gs32p = %p\n", pc->pc_gs32p); + db_printf("ldt = %p\n", pc->pc_ldt); + db_printf("tss = %p\n", pc->pc_tss); } diff --git a/sys/amd64/amd64/db_trace.c b/sys/amd64/amd64/db_trace.c index 50a5f4d2ca7a..73ffac53def5 100644 --- a/sys/amd64/amd64/db_trace.c +++ b/sys/amd64/amd64/db_trace.c @@ -69,12 +69,10 @@ static db_varfcn_t db_ss; #define DB_OFFSET(x) (db_expr_t *)offsetof(struct trapframe, x) struct db_variable db_regs[] = { { "cs", DB_OFFSET(tf_cs), db_frame }, -#if 0 { "ds", DB_OFFSET(tf_ds), db_frame }, { "es", DB_OFFSET(tf_es), db_frame }, { "fs", DB_OFFSET(tf_fs), db_frame }, { "gs", DB_OFFSET(tf_gs), db_frame }, -#endif { "ss", NULL, db_ss }, { "rax", DB_OFFSET(tf_rax), db_frame }, { "rcx", DB_OFFSET(tf_rcx), db_frame }, @@ -94,7 +92,7 @@ struct db_variable db_regs[] = { { "r15", DB_OFFSET(tf_r15), db_frame }, { "rip", DB_OFFSET(tf_rip), db_frame }, { "rflags", DB_OFFSET(tf_rflags), db_frame }, -#define DB_N_SHOW_REGS 20 /* Don't show registers after here. */ +#define DB_N_SHOW_REGS 24 /* Don't show registers after here. 
*/ { "dr0", NULL, db_dr0 }, { "dr1", NULL, db_dr1 }, { "dr2", NULL, db_dr2 }, @@ -357,7 +355,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, struct thread *td) rbp = tf->tf_rbp; switch (frame_type) { case TRAP: - db_printf("--- trap %#lr", tf->tf_trapno); + db_printf("--- trap %#r", tf->tf_trapno); break; case SYSCALL: db_printf("--- syscall"); diff --git a/sys/amd64/amd64/exception.S b/sys/amd64/amd64/exception.S index 897bfec0915c..daa5c2592988 100644 --- a/sys/amd64/amd64/exception.S +++ b/sys/amd64/amd64/exception.S @@ -42,6 +42,7 @@ #include #include #include +#include #include "assym.s" @@ -99,7 +100,7 @@ MCOUNT_LABEL(btrap) /* Traps that we leave interrupts disabled for.. */ #define TRAP_NOEN(a) \ subq $TF_RIP,%rsp; \ - movq $(a),TF_TRAPNO(%rsp) ; \ + movl $(a),TF_TRAPNO(%rsp) ; \ movq $0,TF_ADDR(%rsp) ; \ movq $0,TF_ERR(%rsp) ; \ jmp alltraps_noen @@ -111,7 +112,7 @@ IDTVEC(bpt) /* Regular traps; The cpu does not supply tf_err for these. */ #define TRAP(a) \ subq $TF_RIP,%rsp; \ - movq $(a),TF_TRAPNO(%rsp) ; \ + movl $(a),TF_TRAPNO(%rsp) ; \ movq $0,TF_ADDR(%rsp) ; \ movq $0,TF_ERR(%rsp) ; \ jmp alltraps @@ -139,7 +140,7 @@ IDTVEC(xmm) /* This group of traps have tf_err already pushed by the cpu */ #define TRAP_ERR(a) \ subq $TF_ERR,%rsp; \ - movq $(a),TF_TRAPNO(%rsp) ; \ + movl $(a),TF_TRAPNO(%rsp) ; \ movq $0,TF_ADDR(%rsp) ; \ jmp alltraps IDTVEC(tss) @@ -164,6 +165,10 @@ alltraps: testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz alltraps_testi /* already running with kernel GS.base */ swapgs + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) alltraps_testi: testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs @@ -185,6 +190,7 @@ alltraps_pushregs_no_rdi: movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) + movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) #ifdef KDTRACE_HOOKS /* @@ -193,7 +199,7 @@ alltraps_pushregs_no_rdi: * interrupt. 
For all other trap types, just handle them in * the usual way. */ - cmpq $T_BPTFLT,TF_TRAPNO(%rsp) + cmpl $T_BPTFLT,TF_TRAPNO(%rsp) jne calltrap /* Check if there is no DTrace hook registered. */ @@ -228,13 +234,17 @@ calltrap: .type alltraps_noen,@function alltraps_noen: testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz alltraps_pushregs /* already running with kernel GS.base */ + jz 1f /* already running with kernel GS.base */ swapgs +1: movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) jmp alltraps_pushregs IDTVEC(dblfault) subq $TF_ERR,%rsp - movq $T_DOUBLEFLT,TF_TRAPNO(%rsp) + movl $T_DOUBLEFLT,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) @@ -252,6 +262,11 @@ IDTVEC(dblfault) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + movl $TF_HASSEGS,TF_FLAGS(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs @@ -262,7 +277,7 @@ IDTVEC(dblfault) IDTVEC(page) subq $TF_ERR,%rsp - movq $T_PAGEFLT,TF_TRAPNO(%rsp) + movl $T_PAGEFLT,TF_TRAPNO(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs @@ -270,6 +285,10 @@ IDTVEC(page) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rdi sti @@ -283,17 +302,19 @@ IDTVEC(page) */ IDTVEC(prot) subq $TF_ERR,%rsp - movq $T_PROTFLT,TF_TRAPNO(%rsp) + movl $T_PROTFLT,TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ leaq doreti_iret(%rip),%rdi cmpq %rdi,TF_RIP(%rsp) - je 2f /* kernel but with user gsbase!! 
*/ + je 1f /* kernel but with user gsbase!! */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* already running with kernel GS.base */ -2: - swapgs -1: + jz 2f /* already running with kernel GS.base */ +1: swapgs +2: movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rdi sti @@ -316,6 +337,10 @@ IDTVEC(fast_syscall) movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ movq %r11,TF_RSP(%rsp) /* user stack pointer */ + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) sti movq $KUDSEL,TF_SS(%rsp) movq $KUCSEL,TF_CS(%rsp) @@ -333,40 +358,11 @@ IDTVEC(fast_syscall) movq %r13,TF_R13(%rsp) /* C preserved */ movq %r14,TF_R14(%rsp) /* C preserved */ movq %r15,TF_R15(%rsp) /* C preserved */ + movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call syscall movq PCPU(CURPCB),%rax - testq $PCB_FULLCTX,PCB_FLAGS(%rax) - jne 3f -1: /* Check for and handle AST's on return to userland */ - cli - movq PCPU(CURTHREAD),%rax - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) - je 2f - sti - movq %rsp, %rdi - call ast - jmp 1b -2: /* restore preserved registers */ - MEXITCOUNT - movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */ - movq TF_RSI(%rsp),%rsi /* bonus: preserve arg 2 */ - movq TF_RDX(%rsp),%rdx /* return value 2 */ - movq TF_RAX(%rsp),%rax /* return value 1 */ - movq TF_RBX(%rsp),%rbx /* C preserved */ - movq TF_RBP(%rsp),%rbp /* C preserved */ - movq TF_R12(%rsp),%r12 /* C preserved */ - movq TF_R13(%rsp),%r13 /* C preserved */ - movq TF_R14(%rsp),%r14 /* C preserved */ - movq TF_R15(%rsp),%r15 /* C preserved */ - movq TF_RFLAGS(%rsp),%r11 /* original %rflags */ - movq TF_RIP(%rsp),%rcx /* original %rip */ - movq TF_RSP(%rsp),%r9 /* user stack pointer */ - movq %r9,%rsp /* original %rsp */ - swapgs - sysretq -3: /* Requested full 
context restore, use doreti for that */ andq $~PCB_FULLCTX,PCB_FLAGS(%rax) MEXITCOUNT jmp doreti @@ -405,7 +401,7 @@ IDTVEC(fast_syscall32) IDTVEC(nmi) subq $TF_RIP,%rsp - movq $(T_NMI),TF_TRAPNO(%rsp) + movl $(T_NMI),TF_TRAPNO(%rsp) movq $0,TF_ADDR(%rsp) movq $0,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) @@ -423,6 +419,11 @@ IDTVEC(nmi) movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + movl $TF_HASSEGS,TF_FLAGS(%rsp) xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz nmi_fromuserspace @@ -515,9 +516,7 @@ outofnmi: nocallchain: #endif testl %ebx,%ebx - jz nmi_kernelexit - swapgs - jmp nmi_restoreregs + jnz doreti_exit nmi_kernelexit: /* * Put back the preserved MSR_GSBASE value. @@ -633,7 +632,55 @@ doreti_ast: */ doreti_exit: MEXITCOUNT - movq TF_RDI(%rsp),%rdi + movq PCPU(CURTHREAD),%r8 + movq TD_PCB(%r8),%r8 + + /* + * Do not reload segment registers for kernel. + * Since we do not reload segments registers with sane + * values on kernel entry, descriptors referenced by + * segments registers may be not valid. This is fatal + * for the usermode, but is innocent for the kernel. 
+ */ + testb $SEL_RPL_MASK,TF_CS(%rsp) + jz ld_regs + + testl $TF_HASSEGS,TF_FLAGS(%rsp) + je set_segs + +do_segs: + /* Restore %fs and fsbase */ + movw TF_FS(%rsp),%ax + .globl ld_fs +ld_fs: movw %ax,%fs + cmpw $KUF32SEL,%ax + jne 1f + movl $MSR_FSBASE,%ecx + movl PCB_FSBASE(%r8),%eax + movl PCB_FSBASE+4(%r8),%edx + wrmsr +1: + /* Restore %gs and gsbase */ + movw TF_GS(%rsp),%si + pushfq + cli + movl $MSR_GSBASE,%ecx + rdmsr + .globl ld_gs +ld_gs: movw %si,%gs + wrmsr + popfq + cmpw $KUG32SEL,%si + jne 1f + movl $MSR_KGSBASE,%ecx + movl PCB_GSBASE(%r8),%eax + movl PCB_GSBASE+4(%r8),%edx + wrmsr +1: .globl ld_es +ld_es: movw TF_ES(%rsp),%es + .globl ld_ds +ld_ds: movw TF_DS(%rsp),%ds +ld_regs:movq TF_RDI(%rsp),%rdi movq TF_RSI(%rsp),%rsi movq TF_RDX(%rsp),%rdx movq TF_RCX(%rsp),%rcx @@ -657,6 +704,14 @@ doreti_exit: doreti_iret: iretq +set_segs: + movw $KUDSEL,%ax + movw %ax,TF_DS(%rsp) + movw %ax,TF_ES(%rsp) + movw $KUF32SEL,TF_FS(%rsp) + movw $KUG32SEL,TF_GS(%rsp) + jmp do_segs + /* * doreti_iret_fault. 
Alternative return code for * the case where we get a fault in the doreti_exit code @@ -671,7 +726,12 @@ doreti_iret_fault: testl $PSL_I,TF_RFLAGS(%rsp) jz 1f sti -1: movq %rdi,TF_RDI(%rsp) +1: movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + movl $TF_HASSEGS,TF_FLAGS(%rsp) + movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) @@ -686,11 +746,48 @@ doreti_iret_fault: movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movq $T_PROTFLT,TF_TRAPNO(%rsp) + movl $T_PROTFLT,TF_TRAPNO(%rsp) movq $0,TF_ERR(%rsp) /* XXX should be the error code */ movq $0,TF_ADDR(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) jmp calltrap + + ALIGN_TEXT + .globl ds_load_fault +ds_load_fault: + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movzwl TF_DS(%rsp),%edx + movl %edx,TF_ERR(%rsp) + movw $KUDSEL,TF_DS(%rsp) + jmp calltrap + + ALIGN_TEXT + .globl es_load_fault +es_load_fault: + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movzwl TF_ES(%rsp),%edx + movl %edx,TF_ERR(%rsp) + movw $KUDSEL,TF_ES(%rsp) + jmp calltrap + + ALIGN_TEXT + .globl fs_load_fault +fs_load_fault: + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movzwl TF_FS(%rsp),%edx + movl %edx,TF_ERR(%rsp) + movw $KUF32SEL,TF_FS(%rsp) + jmp calltrap + + ALIGN_TEXT + .globl gs_load_fault +gs_load_fault: + popfq + movl $T_PROTFLT,TF_TRAPNO(%rsp) + movzwl TF_GS(%rsp),%edx + movl %edx,TF_ERR(%rsp) + movw $KUG32SEL,TF_GS(%rsp) + jmp calltrap #ifdef HWPMC_HOOKS ENTRY(end_exceptions) #endif diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index 5aa3134855c2..ea3d83407f3a 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -79,6 +79,10 @@ ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(P_MD, offsetof(struct proc, p_md)); +ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); +ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd)); + 
ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); @@ -132,16 +136,13 @@ ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx)); ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip)); ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase)); ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase)); -ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds)); -ASSYM(PCB_ES, offsetof(struct pcb, pcb_es)); -ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs)); -ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); +ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); ASSYM(PCB_GS32BIT, PCB_GS32BIT); @@ -193,7 +194,13 @@ ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags)); ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp)); ASSYM(TF_SS, offsetof(struct trapframe, tf_ss)); +ASSYM(TF_DS, offsetof(struct trapframe, tf_ds)); +ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); +ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); +ASSYM(TF_GS, offsetof(struct trapframe, tf_gs)); +ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags)); ASSYM(TF_SIZE, sizeof(struct trapframe)); +ASSYM(TF_HASSEGS, TF_HASSEGS); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); @@ -215,7 +222,11 @@ ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); +ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p)); ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p)); 
+ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); +ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); +ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); @@ -230,6 +241,10 @@ ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL)); ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL)); ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL)); +ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL)); +ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL)); +ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL)); +ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL)); ASSYM(SEL_RPL_MASK, SEL_RPL_MASK); ASSYM(MSR_GSBASE, MSR_GSBASE); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index cd86789fe292..0ad61347a3aa 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -159,7 +159,7 @@ extern vm_offset_t ksym_start, ksym_end; #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 -int _udatasel, _ucodesel, _ucode32sel; +int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; @@ -192,6 +192,8 @@ struct mtx icu_lock; struct mem_range_softc mem_range_softc; +struct mtx dt_lock; /* lock for GDT and LDT */ + static void cpu_startup(dummy) void *dummy; @@ -278,7 +280,7 @@ cpu_startup(dummy) * Send an interrupt to process. * * Stack is set up to allow sigcode stored - * at top to call routine, followed by kcall + * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user @@ -316,6 +318,8 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); + sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase; + sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase; /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && @@ -370,6 +374,11 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _ufssel; + regs->tf_gs = _ugssel; + regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -401,9 +410,16 @@ sigreturn(td, uap) ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); - if (error != 0) + if (error != 0) { + printf("sigreturn (pid %d): copyin failed\n", p->p_pid); return (error); + } ucp = &uc; + if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { + printf("sigreturn (pid %d): mc_flags %x\n", p->p_pid, + ucp->uc_mcontext.mc_flags); + return (EINVAL); + } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* @@ -420,7 +436,8 @@ sigreturn(td, uap) * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { - printf("sigreturn: rflags = 0x%lx\n", rflags); + printf("sigreturn (pid %d): rflags = 0x%lx\n", p->p_pid, + rflags); return (EINVAL); } @@ -431,7 +448,7 @@ sigreturn(td, uap) */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { - printf("sigreturn: cs = 0x%x\n", cs); + printf("sigreturn (pid %d): cs = 0x%x\n", p->p_pid, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; @@ -442,9 +459,13 @@ sigreturn(td, uap) } ret = set_fpcontext(td, &ucp->uc_mcontext); - if (ret != 0) + if (ret != 0) { + printf("sigreturn (pid %d): set_fpcontext\n", p->p_pid); return (ret); + } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); + td->td_pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; + td->td_pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; PROC_LOCK(p); #if defined(COMPAT_43) @@ -738,22 +759,16 @@ exec_setregs(td, entry, stack, ps_strings) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; 
+ + mtx_lock(&dt_lock); + if (td->td_proc->p_md.md_ldt != NULL) + user_ldt_free(td); + else + mtx_unlock(&dt_lock); - critical_enter(); - wrmsr(MSR_FSBASE, 0); - wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; - critical_exit(); pcb->pcb_flags &= ~(PCB_32BIT | PCB_GS32BIT); - load_ds(_udatasel); - load_es(_udatasel); - load_fs(_udatasel); - load_gs(_udatasel); - pcb->pcb_ds = _udatasel; - pcb->pcb_es = _udatasel; - pcb->pcb_fs = _udatasel; - pcb->pcb_gs = _udatasel; pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; bzero((char *)regs, sizeof(struct trapframe)); @@ -763,6 +778,11 @@ exec_setregs(td, entry, stack, ps_strings) regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _ufssel; + regs->tf_gs = _ugssel; + regs->tf_flags = TF_HASSEGS; /* * Reset the hardware debug registers if they were in use. @@ -1380,12 +1400,12 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) /* * make gdt memory segments */ - gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; - for (x = 0; x < NGDT; x++) { - if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) + if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && + x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } + gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); @@ -1403,6 +1423,10 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); PCPU_SET(tssp, &common_tss[0]); + PCPU_SET(commontssp, &common_tss[0]); + PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); + PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); + PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); /* @@ -1415,6 +1439,7 @@ hammer_time(u_int64_t 
modulep, u_int64_t physfree) */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); + mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) @@ -1503,7 +1528,8 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) common_tss[0].tss_ist2 = (long) np; /* Set the IO permission bitmap (empty due to tss seg limit) */ - common_tss[0].tss_iobase = sizeof(struct amd64tss); + common_tss[0].tss_iobase = sizeof(struct amd64tss) + + IOPAGES * PAGE_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); @@ -1531,10 +1557,12 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); + _ufssel = GSEL(GUFS32_SEL, SEL_UPL); + _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); - load_fs(_udatasel); + load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; @@ -1656,6 +1684,17 @@ fill_regs(struct thread *td, struct reg *regs) regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; + if (tp->tf_flags & TF_HASSEGS) { + regs->r_ds = tp->tf_ds; + regs->r_es = tp->tf_es; + regs->r_fs = tp->tf_fs; + regs->r_gs = tp->tf_gs; + } else { + regs->r_ds = 0; + regs->r_es = 0; + regs->r_fs = 0; + regs->r_gs = 0; + } return (0); } @@ -1689,6 +1728,13 @@ set_regs(struct thread *td, struct reg *regs) tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; + if (0) { /* XXXKIB */ + tp->tf_ds = regs->r_ds; + tp->tf_es = regs->r_es; + tp->tf_fs = regs->r_fs; + tp->tf_gs = regs->r_gs; + tp->tf_flags = TF_HASSEGS; + } td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } @@ -1808,8 +1854,15 @@ get_mcontext(struct thread *td, mcontext_t *mcp, int flags) mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; + mcp->mc_ds = tp->tf_ds; + mcp->mc_es = tp->tf_es; + mcp->mc_fs = tp->tf_fs; + mcp->mc_gs = tp->tf_gs; + 
mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); + mcp->mc_fsbase = td->td_pcb->pcb_fsbase; + mcp->mc_gsbase = td->td_pcb->pcb_gsbase; return (0); } @@ -1827,7 +1880,8 @@ set_mcontext(struct thread *td, const mcontext_t *mcp) int ret; tp = td->td_frame; - if (mcp->mc_len != sizeof(*mcp)) + if (mcp->mc_len != sizeof(*mcp) || + (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); @@ -1853,6 +1907,17 @@ set_mcontext(struct thread *td, const mcontext_t *mcp) tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; + tp->tf_flags = mcp->mc_flags; + if (tp->tf_flags & TF_HASSEGS) { + tp->tf_ds = mcp->mc_ds; + tp->tf_es = mcp->mc_es; + tp->tf_fs = mcp->mc_fs; + tp->tf_gs = mcp->mc_gs; + } + if (mcp->mc_flags & _MC_HASBASES) { + td->td_pcb->pcb_fsbase = mcp->mc_fsbase; + td->td_pcb->pcb_gsbase = mcp->mc_gsbase; + } td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index b7c03d9fb2db..59e3e9b98a41 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -101,8 +101,6 @@ extern pt_entry_t *KPTphys; /* SMP page table page */ extern pt_entry_t *SMPpt; -extern int _udatasel; - struct pcb stoppcbs[MAXCPU]; struct xpcb *stopxpcbs = NULL; @@ -463,7 +461,8 @@ init_secondary(void) /* Init tss */ common_tss[cpu] = common_tss[0]; common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ - common_tss[cpu].tss_iobase = sizeof(struct amd64tss); + common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + + IOPAGES * PAGE_SIZE; common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; /* The NMI stack runs on IST2. 
*/ @@ -472,12 +471,13 @@ init_secondary(void) /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; - ssdtosyssd(&gdt_segs[GPROC0_SEL], - (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); for (x = 0; x < NGDT; x++) { - if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) + if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && + x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]); } + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; ap_gdt.rd_base = (long) &gdt[NGDT * cpu]; lgdt(&ap_gdt); /* does magic intra-segment return */ @@ -491,8 +491,14 @@ init_secondary(void) pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_tssp = &common_tss[cpu]; + pc->pc_commontssp = &common_tss[cpu]; pc->pc_rsp0 = 0; + pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu + + GPROC0_SEL]; + pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL]; pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL]; + pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu + + GUSERLDT_SEL]; /* Save the per-cpu pointer for use by the NMI handler. 
*/ np->np_pcpu = (register_t) pc; @@ -601,7 +607,7 @@ init_secondary(void) load_cr4(rcr4() | CR4_PGE); load_ds(_udatasel); load_es(_udatasel); - load_fs(_udatasel); + load_fs(_ufssel); mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c index 7f022d00bcce..834dd2c87043 100644 --- a/sys/amd64/amd64/sys_machdep.c +++ b/sys/amd64/amd64/sys_machdep.c @@ -36,16 +36,39 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include +#include #include #include -#include -#include -#include +#include #include #include +#include /* for kernel_map */ +#include + +#include +#include +#include +#include +#include +#include #include +#include + +int max_ldt_segment = 1024; +#define LD_PER_PAGE 512 +#define NULL_LDT_BASE ((caddr_t)NULL) + +#ifdef notyet +#ifdef SMP +static void set_user_ldt_rv(struct vmspace *vmsp); +#endif +#endif +static void user_ldt_derefl(struct proc_ldt *pldt); + #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; @@ -53,6 +76,83 @@ struct sysarch_args { }; #endif +int +sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space) +{ + struct i386_ldt_args *largs, la; + struct user_segment_descriptor *lp; + int error = 0; + + /* + * XXXKIB check that the BSM generation code knows to encode + * the op argument. 
+ */ + AUDIT_ARG(cmd, uap->op); + if (uap_space == UIO_USERSPACE) { + error = copyin(uap->parms, &la, sizeof(struct i386_ldt_args)); + if (error != 0) + return (error); + largs = &la; + } else + largs = (struct i386_ldt_args *)uap->parms; + if (largs->num > max_ldt_segment || largs->num <= 0) + return (EINVAL); + + switch (uap->op) { + case I386_GET_LDT: + error = amd64_get_ldt(td, largs); + break; + case I386_SET_LDT: + if (largs->descs != NULL) { + lp = (struct user_segment_descriptor *) + kmem_alloc(kernel_map, largs->num * + sizeof(struct user_segment_descriptor)); + if (lp == NULL) { + error = ENOMEM; + break; + } + error = copyin(largs->descs, lp, largs->num * + sizeof(struct user_segment_descriptor)); + if (error == 0) + error = amd64_set_ldt(td, largs, lp); + kmem_free(kernel_map, (vm_offset_t)lp, largs->num * + sizeof(struct user_segment_descriptor)); + } else { + error = amd64_set_ldt(td, largs, NULL); + } + break; + } + return (error); +} + +void +update_gdt_gsbase(struct thread *td, uint32_t base) +{ + struct user_segment_descriptor *sd; + + if (td != curthread) + return; + critical_enter(); + sd = PCPU_GET(gs32p); + sd->sd_lobase = base & 0xffffff; + sd->sd_hibase = (base >> 24) & 0xff; + critical_exit(); +} + +void +update_gdt_fsbase(struct thread *td, uint32_t base) +{ + struct user_segment_descriptor *sd; + + if (td != curthread) + return; + critical_enter(); + sd = PCPU_GET(fs32p); + sd->sd_lobase = base & 0xffffff; + sd->sd_hibase = (base >> 24) & 0xff; + critical_exit(); +} + int sysarch(td, uap) struct thread *td; @@ -62,8 +162,36 @@ sysarch(td, uap) struct pcb *pcb = curthread->td_pcb; uint32_t i386base; uint64_t a64base; + struct i386_ioperm_args iargs; - switch(uap->op) { + if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT) + return (sysarch_ldt(td, uap, UIO_USERSPACE)); + /* + * XXXKIB check that the BSM generation code knows to encode + * the op argument. 
+ */ + AUDIT_ARG(cmd, uap->op); + switch (uap->op) { + case I386_GET_IOPERM: + case I386_SET_IOPERM: + if ((error = copyin(uap->parms, &iargs, + sizeof(struct i386_ioperm_args))) != 0) + return (error); + break; + default: + break; + } + + switch (uap->op) { + case I386_GET_IOPERM: + error = amd64_get_ioperm(td, &iargs); + if (error == 0) + error = copyout(&iargs, uap->parms, + sizeof(struct i386_ioperm_args)); + break; + case I386_SET_IOPERM: + error = amd64_set_ioperm(td, &iargs); + break; case I386_GET_FSBASE: i386base = pcb->pcb_fsbase; error = copyout(&i386base, uap->parms, sizeof(i386base)); @@ -71,10 +199,9 @@ sysarch(td, uap) case I386_SET_FSBASE: error = copyin(uap->parms, &i386base, sizeof(i386base)); if (!error) { - critical_enter(); - wrmsr(MSR_FSBASE, i386base); pcb->pcb_fsbase = i386base; - critical_exit(); + td->td_frame->tf_fs = _ufssel; + update_gdt_fsbase(td, i386base); } break; case I386_GET_GSBASE: @@ -84,10 +211,9 @@ sysarch(td, uap) case I386_SET_GSBASE: error = copyin(uap->parms, &i386base, sizeof(i386base)); if (!error) { - critical_enter(); - wrmsr(MSR_KGSBASE, i386base); pcb->pcb_gsbase = i386base; - critical_exit(); + td->td_frame->tf_gs = _ugssel; + update_gdt_gsbase(td, i386base); } break; case AMD64_GET_FSBASE: @@ -98,13 +224,10 @@ sysarch(td, uap) error = copyin(uap->parms, &a64base, sizeof(a64base)); if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { - critical_enter(); - wrmsr(MSR_FSBASE, a64base); pcb->pcb_fsbase = a64base; - critical_exit(); - } else { + td->td_frame->tf_fs = _ufssel; + } else error = EINVAL; - } } break; @@ -116,13 +239,10 @@ sysarch(td, uap) error = copyin(uap->parms, &a64base, sizeof(a64base)); if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { - critical_enter(); - wrmsr(MSR_KGSBASE, a64base); pcb->pcb_gsbase = a64base; - critical_exit(); - } else { + td->td_frame->tf_gs = _ugssel; + } else error = EINVAL; - } } break; @@ -132,3 +252,424 @@ sysarch(td, uap) } return (error); } + +int +amd64_set_ioperm(td, uap) + 
struct thread *td; + struct i386_ioperm_args *uap; +{ + int i, error; + char *iomap; + struct amd64tss *tssp; + struct system_segment_descriptor *tss_sd; + u_long *addr; + struct pcb *pcb; + + if ((error = priv_check(td, PRIV_IO)) != 0) + return (error); + if ((error = securelevel_gt(td->td_ucred, 0)) != 0) + return (error); + if (uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY) + return (EINVAL); + + /* + * XXX + * While this is restricted to root, we should probably figure out + * whether any other driver is using this i/o address, so as not to + * cause confusion. This probably requires a global 'usage registry'. + */ + pcb = td->td_pcb; + if (pcb->pcb_tssp == NULL) { + tssp = (struct amd64tss *)kmem_alloc(kernel_map, + ctob(IOPAGES+1)); + if (tssp == NULL) + return (ENOMEM); + iomap = (char *)&tssp[1]; + addr = (u_long *)iomap; + for (i = 0; i < (ctob(IOPAGES) + 1) / sizeof(u_long); i++) + *addr++ = ~0; + critical_enter(); + /* Takes care of tss_rsp0. */ + memcpy(tssp, &common_tss[PCPU_GET(cpuid)], + sizeof(struct amd64tss)); + tssp->tss_iobase = sizeof(*tssp); + pcb->pcb_tssp = tssp; + tss_sd = PCPU_GET(tss); + tss_sd->sd_lobase = (u_long)tssp & 0xffffff; + tss_sd->sd_hibase = ((u_long)tssp >> 24) & 0xfffffffffful; + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + PCPU_SET(tssp, tssp); + critical_exit(); + } else + iomap = (char *)&pcb->pcb_tssp[1]; + for (i = uap->start; i < uap->start + uap->length; i++) { + if (uap->enable) + iomap[i >> 3] &= ~(1 << (i & 7)); + else + iomap[i >> 3] |= (1 << (i & 7)); + } + return (error); +} + +int +amd64_get_ioperm(td, uap) + struct thread *td; + struct i386_ioperm_args *uap; +{ + int i, state; + char *iomap; + + if (uap->start >= IOPAGES * PAGE_SIZE * NBBY) + return (EINVAL); + if (td->td_pcb->pcb_tssp == NULL) { + uap->length = 0; + goto done; + } + + iomap = (char *)&td->td_pcb->pcb_tssp[1]; + + i = uap->start; + state = (iomap[i >> 3] >> (i & 7)) & 1; + uap->enable = !state; + uap->length = 1; + 
+ for (i = uap->start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { + if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) + break; + uap->length++; + } + +done: + return (0); +} + +/* + * Update the GDT entry pointing to the LDT to point to the LDT of the + * current process. + */ +void +set_user_ldt(struct mdproc *mdp) +{ + + critical_enter(); + *PCPU_GET(ldt) = mdp->md_ldt_sd; + lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); + critical_exit(); +} + +#ifdef notyet +#ifdef SMP +static void +set_user_ldt_rv(struct vmspace *vmsp) +{ + struct thread *td; + + td = curthread; + if (vmsp != td->td_proc->p_vmspace) + return; + + set_user_ldt(&td->td_proc->p_md); +} +#endif +#endif + +struct proc_ldt * +user_ldt_alloc(struct proc *p, int force) +{ + struct proc_ldt *pldt, *new_ldt; + struct mdproc *mdp; + struct soft_segment_descriptor sldt; + + mtx_assert(&dt_lock, MA_OWNED); + mdp = &p->p_md; + if (!force && mdp->md_ldt != NULL) + return (mdp->md_ldt); + mtx_unlock(&dt_lock); + new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); + new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, + max_ldt_segment * sizeof(struct user_segment_descriptor)); + if (new_ldt->ldt_base == NULL) { + FREE(new_ldt, M_SUBPROC); + mtx_lock(&dt_lock); + return (NULL); + } + new_ldt->ldt_refcnt = 1; + sldt.ssd_base = (uint64_t)new_ldt->ldt_base; + sldt.ssd_limit = max_ldt_segment * + sizeof(struct user_segment_descriptor) - 1; + sldt.ssd_type = SDT_SYSLDT; + sldt.ssd_dpl = SEL_KPL; + sldt.ssd_p = 1; + sldt.ssd_long = 0; + sldt.ssd_def32 = 0; + sldt.ssd_gran = 0; + mtx_lock(&dt_lock); + pldt = mdp->md_ldt; + if (pldt != NULL && !force) { + kmem_free(kernel_map, (vm_offset_t)new_ldt->ldt_base, + max_ldt_segment * sizeof(struct user_segment_descriptor)); + free(new_ldt, M_SUBPROC); + return (pldt); + } + + mdp->md_ldt = new_ldt; + if (pldt != NULL) { + bcopy(pldt->ldt_base, new_ldt->ldt_base, max_ldt_segment * + sizeof(struct user_segment_descriptor)); + user_ldt_derefl(pldt); + } + ssdtosyssd(&sldt, 
&p->p_md.md_ldt_sd); + if (p == curproc) + set_user_ldt(mdp); + + return (mdp->md_ldt); +} + +void +user_ldt_free(struct thread *td) +{ + struct proc *p = td->td_proc; + struct mdproc *mdp = &p->p_md; + struct proc_ldt *pldt; + + mtx_assert(&dt_lock, MA_OWNED); + if ((pldt = mdp->md_ldt) == NULL) { + mtx_unlock(&dt_lock); + return; + } + + mdp->md_ldt = NULL; + bzero(&mdp->md_ldt_sd, sizeof(mdp->md_ldt_sd)); + if (td == curthread) + lldt(GSEL(GNULL_SEL, SEL_KPL)); + user_ldt_deref(pldt); +} + +static void +user_ldt_derefl(struct proc_ldt *pldt) +{ + + if (--pldt->ldt_refcnt == 0) { + kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base, + max_ldt_segment * sizeof(struct user_segment_descriptor)); + free(pldt, M_SUBPROC); + } +} + +void +user_ldt_deref(struct proc_ldt *pldt) +{ + + mtx_assert(&dt_lock, MA_OWNED); + user_ldt_derefl(pldt); + mtx_unlock(&dt_lock); +} + +/* + * Note for the authors of compat layers (linux, etc): copyout() in + * the function below is not a problem since it presents data in + * arch-specific format (i.e. i386-specific in this case), not in + * the OS-specific one. 
+ */ +int +amd64_get_ldt(td, uap) + struct thread *td; + struct i386_ldt_args *uap; +{ + int error = 0; + struct proc_ldt *pldt; + int num; + struct user_segment_descriptor *lp; + +#ifdef DEBUG + printf("amd64_get_ldt: start=%d num=%d descs=%p\n", + uap->start, uap->num, (void *)uap->descs); +#endif + + if ((pldt = td->td_proc->p_md.md_ldt) != NULL) { + lp = &((struct user_segment_descriptor *)(pldt->ldt_base)) + [uap->start]; + num = min(uap->num, max_ldt_segment); + } else + return (EINVAL); + + if ((uap->start > (unsigned int)max_ldt_segment) || + ((unsigned int)num > (unsigned int)max_ldt_segment) || + ((unsigned int)(uap->start + num) > (unsigned int)max_ldt_segment)) + return(EINVAL); + + error = copyout(lp, uap->descs, num * + sizeof(struct user_segment_descriptor)); + if (!error) + td->td_retval[0] = num; + + return(error); +} + +int +amd64_set_ldt(td, uap, descs) + struct thread *td; + struct i386_ldt_args *uap; + struct user_segment_descriptor *descs; +{ + int error = 0, i; + int largest_ld; + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt; + struct user_segment_descriptor *dp; + struct proc *p; + +#ifdef DEBUG + printf("amd64_set_ldt: start=%d num=%d descs=%p\n", + uap->start, uap->num, (void *)uap->descs); +#endif + + p = td->td_proc; + if (descs == NULL) { + /* Free descriptors */ + if (uap->start == 0 && uap->num == 0) + uap->num = max_ldt_segment; + if (uap->num <= 0) + return (EINVAL); + if ((pldt = mdp->md_ldt) == NULL || + uap->start >= max_ldt_segment) + return (0); + largest_ld = uap->start + uap->num; + if (largest_ld > max_ldt_segment) + largest_ld = max_ldt_segment; + i = largest_ld - uap->start; + mtx_lock(&dt_lock); + bzero(&((struct user_segment_descriptor *)(pldt->ldt_base)) + [uap->start], sizeof(struct user_segment_descriptor) * i); + mtx_unlock(&dt_lock); + return (0); + } + + if (!(uap->start == LDT_AUTO_ALLOC && uap->num == 1)) { + /* verify range of descriptors to modify */ + largest_ld = uap->start + uap->num; + 
if (uap->start >= max_ldt_segment || + uap->num < 0 || largest_ld > max_ldt_segment) + return (EINVAL); + } + + /* Check descriptors for access violations */ + for (i = 0; i < uap->num; i++) { + dp = &descs[i]; + + switch (dp->sd_type) { + case SDT_SYSNULL: /* system null */ + dp->sd_p = 0; + break; + case SDT_SYS286TSS: + case SDT_SYSLDT: + case SDT_SYS286BSY: + case SDT_SYS286CGT: + case SDT_SYSTASKGT: + case SDT_SYS286IGT: + case SDT_SYS286TGT: + case SDT_SYSNULL2: + case SDT_SYSTSS: + case SDT_SYSNULL3: + case SDT_SYSBSY: + case SDT_SYSCGT: + case SDT_SYSNULL4: + case SDT_SYSIGT: + case SDT_SYSTGT: + /* I can't think of any reason to allow a user proc + * to create a segment of these types. They are + * for OS use only. + */ + return (EACCES); + /*NOTREACHED*/ + + /* memory segment types */ + case SDT_MEMEC: /* memory execute only conforming */ + case SDT_MEMEAC: /* memory execute only accessed conforming */ + case SDT_MEMERC: /* memory execute read conforming */ + case SDT_MEMERAC: /* memory execute read accessed conforming */ + /* Must be "present" if executable and conforming. */ + if (dp->sd_p == 0) + return (EACCES); + break; + case SDT_MEMRO: /* memory read only */ + case SDT_MEMROA: /* memory read only accessed */ + case SDT_MEMRW: /* memory read write */ + case SDT_MEMRWA: /* memory read write accessed */ + case SDT_MEMROD: /* memory read only expand dwn limit */ + case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ + case SDT_MEMRWD: /* memory read write expand dwn limit */ + case SDT_MEMRWDA: /* memory read write expand dwn lim accessed */ + case SDT_MEME: /* memory execute only */ + case SDT_MEMEA: /* memory execute only accessed */ + case SDT_MEMER: /* memory execute read */ + case SDT_MEMERA: /* memory execute read accessed */ + break; + default: + return(EINVAL); + /*NOTREACHED*/ + } + + /* Only user (ring-3) descriptors may be present. 
*/ + if ((dp->sd_p != 0) && (dp->sd_dpl != SEL_UPL)) + return (EACCES); + } + + if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) { + /* Allocate a free slot */ + mtx_lock(&dt_lock); + pldt = user_ldt_alloc(p, 0); + if (pldt == NULL) { + mtx_unlock(&dt_lock); + return (ENOMEM); + } + + /* + * start scanning a bit up to leave room for NVidia and + * Wine, which still use the "Blat" method of allocation. + */ + i = 16; + dp = &((struct user_segment_descriptor *)(pldt->ldt_base))[i]; + for (; i < max_ldt_segment; ++i, ++dp) { + if (dp->sd_type == SDT_SYSNULL) + break; + } + if (i >= max_ldt_segment) { + mtx_unlock(&dt_lock); + return (ENOSPC); + } + uap->start = i; + error = amd64_set_ldt_data(td, i, 1, descs); + mtx_unlock(&dt_lock); + } else { + largest_ld = uap->start + uap->num; + if (largest_ld > max_ldt_segment) + return (EINVAL); + mtx_lock(&dt_lock); + if (user_ldt_alloc(p, 0) != NULL) { + error = amd64_set_ldt_data(td, uap->start, uap->num, + descs); + } + mtx_unlock(&dt_lock); + } + if (error == 0) + td->td_retval[0] = uap->start; + return (error); +} + +int +amd64_set_ldt_data(struct thread *td, int start, int num, + struct user_segment_descriptor *descs) +{ + struct mdproc *mdp = &td->td_proc->p_md; + struct proc_ldt *pldt = mdp->md_ldt; + + mtx_assert(&dt_lock, MA_OWNED); + + /* Fill in range */ + bcopy(descs, + &((struct user_segment_descriptor *)(pldt->ldt_base))[start], + num * sizeof(struct user_segment_descriptor)); + return (0); +} diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index a519414d7452..467feafcee36 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -171,6 +171,52 @@ SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, extern char *syscallnames[]; +/* #define DEBUG 1 */ +#ifdef DEBUG +static void +report_seg_fault(const char *segn, struct trapframe *frame) +{ + struct proc_ldt *pldt; + struct trapframe *pf; + + pldt = curproc->p_md.md_ldt; + printf("%d: %s load fault %lx %p %d\n", + 
curproc->p_pid, segn, frame->tf_err, + pldt != NULL ? pldt->ldt_base : NULL, + pldt != NULL ? pldt->ldt_refcnt : 0); + kdb_backtrace(); + pf = (struct trapframe *)frame->tf_rsp; + printf("rdi %lx\n", pf->tf_rdi); + printf("rsi %lx\n", pf->tf_rsi); + printf("rdx %lx\n", pf->tf_rdx); + printf("rcx %lx\n", pf->tf_rcx); + printf("r8 %lx\n", pf->tf_r8); + printf("r9 %lx\n", pf->tf_r9); + printf("rax %lx\n", pf->tf_rax); + printf("rbx %lx\n", pf->tf_rbx); + printf("rbp %lx\n", pf->tf_rbp); + printf("r10 %lx\n", pf->tf_r10); + printf("r11 %lx\n", pf->tf_r11); + printf("r12 %lx\n", pf->tf_r12); + printf("r13 %lx\n", pf->tf_r13); + printf("r14 %lx\n", pf->tf_r14); + printf("r15 %lx\n", pf->tf_r15); + printf("fs %x\n", pf->tf_fs); + printf("gs %x\n", pf->tf_gs); + printf("es %x\n", pf->tf_es); + printf("ds %x\n", pf->tf_ds); + printf("tno %x\n", pf->tf_trapno); + printf("adr %lx\n", pf->tf_addr); + printf("flg %x\n", pf->tf_flags); + printf("err %lx\n", pf->tf_err); + printf("rip %lx\n", pf->tf_rip); + printf("cs %lx\n", pf->tf_cs); + printf("rfl %lx\n", pf->tf_rflags); + printf("rsp %lx\n", pf->tf_rsp); + printf("ss %lx\n", pf->tf_ss); +} +#endif + /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry @@ -258,6 +304,9 @@ trap(struct trapframe *frame) */ printf("kernel trap %d with interrupts disabled\n", type); +#ifdef DEBUG + report_seg_fault("hlt", frame); +#endif /* * We shouldn't enable interrupts while holding a * spin lock or servicing an NMI. 
@@ -470,6 +519,38 @@ trap(struct trapframe *frame) frame->tf_rip = (long)doreti_iret_fault; goto out; } + if (frame->tf_rip == (long)ld_ds) { +#ifdef DEBUG + report_seg_fault("ds", frame); +#endif + frame->tf_rip = (long)ds_load_fault; + frame->tf_ds = _udatasel; + goto out; + } + if (frame->tf_rip == (long)ld_es) { +#ifdef DEBUG + report_seg_fault("es", frame); +#endif + frame->tf_rip = (long)es_load_fault; + frame->tf_es = _udatasel; + goto out; + } + if (frame->tf_rip == (long)ld_fs) { +#ifdef DEBUG + report_seg_fault("fs", frame); +#endif + frame->tf_rip = (long)fs_load_fault; + frame->tf_fs = _ufssel; + goto out; + } + if (frame->tf_rip == (long)ld_gs) { +#ifdef DEBUG + report_seg_fault("gs", frame); +#endif + frame->tf_rip = (long)gs_load_fault; + frame->tf_gs = _ugssel; + goto out; + } if (PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; @@ -564,6 +645,9 @@ trap(struct trapframe *frame) trapsignal(td, &ksi); #ifdef DEBUG +{ + register_t rg,rgk, rf; + if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); @@ -571,6 +655,17 @@ trap(struct trapframe *frame) uprintf(", fault VA = 0x%lx", frame->tf_addr); uprintf("\n"); } + rf = rdmsr(0xc0000100); + rg = rdmsr(0xc0000101); + rgk = rdmsr(0xc0000102); + uprintf("pid %d TRAP %d rip %lx err %lx addr %lx cs %lx ss %lx ds %x " + "es %x fs %x fsbase %lx %lx gs %x gsbase %lx %lx %lx\n", + curproc->p_pid, type, frame->tf_rip, frame->tf_err, + frame->tf_addr, + frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, + frame->tf_fs, td->td_pcb->pcb_fsbase, rf, + frame->tf_gs, td->td_pcb->pcb_gsbase, rg, rgk); +} #endif user: diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index f0003ee67969..928be345d2c7 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -102,12 +103,24 @@ cpu_fork(td1, p2, td2, flags) { 
register struct proc *p1; struct pcb *pcb2; - struct mdproc *mdp2; + struct mdproc *mdp1, *mdp2; + struct proc_ldt *pldt; pmap_t pmap2; p1 = td1->td_proc; - if ((flags & RFPROC) == 0) + if ((flags & RFPROC) == 0) { + if ((flags & RFMEM) == 0) { + /* unshare user LDT */ + mdp1 = &p1->p_md; + mtx_lock(&dt_lock); + if ((pldt = mdp1->md_ldt) != NULL && + pldt->ldt_refcnt > 1 && + user_ldt_alloc(p1, 1) == NULL) + panic("could not copy LDT"); + mtx_unlock(&dt_lock); + } return; + } /* Ensure that p1's pcb is up to date. */ fpuexit(td1); @@ -170,6 +183,32 @@ cpu_fork(td1, p2, td2, flags) td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; + /* As on i386, do not copy io permission bitmap. */ + pcb2->pcb_tssp = NULL; + + /* Copy the LDT, if necessary. */ + mdp1 = &td1->td_proc->p_md; + mdp2 = &p2->p_md; + mtx_lock(&dt_lock); + if (mdp1->md_ldt != NULL) { + if (flags & RFMEM) { + mdp1->md_ldt->ldt_refcnt++; + mdp2->md_ldt = mdp1->md_ldt; + bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct + system_segment_descriptor)); + } else { + mdp2->md_ldt = NULL; + mdp2->md_ldt = user_ldt_alloc(p2, 0); + if (mdp2->md_ldt == NULL) + panic("could not copy LDT"); + amd64_set_ldt_data(td2, 0, max_ldt_segment, + (struct user_segment_descriptor *) + mdp1->md_ldt->ldt_base); + } + } else + mdp2->md_ldt = NULL; + mtx_unlock(&dt_lock); + /* * Now, cpu_switch() can schedule the new process. * pcb_rsp is loaded pointing to the cpu_switch() stack frame @@ -204,25 +243,49 @@ cpu_set_fork_handler(td, func, arg) void cpu_exit(struct thread *td) { + + /* + * If this process has a custom LDT, release it. + */ + mtx_lock(&dt_lock); + if (td->td_proc->p_md.md_ldt != 0) + user_ldt_free(td); + else + mtx_unlock(&dt_lock); } void cpu_thread_exit(struct thread *td) { + struct pcb *pcb; if (td == PCPU_GET(fpcurthread)) fpudrop(); + pcb = td->td_pcb; + /* Disable any hardware breakpoints. 
*/ - if (td->td_pcb->pcb_flags & PCB_DBREGS) { + if (pcb->pcb_flags & PCB_DBREGS) { reset_dbregs(); - td->td_pcb->pcb_flags &= ~PCB_DBREGS; + pcb->pcb_flags &= ~PCB_DBREGS; } } void cpu_thread_clean(struct thread *td) { + struct pcb *pcb; + + pcb = td->td_pcb; + + /* + * Clean TSS/iomap + */ + if (pcb->pcb_tssp != NULL) { + kmem_free(kernel_map, (vm_offset_t)pcb->pcb_tssp, + ctob(IOPAGES + 1)); + pcb->pcb_tssp = NULL; + } } void @@ -247,6 +310,8 @@ cpu_thread_alloc(struct thread *td) void cpu_thread_free(struct thread *td) { + + cpu_thread_clean(td); } /* @@ -358,6 +423,11 @@ cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg, ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f; td->td_frame->tf_rsp -= 8; td->td_frame->tf_rip = (register_t)entry; + td->td_frame->tf_ds = _udatasel; + td->td_frame->tf_es = _udatasel; + td->td_frame->tf_fs = _ufssel; + td->td_frame->tf_gs = _ugssel; + td->td_frame->tf_flags = TF_HASSEGS; /* * Pass the address of the mailbox for this kse to the uts @@ -375,25 +445,11 @@ cpu_set_user_tls(struct thread *td, void *tls_base) #ifdef COMPAT_IA32 if (td->td_proc->p_sysent->sv_flags & SV_ILP32) { - if (td == curthread) { - critical_enter(); - td->td_pcb->pcb_gsbase = (register_t)tls_base; - wrmsr(MSR_KGSBASE, td->td_pcb->pcb_gsbase); - critical_exit(); - } else { - td->td_pcb->pcb_gsbase = (register_t)tls_base; - } + td->td_pcb->pcb_gsbase = (register_t)tls_base; return (0); } #endif - if (td == curthread) { - critical_enter(); - td->td_pcb->pcb_fsbase = (register_t)tls_base; - wrmsr(MSR_FSBASE, td->td_pcb->pcb_fsbase); - critical_exit(); - } else { - td->td_pcb->pcb_fsbase = (register_t)tls_base; - } + td->td_pcb->pcb_fsbase = (register_t)tls_base; return (0); } diff --git a/sys/amd64/ia32/ia32_exception.S b/sys/amd64/ia32/ia32_exception.S index 4820f5399628..76c5d5ac5873 100644 --- a/sys/amd64/ia32/ia32_exception.S +++ b/sys/amd64/ia32/ia32_exception.S @@ -60,6 +60,11 @@ IDTVEC(int0x80_syscall) movq %r13,TF_R13(%rsp) movq 
%r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + movl $TF_HASSEGS,TF_FLAGS(%rsp) FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call ia32_syscall diff --git a/sys/amd64/ia32/ia32_misc.c b/sys/amd64/ia32/ia32_misc.c new file mode 100644 index 000000000000..2fa197219010 --- /dev/null +++ b/sys/amd64/ia32/ia32_misc.c @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 2009 Konstantin Belousov + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +int +freebsd32_sysarch(struct thread *td, struct freebsd32_sysarch_args *uap) +{ + struct sysarch_args uap1; + struct i386_ldt_args uapl; + struct i386_ldt_args32 uapl32; + int error; + + if (uap->op == I386_SET_LDT || uap->op == I386_GET_LDT) { + if ((error = copyin(uap->parms, &uapl32, sizeof(uapl32))) != 0) + return (error); + uap1.op = uap->op; + uap1.parms = (char *)&uapl; + uapl.start = uapl32.start; + uapl.descs = (struct user_segment_descriptor *)(uintptr_t) + uapl32.descs; + uapl.num = uapl32.num; + return (sysarch_ldt(td, &uap1, UIO_SYSSPACE)); + } else { + uap1.op = uap->op; + uap1.parms = uap->parms; + return (sysarch(td, &uap1)); + } +} diff --git a/sys/amd64/ia32/ia32_reg.c b/sys/amd64/ia32/ia32_reg.c index 8abc6fc6d42a..49dd4e26409a 100644 --- a/sys/amd64/ia32/ia32_reg.c +++ b/sys/amd64/ia32/ia32_reg.c @@ -85,9 +85,17 @@ fill_regs32(struct thread *td, struct reg32 *regs) tp = td->td_frame; pcb = td->td_pcb; - regs->r_fs = pcb->pcb_fs; - regs->r_es = pcb->pcb_es; - regs->r_ds = pcb->pcb_ds; + if (tp->tf_flags & TF_HASSEGS) { + regs->r_gs = tp->tf_gs; + regs->r_fs = tp->tf_fs; + regs->r_es = tp->tf_es; + regs->r_ds = tp->tf_ds; + } else { + regs->r_gs = _ugssel; + regs->r_fs = _ufssel; + regs->r_es = _udatasel; + regs->r_ds = _udatasel; + } regs->r_edi = tp->tf_rdi; regs->r_esi = tp->tf_rsi; regs->r_ebp = tp->tf_rbp; @@ -100,7 +108,6 @@ fill_regs32(struct thread *td, struct reg32 *regs) regs->r_eflags = tp->tf_rflags; regs->r_esp = tp->tf_rsp; regs->r_ss = tp->tf_ss; - regs->r_gs = pcb->pcb_gs; return (0); } @@ -114,14 +121,11 @@ set_regs32(struct thread *td, struct reg32 *regs) if (!EFL_SECURE(regs->r_eflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; -#if 0 - load_fs(regs->r_fs); - pcb->pcb_fs = regs->r_fs; - 
load_es(regs->r_es); - pcb->pcb_es = regs->r_es; - load_ds(regs->r_ds); - pcb->pcb_ds = regs->r_ds; -#endif + tp->tf_gs = regs->r_gs; + tp->tf_fs = regs->r_fs; + tp->tf_es = regs->r_es; + tp->tf_ds = regs->r_ds; + tp->tf_flags = TF_HASSEGS; tp->tf_rdi = regs->r_edi; tp->tf_rsi = regs->r_esi; tp->tf_rbp = regs->r_ebp; @@ -134,10 +138,6 @@ set_regs32(struct thread *td, struct reg32 *regs) tp->tf_rflags = regs->r_eflags; tp->tf_rsp = regs->r_esp; tp->tf_ss = regs->r_ss; -#if 0 - load_gs(regs->r_gs); - pcb->pcb_gs = regs->r_gs; -#endif return (0); } @@ -166,7 +166,8 @@ fill_fpregs32(struct thread *td, struct fpreg32 *regs) penv_87->en_fcs = td->td_frame->tf_cs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_rdp; - penv_87->en_fos = td->td_pcb->pcb_ds; + /* Entry into the kernel always sets TF_HASSEGS */ + penv_87->en_fos = td->td_frame->tf_ds; /* FPU registers */ for (i = 0; i < 8; ++i) diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 019fabac5212..37e8013a416e 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -85,8 +85,6 @@ static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *); static void ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp); static int ia32_set_fpcontext(struct thread *td, const struct ia32_mcontext *mcp); -extern int _ucode32sel, _udatasel; - #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) @@ -134,10 +132,11 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); - mcp->mc_gs = td->td_pcb->pcb_gs; - mcp->mc_fs = td->td_pcb->pcb_fs; - mcp->mc_es = td->td_pcb->pcb_es; - mcp->mc_ds = td->td_pcb->pcb_ds; + /* Entry into kernel always sets TF_HASSEGS */ + mcp->mc_gs = tp->tf_gs; + mcp->mc_fs = tp->tf_fs; + mcp->mc_es = tp->tf_es; + mcp->mc_ds = tp->tf_ds; 
mcp->mc_edi = tp->tf_rdi; mcp->mc_esi = tp->tf_rsi; mcp->mc_ebp = tp->tf_rbp; @@ -158,6 +157,8 @@ ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags) mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); ia32_get_fpcontext(td, mcp); + mcp->mc_fsbase = td->td_pcb->pcb_fsbase; + mcp->mc_gsbase = td->td_pcb->pcb_gsbase; return (0); } @@ -182,11 +183,11 @@ ia32_set_mcontext(struct thread *td, const struct ia32_mcontext *mcp) ret = ia32_set_fpcontext(td, mcp); if (ret != 0) return (ret); -#if 0 /* XXX deal with load_fs() and friends */ + tp->tf_gs = mcp->mc_gs; tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; -#endif + tp->tf_flags = TF_HASSEGS; tp->tf_rdi = mcp->mc_edi; tp->tf_rsi = mcp->mc_esi; tp->tf_rbp = mcp->mc_ebp; @@ -199,9 +200,6 @@ ia32_set_mcontext(struct thread *td, const struct ia32_mcontext *mcp) tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; -#if 0 /* XXX deal with load_gs() and friends */ - td->td_pcb->pcb_gs = mcp->mc_gs; -#endif td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } @@ -326,10 +324,6 @@ freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; - sf.sf_uc.uc_mcontext.mc_gs = rgs(); - sf.sf_uc.uc_mcontext.mc_fs = rfs(); - __asm __volatile("mov %%es,%0" : "=rm" (sf.sf_uc.uc_mcontext.mc_es)); - __asm __volatile("mov %%ds,%0" : "=rm" (sf.sf_uc.uc_mcontext.mc_ds)); sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; @@ -345,6 +339,10 @@ freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; + sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; + sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; + sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; + sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && @@ -394,10 +392,8 @@ freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; - load_ds(_udatasel); - td->td_pcb->pcb_ds = _udatasel; - load_es(_udatasel); - td->td_pcb->pcb_es = _udatasel; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); @@ -441,10 +437,6 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; - sf.sf_uc.uc_mcontext.mc_gs = rgs(); - sf.sf_uc.uc_mcontext.mc_fs = rfs(); - __asm __volatile("mov %%es,%0" : "=rm" (sf.sf_uc.uc_mcontext.mc_es)); - __asm __volatile("mov %%ds,%0" : "=rm" (sf.sf_uc.uc_mcontext.mc_ds)); sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; @@ -460,9 +452,15 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; + sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; + sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; + sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; + sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); + sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase; + sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase; /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && @@ -514,11 +512,9 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; - load_ds(_udatasel); - td->td_pcb->pcb_ds = _udatasel; - load_es(_udatasel); - td->td_pcb->pcb_es = _udatasel; - /* leave user %fs and %gs untouched */ + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + /* XXXKIB leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -591,7 +587,6 @@ freebsd4_freebsd32_sigreturn(td, uap) return (EINVAL); } - /* Segment selectors restored by sigtramp.S */ regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; @@ -606,6 +601,10 @@ freebsd4_freebsd32_sigreturn(td, uap) regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; + regs->tf_ds = ucp->uc_mcontext.mc_ds; + regs->tf_es = ucp->uc_mcontext.mc_es; + regs->tf_fs = ucp->uc_mcontext.mc_fs; + regs->tf_gs = ucp->uc_mcontext.mc_gs; PROC_LOCK(p); td->td_sigmask = ucp->uc_sigmask; @@ -678,7 +677,6 @@ freebsd32_sigreturn(td, uap) if (ret != 0) return (ret); - /* Segment selectors restored by sigtramp.S */ regs->tf_rdi = ucp->uc_mcontext.mc_edi; regs->tf_rsi = ucp->uc_mcontext.mc_esi; regs->tf_rbp = ucp->uc_mcontext.mc_ebp; @@ -693,6 +691,11 @@ freebsd32_sigreturn(td, uap) regs->tf_rflags = ucp->uc_mcontext.mc_eflags; regs->tf_rsp = ucp->uc_mcontext.mc_esp; regs->tf_ss = ucp->uc_mcontext.mc_ss; + regs->tf_ds = ucp->uc_mcontext.mc_ds; + regs->tf_es = ucp->uc_mcontext.mc_es; + regs->tf_fs = ucp->uc_mcontext.mc_fs; + regs->tf_gs = ucp->uc_mcontext.mc_gs; + regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); td->td_sigmask = ucp->uc_sigmask; @@ -715,20 +718,14 @@ ia32_setregs(td, entry, stack, ps_strings) struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; - critical_enter(); - wrmsr(MSR_FSBASE, 0); 
- wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ + mtx_lock(&dt_lock); + if (td->td_proc->p_md.md_ldt != NULL) + user_ldt_free(td); + else + mtx_unlock(&dt_lock); + pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; - critical_exit(); - load_ds(_udatasel); - load_es(_udatasel); - load_fs(_udatasel); - load_gs(_udatasel); - pcb->pcb_ds = _udatasel; - pcb->pcb_es = _udatasel; - pcb->pcb_fs = _udatasel; - pcb->pcb_gs = _udatasel; pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__; bzero((char *)regs, sizeof(struct trapframe)); @@ -738,6 +735,12 @@ ia32_setregs(td, entry, stack, ps_strings) regs->tf_ss = _udatasel; regs->tf_cs = _ucode32sel; regs->tf_rbx = ps_strings; + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _ufssel; + regs->tf_gs = _ugssel; + regs->tf_flags = TF_HASSEGS; + load_cr0(rcr0() | CR0_MP | CR0_TS); fpstate_drop(td); diff --git a/sys/amd64/ia32/ia32_sigtramp.S b/sys/amd64/ia32/ia32_sigtramp.S index 1cd220a5f1cc..945516986600 100644 --- a/sys/amd64/ia32/ia32_sigtramp.S +++ b/sys/amd64/ia32/ia32_sigtramp.S @@ -45,8 +45,6 @@ ia32_sigcode: calll *IA32_SIGF_HANDLER(%esp) leal IA32_SIGF_UC(%esp),%eax /* get ucontext */ pushl %eax - mov IA32_UC_ES(%eax),%es /* restore %es */ - mov IA32_UC_DS(%eax),%ds /* restore %ds */ movl $SYS_sigreturn,%eax pushl %eax /* junk to fake return addr. */ int $0x80 /* enter kernel with args */ @@ -60,8 +58,6 @@ freebsd4_ia32_sigcode: calll *IA32_SIGF_HANDLER(%esp) leal IA32_SIGF_UC4(%esp),%eax/* get ucontext */ pushl %eax - mov IA32_UC4_ES(%eax),%es /* restore %es */ - mov IA32_UC4_DS(%eax),%ds /* restore %ds */ movl $344,%eax /* 4.x SYS_sigreturn */ pushl %eax /* junk to fake return addr. 
*/ int $0x80 /* enter kernel with args */ diff --git a/sys/amd64/include/asmacros.h b/sys/amd64/include/asmacros.h index 788f39f44bc5..0bf0029ef105 100644 --- a/sys/amd64/include/asmacros.h +++ b/sys/amd64/include/asmacros.h @@ -161,7 +161,12 @@ movq %r12,TF_R12(%rsp) ; \ movq %r13,TF_R13(%rsp) ; \ movq %r14,TF_R14(%rsp) ; \ - movq %r15,TF_R15(%rsp) + movq %r15,TF_R15(%rsp) ; \ + movw %fs,TF_FS(%rsp) ; \ + movw %gs,TF_GS(%rsp) ; \ + movw %es,TF_ES(%rsp) ; \ + movw %ds,TF_DS(%rsp) ; \ + movl $TF_HASSEGS,TF_FLAGS(%rsp) #define POP_FRAME \ movq TF_RDI(%rsp),%rdi ; \ diff --git a/sys/amd64/include/frame.h b/sys/amd64/include/frame.h index 26c9dd06d33d..12722a48585f 100644 --- a/sys/amd64/include/frame.h +++ b/sys/amd64/include/frame.h @@ -64,9 +64,13 @@ struct trapframe { register_t tf_r13; register_t tf_r14; register_t tf_r15; - register_t tf_trapno; + uint32_t tf_trapno; + uint16_t tf_fs; + uint16_t tf_gs; register_t tf_addr; - register_t tf_flags; + uint32_t tf_flags; + uint16_t tf_es; + uint16_t tf_ds; /* below portion defined in hardware */ register_t tf_err; register_t tf_rip; @@ -76,4 +80,7 @@ struct trapframe { register_t tf_ss; }; +#define TF_HASSEGS 0x1 +/* #define _MC_HASBASES 0x2 */ + #endif /* _MACHINE_FRAME_H_ */ diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 2125b9fcfdd3..892e19d78ff5 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -60,6 +60,11 @@ extern char sigcode[]; extern int szsigcode; extern uint64_t *vm_page_dump; extern int vm_page_dump_size; +extern int _udatasel; +extern int _ucodesel; +extern int _ucode32sel; +extern int _ufssel; +extern int _ugssel; typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); struct thread; @@ -72,6 +77,14 @@ void busdma_swi(void); void cpu_setregs(void); void doreti_iret(void) __asm(__STRING(doreti_iret)); void doreti_iret_fault(void) __asm(__STRING(doreti_iret_fault)); +void ld_ds(void) __asm(__STRING(ld_ds)); +void ld_es(void) 
__asm(__STRING(ld_es)); +void ld_fs(void) __asm(__STRING(ld_fs)); +void ld_gs(void) __asm(__STRING(ld_gs)); +void ds_load_fault(void) __asm(__STRING(ds_load_fault)); +void es_load_fault(void) __asm(__STRING(es_load_fault)); +void fs_load_fault(void) __asm(__STRING(fs_load_fault)); +void gs_load_fault(void) __asm(__STRING(gs_load_fault)); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void initializecpu(void); diff --git a/sys/amd64/include/pcb.h b/sys/amd64/include/pcb.h index 2e2ca87eb61f..39ca832e0419 100644 --- a/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -62,10 +62,6 @@ struct pcb { #define PCB_32BIT 0x40 /* process has 32 bit context (segs etc) */ #define PCB_FULLCTX 0x80 /* full context restore on sysret */ - u_int32_t pcb_ds; - u_int32_t pcb_es; - u_int32_t pcb_fs; - u_int32_t pcb_gs; u_int64_t pcb_dr0; u_int64_t pcb_dr1; u_int64_t pcb_dr2; @@ -80,6 +76,8 @@ struct pcb { /* 32-bit segment descriptor */ struct user_segment_descriptor pcb_gs32sd; + /* local tss, with i/o bitmap; NULL for common */ + struct amd64tss *pcb_tssp; }; struct xpcb { diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index 23818ca81328..139281a6b478 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -62,12 +62,20 @@ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ - struct amd64tss *pc_tssp; \ + struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ + struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ - struct user_segment_descriptor *pc_gs32p \ + /* Pointer to the CPU %fs descriptor */ \ + struct user_segment_descriptor *pc_fs32p; \ + /* Pointer to the CPU %gs descriptor */ \ + struct user_segment_descriptor *pc_gs32p; \ + /* Pointer to the CPU LDT descriptor */ \ + struct 
system_segment_descriptor *pc_ldt; \ + /* Pointer to the CPU TSS descriptor */ \ + struct system_segment_descriptor *pc_tss PCPU_XEN_FIELDS #ifdef _KERNEL diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index a3ebd796e03d..33d5181d2d53 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -33,6 +33,13 @@ #ifndef _MACHINE_PROC_H_ #define _MACHINE_PROC_H_ +#include + +struct proc_ldt { + caddr_t ldt_base; + int ldt_refcnt; +}; + /* * Machine-dependent part of the proc structure for AMD64. */ @@ -42,6 +49,8 @@ struct mdthread { }; struct mdproc { + struct proc_ldt *md_ldt; /* (t) per-process ldt */ + struct system_segment_descriptor md_ldt_sd; }; #ifdef _KERNEL @@ -55,6 +64,18 @@ struct mdproc { (char *)&td; \ } while (0) +void set_user_ldt(struct mdproc *); +struct proc_ldt *user_ldt_alloc(struct proc *, int); +void user_ldt_free(struct thread *); +void user_ldt_deref(struct proc_ldt *); +struct sysarch_args; +int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space); +int amd64_set_ldt_data(struct thread *td, int start, int num, + struct user_segment_descriptor *descs); + +extern struct mtx dt_lock; +extern int max_ldt_segment; + #endif /* _KERNEL */ #endif /* !_MACHINE_PROC_H_ */ diff --git a/sys/amd64/include/segments.h b/sys/amd64/include/segments.h index cab7554c82c1..3dca80ad5be6 100644 --- a/sys/amd64/include/segments.h +++ b/sys/amd64/include/segments.h @@ -239,6 +239,9 @@ void ssdtosd(struct soft_segment_descriptor *ssdp, struct user_segment_descriptor *sdp); void ssdtosyssd(struct soft_segment_descriptor *ssdp, struct system_segment_descriptor *sdp); +void update_gdt_gsbase(struct thread *td, uint32_t base); +void update_gdt_fsbase(struct thread *td, uint32_t base); + #endif /* _KERNEL */ #endif /* !_MACHINE_SEGMENTS_H_ */ diff --git a/sys/amd64/include/sysarch.h b/sys/amd64/include/sysarch.h index 2b0d0a6847fd..6c3e6c913b74 100644 --- a/sys/amd64/include/sysarch.h +++ b/sys/amd64/include/sysarch.h @@ 
-77,6 +77,15 @@ int amd64_set_fsbase(void *); int amd64_set_gsbase(void *); int sysarch(int, void *); __END_DECLS +#else +struct thread; +union descriptor; + +int amd64_get_ldt(struct thread *, struct i386_ldt_args *); +int amd64_set_ldt(struct thread *, struct i386_ldt_args *, + struct user_segment_descriptor *); +int amd64_get_ioperm(struct thread *, struct i386_ioperm_args *); +int amd64_set_ioperm(struct thread *, struct i386_ioperm_args *); #endif #endif /* !_MACHINE_SYSARCH_H_ */ diff --git a/sys/amd64/linux32/linux32_locore.s b/sys/amd64/linux32/linux32_locore.s index 6045925c9b5b..36e1abf87b83 100644 --- a/sys/amd64/linux32/linux32_locore.s +++ b/sys/amd64/linux32/linux32_locore.s @@ -11,8 +11,6 @@ NON_GPROF_ENTRY(linux_sigcode) call *LINUX_SIGF_HANDLER(%esp) leal LINUX_SIGF_SC(%esp),%ebx /* linux scp */ - mov LINUX_SC_ES(%ebx),%es - mov LINUX_SC_DS(%ebx),%ds movl %esp, %ebx /* pass sigframe */ push %eax /* fake ret addr */ movl $LINUX_SYS_linux_sigreturn,%eax /* linux_sigreturn() */ @@ -24,8 +22,6 @@ linux_rt_sigcode: call *LINUX_RT_SIGF_HANDLER(%esp) leal LINUX_RT_SIGF_UC(%esp),%ebx /* linux ucp */ leal LINUX_RT_SIGF_SC(%ebx),%ecx /* linux sigcontext */ - mov LINUX_SC_ES(%ecx),%es - mov LINUX_SC_DS(%ecx),%ds push %eax /* fake ret addr */ movl $LINUX_SYS_linux_rt_sigreturn,%eax /* linux_rt_sigreturn() */ int $0x80 /* enter kernel with args */ diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c index eb91623cfd84..42ea0700f8b6 100644 --- a/sys/amd64/linux32/linux32_machdep.c +++ b/sys/amd64/linux32/linux32_machdep.c @@ -716,8 +716,8 @@ linux_clone(struct thread *td, struct linux_clone_args *args) sd.sd_long, sd.sd_def32, sd.sd_gran); #endif td2->td_pcb->pcb_gsbase = (register_t)info.base_addr; - td2->td_pcb->pcb_gs32sd = sd; - td2->td_pcb->pcb_gs = GSEL(GUGS32_SEL, SEL_UPL); +/* XXXKIB td2->td_pcb->pcb_gs32sd = sd; */ + td2->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL); td2->td_pcb->pcb_flags |= PCB_GS32BIT | PCB_32BIT; } } 
@@ -1359,12 +1359,9 @@ linux_set_thread_area(struct thread *td, sd.sd_gran); #endif - critical_enter(); td->td_pcb->pcb_gsbase = (register_t)info.base_addr; - td->td_pcb->pcb_gs32sd = *PCPU_GET(gs32p) = sd; td->td_pcb->pcb_flags |= PCB_32BIT | PCB_GS32BIT; - wrmsr(MSR_KGSBASE, td->td_pcb->pcb_gsbase); - critical_exit(); + update_gdt_gsbase(td, info.base_addr); return (0); } diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c index 3ed65eb4cfb8..925d2e100a2f 100644 --- a/sys/amd64/linux32/linux32_sysvec.c +++ b/sys/amd64/linux32/linux32_sysvec.c @@ -290,7 +290,6 @@ elf_linux_fixup(register_t **stack_base, struct image_params *imgp) return 0; } -extern int _ucodesel, _ucode32sel, _udatasel; extern unsigned long linux_sznonrtsigcode; static void @@ -360,13 +359,7 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); - frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; - frame.sf_sc.uc_mcontext.sc_gs = rgs(); - frame.sf_sc.uc_mcontext.sc_fs = rfs(); - __asm __volatile("mov %%es,%0" : - "=rm" (frame.sf_sc.uc_mcontext.sc_es)); - __asm __volatile("mov %%ds,%0" : - "=rm" (frame.sf_sc.uc_mcontext.sc_ds)); + frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; @@ -376,6 +369,10 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; + frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; + frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; + frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; + frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = 
regs->tf_ss; @@ -413,11 +410,11 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; - load_ds(_udatasel); - td->td_pcb->pcb_ds = _udatasel; - load_es(_udatasel); - td->td_pcb->pcb_es = _udatasel; - /* leave user %fs and %gs untouched */ + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _ufssel; + regs->tf_gs = _ugssel; + regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -495,10 +492,10 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__bits[0]; - frame.sf_sc.sc_gs = rgs(); - frame.sf_sc.sc_fs = rfs(); - __asm __volatile("mov %%es,%0" : "=rm" (frame.sf_sc.sc_es)); - __asm __volatile("mov %%ds,%0" : "=rm" (frame.sf_sc.sc_ds)); + frame.sf_sc.sc_gs = regs->tf_gs; + frame.sf_sc.sc_fs = regs->tf_fs; + frame.sf_sc.sc_es = regs->tf_es; + frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; @@ -535,11 +532,11 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; - load_ds(_udatasel); - td->td_pcb->pcb_ds = _udatasel; - load_es(_udatasel); - td->td_pcb->pcb_es = _udatasel; - /* leave user %fs and %gs untouched */ + regs->tf_ds = _udatasel; + regs->tf_es = _udatasel; + regs->tf_fs = _ufssel; + regs->tf_gs = _ugssel; + regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -624,7 +621,6 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) /* * Restore signal context. */ - /* Selectors were restored by the trampoline. 
*/ regs->tf_rdi = frame.sf_sc.sc_edi; regs->tf_rsi = frame.sf_sc.sc_esi; regs->tf_rbp = frame.sf_sc.sc_ebp; @@ -634,6 +630,10 @@ linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) regs->tf_rax = frame.sf_sc.sc_eax; regs->tf_rip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; + regs->tf_ds = frame.sf_sc.sc_ds; + regs->tf_es = frame.sf_sc.sc_es; + regs->tf_fs = frame.sf_sc.sc_fs; + regs->tf_gs = frame.sf_sc.sc_gs; regs->tf_rflags = eflags; regs->tf_rsp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; @@ -722,7 +722,10 @@ linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) /* * Restore signal context */ - /* Selectors were restored by the trampoline. */ + regs->tf_gs = context->sc_gs; + regs->tf_fs = context->sc_fs; + regs->tf_es = context->sc_es; + regs->tf_ds = context->sc_ds; regs->tf_rdi = context->sc_edi; regs->tf_rsi = context->sc_esi; regs->tf_rbp = context->sc_ebp; @@ -827,27 +830,30 @@ exec_linux_setregs(td, entry, stack, ps_strings) struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; + mtx_lock(&dt_lock); + if (td->td_proc->p_md.md_ldt != NULL) + user_ldt_free(td); + else + mtx_unlock(&dt_lock); + critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); - load_ds(_udatasel); - load_es(_udatasel); - load_fs(_udatasel); - load_gs(_udatasel); - pcb->pcb_ds = _udatasel; - pcb->pcb_es = _udatasel; - pcb->pcb_fs = _udatasel; - pcb->pcb_gs = _udatasel; pcb->pcb_initial_fpucw = __LINUX_NPXCW__; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = entry; regs->tf_rsp = stack; regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); + regs->tf_gs = _ugssel; + regs->tf_fs = _ufssel; + regs->tf_es = _udatasel; + regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; + regs->tf_flags = TF_HASSEGS; regs->tf_cs = _ucode32sel; regs->tf_rbx = ps_strings; load_cr0(rcr0() | CR0_MP | 
CR0_TS); diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index fa9575295c6b..56e17d15712d 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -232,6 +232,7 @@ amd64/ia32/ia32_reg.c optional compat_ia32 amd64/ia32/ia32_signal.c optional compat_ia32 amd64/ia32/ia32_sigtramp.S optional compat_ia32 amd64/ia32/ia32_syscall.c optional compat_ia32 +amd64/ia32/ia32_misc.c optional compat_ia32 compat/freebsd32/freebsd32_ioctl.c optional compat_ia32 compat/freebsd32/freebsd32_misc.c optional compat_ia32 compat/freebsd32/freebsd32_syscalls.c optional compat_ia32