Make WRFSBASE and WRGSBASE instructions functional.

Right now, we enable the CR4.FSGSBASE bit on CPUs which support the
facility (Ivy Bridge and later), to allow usermode to read the fs and
gs bases without syscalls. This bit also controls write access to the
bases from userspace, but the WRFSBASE and WRGSBASE instructions
currently cannot be used, because the return path from both exceptions
and interrupts overwrites the bases with the values from the pcb.

Supporting the instructions is useful because it means that usermode
can implement green threads completely in userspace, switching all of
the machine context without issuing syscalls.

Support is implemented by saving the fs base and user gs base when the
PCB_FULL_IRET flag is set. The flag is set on a context switch, which
potentially clobbers the bases due to the activation of another
context, and when an explicit modification of the user context is
performed by a syscall or exception handler. In particular, the patch
moves the setting of the flag to before syscalls change the context.

The changes to doreti_exit and PUSH_FRAME to clear PCB_FULL_IRET on
entry from userspace can be considered bug fixes in their own right.

Reviewed by:	jhb (previous version)
Tested by:	pho (previous version)
Sponsored by:	The FreeBSD Foundation
MFC after:	3 weeks
Differential revision:	https://reviews.freebsd.org/D12023
This commit is contained in:
kib 2017-08-21 17:38:02 +00:00
parent 3149ed68c4
commit f495f3ebd8
9 changed files with 191 additions and 64 deletions

View File

@ -87,7 +87,6 @@ END(cpu_throw)
ENTRY(cpu_switch)
/* Switch to new thread. First, save context. */
movq TD_PCB(%rdi),%r8
orl $PCB_FULL_IRET,PCB_FLAGS(%r8)
movq (%rsp),%rax /* Hardware registers */
movq %r15,PCB_R15(%r8)
@ -99,6 +98,30 @@ ENTRY(cpu_switch)
movq %rbx,PCB_RBX(%r8)
movq %rax,PCB_RIP(%r8)
testl $PCB_FULL_IRET,PCB_FLAGS(%r8)
jnz 2f
orl $PCB_FULL_IRET,PCB_FLAGS(%r8)
testl $TDP_KTHREAD,TD_PFLAGS(%rdi)
jnz 2f
testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
jz 2f
movl %fs,%eax
cmpl $KUF32SEL,%eax
jne 1f
rdfsbaseq %rax
movq %rax,PCB_FSBASE(%r8)
1: movl %gs,%eax
cmpl $KUG32SEL,%eax
jne 2f
movq %rdx,%r12
movl $MSR_KGSBASE,%ecx /* Read user gs base */
rdmsr
shlq $32,%rdx
orq %rdx,%rax
movq %rax,PCB_GSBASE(%r8)
movq %r12,%rdx
2:
testl $PCB_DBREGS,PCB_FLAGS(%r8)
jnz store_dr /* static predict not taken */
done_store_dr:

View File

@ -187,12 +187,13 @@ alltraps_testi:
jz alltraps_pushregs_no_rdi
sti
alltraps_pushregs_no_rdi:
movq %rsi,TF_RSI(%rsp)
movq %rdx,TF_RDX(%rsp)
movq %rax,TF_RAX(%rsp)
alltraps_pushregs_no_rax:
movq %rsi,TF_RSI(%rsp)
movq %rcx,TF_RCX(%rsp)
movq %r8,TF_R8(%rsp)
movq %r9,TF_R9(%rsp)
movq %rax,TF_RAX(%rsp)
movq %rbx,TF_RBX(%rsp)
movq %rbp,TF_RBP(%rsp)
movq %r10,TF_R10(%rsp)
@ -326,22 +327,44 @@ IDTVEC(prot)
prot_addrf:
movq $0,TF_ADDR(%rsp)
movq %rdi,TF_RDI(%rsp) /* free up a GP register */
leaq doreti_iret(%rip),%rdi
cmpq %rdi,TF_RIP(%rsp)
je 1f /* kernel but with user gsbase!! */
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
jz 2f /* already running with kernel GS.base */
1: swapgs
2: movq PCPU(CURPCB),%rdi
orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */
movq %rax,TF_RAX(%rsp)
movq %rdx,TF_RDX(%rsp)
movw %fs,TF_FS(%rsp)
movw %gs,TF_GS(%rsp)
leaq doreti_iret(%rip),%rdi
cmpq %rdi,TF_RIP(%rsp)
je 5f /* kernel but with user gsbase!! */
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
jz 6f /* already running with kernel GS.base */
testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
jz 2f
cmpw $KUF32SEL,TF_FS(%rsp)
jne 1f
rdfsbaseq %rax
1: cmpw $KUG32SEL,TF_GS(%rsp)
jne 2f
rdgsbaseq %rdx
2: swapgs
movq PCPU(CURPCB),%rdi
testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
jz 4f
cmpw $KUF32SEL,TF_FS(%rsp)
jne 3f
movq %rax,PCB_FSBASE(%rdi)
3: cmpw $KUG32SEL,TF_GS(%rsp)
jne 4f
movq %rdx,PCB_GSBASE(%rdi)
4: orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */
movw %es,TF_ES(%rsp)
movw %ds,TF_DS(%rsp)
testl $PSL_I,TF_RFLAGS(%rsp)
jz alltraps_pushregs_no_rdi
jz alltraps_pushregs_no_rax
sti
jmp alltraps_pushregs_no_rdi
jmp alltraps_pushregs_no_rax
5: swapgs
6: movq PCPU(CURPCB),%rdi
jmp 4b
/*
* Fast syscall entry point. We enter here with just our new %cs/%ss set,
@ -349,8 +372,8 @@ prot_addrf:
* pointer. We have to juggle a few things around to find our stack etc.
* swapgs gives us access to our PCPU space only.
*
* We do not support invoking this from a custom %cs or %ss (e.g. using
* entries from an LDT).
* We do not support invoking this with custom segment registers,
* esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
*/
IDTVEC(fast_syscall)
swapgs
@ -503,6 +526,23 @@ IDTVEC(nmi)
nmi_fromuserspace:
incl %ebx
swapgs
testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
jz 2f
movq PCPU(CURPCB),%rdi
testq %rdi,%rdi
jz 2f
cmpw $KUF32SEL,TF_FS(%rsp)
jne 1f
rdfsbaseq %rax
movq %rax,PCB_FSBASE(%rdi)
1: cmpw $KUG32SEL,TF_GS(%rsp)
jne 2f
movl $MSR_KGSBASE,%ecx
rdmsr
shlq $32,%rdx
orq %rdx,%rax
movq %rax,PCB_GSBASE(%rdi)
2:
/* Note: this label is also used by ddb and gdb: */
nmi_calltrap:
FAKE_MCOUNT(TF_RIP(%rsp))
@ -705,6 +745,7 @@ doreti_exit:
jz ld_regs
testl $PCB_FULL_IRET,PCB_FLAGS(%r8)
jz ld_regs
andl $~PCB_FULL_IRET,PCB_FLAGS(%r8)
testl $TF_HASSEGS,TF_FLAGS(%rsp)
je set_segs

View File

@ -372,6 +372,7 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
fpstate_drop(td);
update_pcb_bases(pcb);
sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
bzero(sf.sf_uc.uc_mcontext.mc_spare,
@ -442,7 +443,6 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
regs->tf_fs = _ufssel;
regs->tf_gs = _ugssel;
regs->tf_flags = TF_HASSEGS;
set_pcb_flags(pcb, PCB_FULL_IRET);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
@ -548,6 +548,7 @@ sys_sigreturn(td, uap)
return (ret);
}
bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
update_pcb_bases(pcb);
pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
@ -559,7 +560,6 @@ sys_sigreturn(td, uap)
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
set_pcb_flags(pcb, PCB_FULL_IRET);
return (EJUSTRETURN);
}
@ -587,11 +587,11 @@ exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
else
mtx_unlock(&dt_lock);
update_pcb_bases(pcb);
pcb->pcb_fsbase = 0;
pcb->pcb_gsbase = 0;
clear_pcb_flags(pcb, PCB_32BIT);
pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
set_pcb_flags(pcb, PCB_FULL_IRET);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_rip = imgp->entry_addr;
@ -2135,6 +2135,7 @@ get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
mcp->mc_flags = tp->tf_flags;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp, NULL, 0);
update_pcb_bases(pcb);
mcp->mc_fsbase = pcb->pcb_fsbase;
mcp->mc_gsbase = pcb->pcb_gsbase;
mcp->mc_xfpustate = 0;
@ -2205,11 +2206,11 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
tp->tf_fs = mcp->mc_fs;
tp->tf_gs = mcp->mc_gs;
}
set_pcb_flags(pcb, PCB_FULL_IRET);
if (mcp->mc_flags & _MC_HASBASES) {
pcb->pcb_fsbase = mcp->mc_fsbase;
pcb->pcb_gsbase = mcp->mc_gsbase;
}
set_pcb_flags(pcb, PCB_FULL_IRET);
return (0);
}
@ -2480,6 +2481,71 @@ user_dbreg_trap(void)
return 0;
}
/*
* The pcb_flags field is only modified by the current thread, or by other
* threads while the current thread is stopped. However, the current thread
* may change it from interrupt context in cpu_switch(), or in the trap
* handler. When we read-modify-write pcb_flags from C sources, the compiler
* may generate code that is not atomic with respect to the interrupt
* handler. If a trap or interrupt happens and any flag is modified from the
* handler, it can be clobbered with the cached value later. Therefore, we
* implement setting and clearing flags with single-instruction functions,
* which do not race with possible modification of the flags from the trap
* or interrupt context, because traps and interrupts are taken only on
* instruction boundaries.
*
* set_pcb_flags_raw() ORs 'flags' into pcb->pcb_flags with a single orl
* and does nothing else.
*/
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{
/*
* %0 is pcb->pcb_flags as a memory output and %1 is the mask as an
* immediate or register; pcb->pcb_flags is listed again as an "m"
* input because orl reads the old value before writing the new one.
*/
__asm __volatile("orl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
: "cc", "memory");
}
/*
* The support for RDFSBASE, WRFSBASE and the similar instructions for
* the %gs base requires that the kernel saves MSR_FSBASE and
* MSR_{K,}GSBASE into the pcb if user space modified the bases. We must
* save on the context switch or if the return to usermode happens
* through doreti.
*
* Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
* which has the consequence that the base MSRs must be saved each time
* the PCB_FULL_IRET flag is set. We disable interrupts to sync with
* context switches.
*
* set_pcb_flags() ORs 'flags' into pcb->pcb_flags. When that sets
* PCB_FULL_IRET on the current pcb while the flag was still clear, and
* the CPU has FSGSBASE, the fs and user gs bases are first copied into
* pcb_fsbase/pcb_gsbase.
*/
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
register_t r;
/*
* The bases need saving only when all of these hold: pcb is the
* current pcb, the caller is setting PCB_FULL_IRET, the flag is not
* set yet, and cpu_stdext_feature advertises FSGSBASE.
*/
if (curpcb == pcb &&
(flags & PCB_FULL_IRET) != 0 &&
(pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
(cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
r = intr_disable();
/*
* Re-test with interrupts disabled: the check above ran with
* interrupts enabled, so the flag may have been set in between.
*/
if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
/*
* Each base is stored only when the matching selector equals
* _ufssel/_ugssel (presumably rfs()/rgs() return the current
* %fs/%gs selectors -- confirm).
*/
if (rfs() == _ufssel)
pcb->pcb_fsbase = rdfsbase();
/*
* The gs base is taken from MSR_KGSBASE (presumably because the
* kernel gs base is the active one here -- confirm).
*/
if (rgs() == _ugssel)
pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
}
set_pcb_flags_raw(pcb, flags);
intr_restore(r);
} else {
/* Nothing to save; just set the requested flags. */
set_pcb_flags_raw(pcb, flags);
}
}
/*
* Clear the bits given in 'flags' from pcb->pcb_flags, using a single
* andl with the complemented mask.
*/
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{
/*
* ~flags is passed as the immediate-or-register operand %1;
* pcb->pcb_flags is both the "=m" output and an "m" input because
* andl reads the old value before writing the new one.
*/
__asm __volatile("andl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
: "cc", "memory");
}
#ifdef KDB
/*

View File

@ -117,15 +117,17 @@ cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data)
/*
* Set the fs or gs base of thread td to r and store _ufssel/_ugssel
* into the matching selector slot of the thread's trapframe.
* PT_SETFSBASE selects fs; any other req value takes the gs branch.
*
* NOTE(review): this listing looks like a rendered diff with removed and
* added lines interleaved: the td->td_pcb->... stores and the trailing
* set_pcb_flags(td->td_pcb, ...) appear to be the old form of the
* pcb->... stores and of the leading set_pcb_flags(pcb, ...) -- confirm
* against the real file before editing the code.
*/
static void
cpu_ptrace_setbase(struct thread *td, int req, register_t r)
{
struct pcb *pcb;
pcb = td->td_pcb;
/* PCB_FULL_IRET is raised before the base is overwritten below. */
set_pcb_flags(pcb, PCB_FULL_IRET);
if (req == PT_SETFSBASE) {
td->td_pcb->pcb_fsbase = r;
pcb->pcb_fsbase = r;
td->td_frame->tf_fs = _ufssel;
} else {
td->td_pcb->pcb_gsbase = r;
pcb->pcb_gsbase = r;
td->td_frame->tf_gs = _ugssel;
}
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
#ifdef COMPAT_FREEBSD32
@ -136,6 +138,7 @@ static int
cpu32_ptrace(struct thread *td, int req, void *addr, int data)
{
struct savefpu *fpstate;
struct pcb *pcb;
uint32_t r;
int error;
@ -167,8 +170,10 @@ cpu32_ptrace(struct thread *td, int req, void *addr, int data)
error = EINVAL;
break;
}
r = req == PT_GETFSBASE ? td->td_pcb->pcb_fsbase :
td->td_pcb->pcb_gsbase;
pcb = td->td_pcb;
if (td == curthread)
update_pcb_bases(pcb);
r = req == PT_GETFSBASE ? pcb->pcb_fsbase : pcb->pcb_gsbase;
error = copyout(&r, addr, sizeof(r));
break;
@ -197,6 +202,7 @@ int
cpu_ptrace(struct thread *td, int req, void *addr, int data)
{
register_t *r, rv;
struct pcb *pcb;
int error;
#ifdef COMPAT_FREEBSD32
@ -221,8 +227,10 @@ cpu_ptrace(struct thread *td, int req, void *addr, int data)
case PT_GETFSBASE:
case PT_GETGSBASE:
r = req == PT_GETFSBASE ? &td->td_pcb->pcb_fsbase :
&td->td_pcb->pcb_gsbase;
pcb = td->td_pcb;
if (td == curthread)
update_pcb_bases(pcb);
r = req == PT_GETFSBASE ? &pcb->pcb_fsbase : &pcb->pcb_gsbase;
error = copyout(r, addr, sizeof(*r));
break;

View File

@ -254,39 +254,45 @@ sysarch(struct thread *td, struct sysarch_args *uap)
error = amd64_set_ioperm(td, &iargs);
break;
case I386_GET_FSBASE:
update_pcb_bases(pcb);
i386base = pcb->pcb_fsbase;
error = copyout(&i386base, uap->parms, sizeof(i386base));
break;
case I386_SET_FSBASE:
error = copyin(uap->parms, &i386base, sizeof(i386base));
if (!error) {
set_pcb_flags(pcb, PCB_FULL_IRET);
pcb->pcb_fsbase = i386base;
td->td_frame->tf_fs = _ufssel;
update_gdt_fsbase(td, i386base);
}
break;
case I386_GET_GSBASE:
update_pcb_bases(pcb);
i386base = pcb->pcb_gsbase;
error = copyout(&i386base, uap->parms, sizeof(i386base));
break;
case I386_SET_GSBASE:
error = copyin(uap->parms, &i386base, sizeof(i386base));
if (!error) {
set_pcb_flags(pcb, PCB_FULL_IRET);
pcb->pcb_gsbase = i386base;
td->td_frame->tf_gs = _ugssel;
update_gdt_gsbase(td, i386base);
}
break;
case AMD64_GET_FSBASE:
error = copyout(&pcb->pcb_fsbase, uap->parms, sizeof(pcb->pcb_fsbase));
update_pcb_bases(pcb);
error = copyout(&pcb->pcb_fsbase, uap->parms,
sizeof(pcb->pcb_fsbase));
break;
case AMD64_SET_FSBASE:
error = copyin(uap->parms, &a64base, sizeof(a64base));
if (!error) {
if (a64base < VM_MAXUSER_ADDRESS) {
pcb->pcb_fsbase = a64base;
set_pcb_flags(pcb, PCB_FULL_IRET);
pcb->pcb_fsbase = a64base;
td->td_frame->tf_fs = _ufssel;
} else
error = EINVAL;
@ -294,15 +300,17 @@ sysarch(struct thread *td, struct sysarch_args *uap)
break;
case AMD64_GET_GSBASE:
error = copyout(&pcb->pcb_gsbase, uap->parms, sizeof(pcb->pcb_gsbase));
update_pcb_bases(pcb);
error = copyout(&pcb->pcb_gsbase, uap->parms,
sizeof(pcb->pcb_gsbase));
break;
case AMD64_SET_GSBASE:
error = copyin(uap->parms, &a64base, sizeof(a64base));
if (!error) {
if (a64base < VM_MAXUSER_ADDRESS) {
pcb->pcb_gsbase = a64base;
set_pcb_flags(pcb, PCB_FULL_IRET);
pcb->pcb_gsbase = a64base;
td->td_frame->tf_gs = _ugssel;
} else
error = EINVAL;

View File

@ -238,7 +238,7 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
pcb2->pcb_tssp = NULL;
/* New segment registers. */
set_pcb_flags(pcb2, PCB_FULL_IRET);
set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
/* Copy the LDT, if necessary. */
mdp1 = &td1->td_proc->p_md;
@ -439,7 +439,7 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
cpu_max_ext_state_size);
set_pcb_flags(pcb2, PCB_FULL_IRET);
set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
/*
* Create a new fresh stack for the new thread.

View File

@ -177,7 +177,12 @@
movw %es,TF_ES(%rsp) ; \
movw %ds,TF_DS(%rsp) ; \
movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \
cld
cld ; \
testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel ? */ \
jz 2f ; /* yes, leave PCB_FULL_IRET alone */ \
movq PCPU(CURPCB),%r8 ; \
andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) ; \
2:
#define POP_FRAME \
movq TF_RDI(%rsp),%rdi ; \

View File

@ -119,40 +119,15 @@ struct susppcb {
#ifdef _KERNEL
struct trapframe;
/*
* The pcb_flags is only modified by current thread, or by other threads
* when current thread is stopped. However, current thread may change it
* from the interrupt context in cpu_switch(), or in the trap handler.
* When we read-modify-write pcb_flags from C sources, compiler may generate
* code that is not atomic regarding the interrupt handler. If a trap or
* interrupt happens and any flag is modified from the handler, it can be
* clobbered with the cached value later. Therefore, we implement setting
* and clearing flags with single-instruction functions, which do not race
* with possible modification of the flags from the trap or interrupt context,
* because traps and interrupts are executed only on instruction boundary.
*/
/*
* Inline form: OR 'flags' into pcb->pcb_flags with a single orl.
* NOTE(review): non-inline prototypes for the same name follow in this
* listing; this definition looks like the removed side of a diff --
* confirm.
*/
static __inline void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
__asm __volatile("orl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
: "cc");
}
/*
* Inline form: clear the bits in 'flags' from pcb->pcb_flags with a
* single andl of the complemented mask.
* NOTE(review): a non-inline prototype for the same name follows in this
* listing; this definition looks like the removed side of a diff --
* confirm.
*/
static __inline void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{
__asm __volatile("andl %1,%0"
: "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
: "cc");
}
void clear_pcb_flags(struct pcb *pcb, const u_int flags);
void makectx(struct trapframe *, struct pcb *);
void set_pcb_flags(struct pcb *pcb, const u_int flags);
void set_pcb_flags_raw(struct pcb *pcb, const u_int flags);
int savectx(struct pcb *) __returns_twice;
void resumectx(struct pcb *);
/* Ensure that pcb_gsbase and pcb_fsbase are up to date */
#define update_pcb_bases(pcb) set_pcb_flags((pcb), PCB_FULL_IRET)
#endif
#endif /* _AMD64_PCB_H_ */

View File

@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
#define __FreeBSD_version 1200040 /* Master, propagated to newvers */
#define __FreeBSD_version 1200041 /* Master, propagated to newvers */
/*
* __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
@ -83,6 +83,7 @@
#define P_OSREL_MAP_FSTRICT 1100036
#define P_OSREL_SHUTDOWN_ENOTCONN 1100077
#define P_OSREL_MAP_GUARD 1200035
#define P_OSREL_WRFSBASE 1200041
#define P_OSREL_MAJOR(x) ((x) / 100000)
#endif