Enable eager FPU context switch by default on amd64.
With compilers making increasing use of vector instructions the performance benefit of lazily switching FPU state is no longer a desirable tradeoff. Linux switched to eager FPU context switch some time ago, and the idea was floated on the FreeBSD-current mailing list some years ago[1]. Enable eager FPU context switch by default on amd64, with a tunable/sysctl available to turn it back off. [1] https://lists.freebsd.org/pipermail/freebsd-current/2015-March/055198.html Reviewed by: jhb Tested by: pho Sponsored by: The FreeBSD Foundation
This commit is contained in:
parent
46c0e42b6e
commit
d1a07e31e5
@ -128,10 +128,10 @@ done_store_dr:
|
|||||||
|
|
||||||
/* have we used fp, and need a save? */
|
/* have we used fp, and need a save? */
|
||||||
cmpq %rdi,PCPU(FPCURTHREAD)
|
cmpq %rdi,PCPU(FPCURTHREAD)
|
||||||
jne 3f
|
jne 2f
|
||||||
movq PCB_SAVEFPU(%r8),%r8
|
movq PCB_SAVEFPU(%r8),%r8
|
||||||
clts
|
clts
|
||||||
cmpl $0,use_xsave
|
cmpl $0,use_xsave(%rip)
|
||||||
jne 1f
|
jne 1f
|
||||||
fxsave (%r8)
|
fxsave (%r8)
|
||||||
jmp 2f
|
jmp 2f
|
||||||
@ -143,12 +143,7 @@ ctx_switch_xsave:
|
|||||||
/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
|
/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
|
||||||
xsave (%r8)
|
xsave (%r8)
|
||||||
movq %rcx,%rdx
|
movq %rcx,%rdx
|
||||||
2: smsw %ax
|
2:
|
||||||
orb $CR0_TS,%al
|
|
||||||
lmsw %ax
|
|
||||||
xorl %eax,%eax
|
|
||||||
movq %rax,PCPU(FPCURTHREAD)
|
|
||||||
3:
|
|
||||||
/* Save is done. Now fire up new thread. Leave old vmspace. */
|
/* Save is done. Now fire up new thread. Leave old vmspace. */
|
||||||
movq %rsi,%r12
|
movq %rsi,%r12
|
||||||
movq %rdi,%r13
|
movq %rdi,%r13
|
||||||
@ -238,6 +233,8 @@ done_load_dr:
|
|||||||
movq PCB_RBX(%r8),%rbx
|
movq PCB_RBX(%r8),%rbx
|
||||||
movq PCB_RIP(%r8),%rax
|
movq PCB_RIP(%r8),%rax
|
||||||
movq %rax,(%rsp)
|
movq %rax,(%rsp)
|
||||||
|
movq PCPU(CURTHREAD),%rdi
|
||||||
|
call fpu_activate_sw
|
||||||
ret
|
ret
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -142,6 +142,11 @@ static void fpu_clean_state(void);
|
|||||||
SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
|
SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
|
||||||
SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
|
SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
|
||||||
|
|
||||||
|
int lazy_fpu_switch = 0;
|
||||||
|
SYSCTL_INT(_hw, OID_AUTO, lazy_fpu_switch, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
|
||||||
|
&lazy_fpu_switch, 0,
|
||||||
|
"Lazily load FPU context after context switch");
|
||||||
|
|
||||||
int use_xsave; /* non-static for cpu_switch.S */
|
int use_xsave; /* non-static for cpu_switch.S */
|
||||||
uint64_t xsave_mask; /* the same */
|
uint64_t xsave_mask; /* the same */
|
||||||
static uma_zone_t fpu_save_area_zone;
|
static uma_zone_t fpu_save_area_zone;
|
||||||
@ -242,6 +247,7 @@ fpuinit_bsp1(void)
|
|||||||
uint64_t xsave_mask_user;
|
uint64_t xsave_mask_user;
|
||||||
bool old_wp;
|
bool old_wp;
|
||||||
|
|
||||||
|
TUNABLE_INT_FETCH("hw.lazy_fpu_switch", &lazy_fpu_switch);
|
||||||
if (!use_xsave)
|
if (!use_xsave)
|
||||||
return;
|
return;
|
||||||
cpuid_count(0xd, 0x0, cp);
|
cpuid_count(0xd, 0x0, cp);
|
||||||
@ -651,6 +657,45 @@ fputrap_sse(void)
|
|||||||
return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
|
return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
restore_fpu_curthread(struct thread *td)
|
||||||
|
{
|
||||||
|
struct pcb *pcb;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Record new context early in case frstor causes a trap.
|
||||||
|
*/
|
||||||
|
PCPU_SET(fpcurthread, td);
|
||||||
|
|
||||||
|
stop_emulating();
|
||||||
|
fpu_clean_state();
|
||||||
|
pcb = td->td_pcb;
|
||||||
|
|
||||||
|
if ((pcb->pcb_flags & PCB_FPUINITDONE) == 0) {
|
||||||
|
/*
|
||||||
|
* This is the first time this thread has used the FPU or
|
||||||
|
* the PCB doesn't contain a clean FPU state. Explicitly
|
||||||
|
* load an initial state.
|
||||||
|
*
|
||||||
|
* We prefer to restore the state from the actual save
|
||||||
|
* area in PCB instead of directly loading from
|
||||||
|
* fpu_initialstate, to ignite the XSAVEOPT
|
||||||
|
* tracking engine.
|
||||||
|
*/
|
||||||
|
bcopy(fpu_initialstate, pcb->pcb_save,
|
||||||
|
cpu_max_ext_state_size);
|
||||||
|
fpurestore(pcb->pcb_save);
|
||||||
|
if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
|
||||||
|
fldcw(pcb->pcb_initial_fpucw);
|
||||||
|
if (PCB_USER_FPU(pcb))
|
||||||
|
set_pcb_flags(pcb, PCB_FPUINITDONE |
|
||||||
|
PCB_USERFPUINITDONE);
|
||||||
|
else
|
||||||
|
set_pcb_flags(pcb, PCB_FPUINITDONE);
|
||||||
|
} else
|
||||||
|
fpurestore(pcb->pcb_save);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Device Not Available (DNA, #NM) exception handler.
|
* Device Not Available (DNA, #NM) exception handler.
|
||||||
*
|
*
|
||||||
@ -661,7 +706,9 @@ fputrap_sse(void)
|
|||||||
void
|
void
|
||||||
fpudna(void)
|
fpudna(void)
|
||||||
{
|
{
|
||||||
|
struct thread *td;
|
||||||
|
|
||||||
|
td = curthread;
|
||||||
/*
|
/*
|
||||||
* This handler is entered with interrupts enabled, so context
|
* This handler is entered with interrupts enabled, so context
|
||||||
* switches may occur before critical_enter() is executed. If
|
* switches may occur before critical_enter() is executed. If
|
||||||
@ -675,7 +722,7 @@ fpudna(void)
|
|||||||
|
|
||||||
KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0,
|
KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0,
|
||||||
("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)"));
|
("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)"));
|
||||||
if (PCPU_GET(fpcurthread) == curthread) {
|
if (PCPU_GET(fpcurthread) == td) {
|
||||||
printf("fpudna: fpcurthread == curthread\n");
|
printf("fpudna: fpcurthread == curthread\n");
|
||||||
stop_emulating();
|
stop_emulating();
|
||||||
critical_exit();
|
critical_exit();
|
||||||
@ -684,42 +731,26 @@ fpudna(void)
|
|||||||
if (PCPU_GET(fpcurthread) != NULL) {
|
if (PCPU_GET(fpcurthread) != NULL) {
|
||||||
panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
|
panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
|
||||||
PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid,
|
PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid,
|
||||||
curthread, curthread->td_tid);
|
td, td->td_tid);
|
||||||
}
|
}
|
||||||
stop_emulating();
|
restore_fpu_curthread(td);
|
||||||
/*
|
|
||||||
* Record new context early in case frstor causes a trap.
|
|
||||||
*/
|
|
||||||
PCPU_SET(fpcurthread, curthread);
|
|
||||||
|
|
||||||
fpu_clean_state();
|
|
||||||
|
|
||||||
if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) {
|
|
||||||
/*
|
|
||||||
* This is the first time this thread has used the FPU or
|
|
||||||
* the PCB doesn't contain a clean FPU state. Explicitly
|
|
||||||
* load an initial state.
|
|
||||||
*
|
|
||||||
* We prefer to restore the state from the actual save
|
|
||||||
* area in PCB instead of directly loading from
|
|
||||||
* fpu_initialstate, to ignite the XSAVEOPT
|
|
||||||
* tracking engine.
|
|
||||||
*/
|
|
||||||
bcopy(fpu_initialstate, curpcb->pcb_save,
|
|
||||||
cpu_max_ext_state_size);
|
|
||||||
fpurestore(curpcb->pcb_save);
|
|
||||||
if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
|
|
||||||
fldcw(curpcb->pcb_initial_fpucw);
|
|
||||||
if (PCB_USER_FPU(curpcb))
|
|
||||||
set_pcb_flags(curpcb,
|
|
||||||
PCB_FPUINITDONE | PCB_USERFPUINITDONE);
|
|
||||||
else
|
|
||||||
set_pcb_flags(curpcb, PCB_FPUINITDONE);
|
|
||||||
} else
|
|
||||||
fpurestore(curpcb->pcb_save);
|
|
||||||
critical_exit();
|
critical_exit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void fpu_activate_sw(struct thread *td); /* Called from the context switch */
|
||||||
|
void
|
||||||
|
fpu_activate_sw(struct thread *td)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (lazy_fpu_switch || (td->td_pflags & TDP_KTHREAD) != 0 ||
|
||||||
|
!PCB_USER_FPU(td->td_pcb)) {
|
||||||
|
PCPU_SET(fpcurthread, NULL);
|
||||||
|
start_emulating();
|
||||||
|
} else if (PCPU_GET(fpcurthread) != td) {
|
||||||
|
restore_fpu_curthread(td);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fpudrop(void)
|
fpudrop(void)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user