First pass at (possibly futile) microoptimizing of cpu_switch.  Results
are mixed.  Some pure context switch microbenchmarks show up to 29%
improvement.  Pipe-based context switch microbenchmarks show up to 7%
improvement.  Real world tests are far less impressive, as they are
dominated more by actual work than by switch overheads, but depending on
the machine in question, workload, kernel options, phase of the moon, etc.,
a few percent gain might be seen.

Summary of changes:
- don't reload the MSR_[FG]SBASE registers when context switching between
  non-threaded userland apps.  These wrmsr operations typically cost about
  120 clock cycles each on an AMD cpu (less on Barcelona/Phenom), and Intel
  cores are probably no faster.
- The above change only helps unthreaded userland apps that tend to keep
  the same gsbase value; threaded apps get no benefit from it.  (See the
  first C sketch after this list.)
- reorder operations such as pcb accesses to be in memory order, to give
  prefetching a better chance of working.  Accesses are now in increasing
  memory address order, rather than reverse or random order.  (Second
  sketch below.)
- Push some lesser-used code out of the main code paths, hopefully
  allowing better code density in the cache lines.  This is probably
  futile.
- (part 2 of the previous item) Reorder code so that branches get a more
  realistic static branch prediction hint.  Both Intel and AMD cpus
  default to predicting a branch to a lower memory address as taken, and
  a branch to a higher memory address as not taken.  The (limited)
  dynamic branch prediction subsystem overrides this, but a trip through
  userland can evict its state.  (Third sketch below.)
- Futile attempt at spreading out the use of the results of previous
  operations in later operations, hopefully allowing the cpus to execute
  more in parallel.
- stop wasting 16 bytes at the top of the kernel stack, below the PCB.
- Never load the userland fs/gsbase registers for kthreads, but preserve
  curpcb->pcb_[fg]sbase as caches for the cpu. (Thanks Jeff!)
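
To make the first item concrete, here is a rough C rendering of the
fsbase/gsbase caching idea.  This is an illustrative sketch only (the
function and its arguments are invented for exposition; the real logic is
the assembly in the diff below), using the wrmsr() / MSR_FSBASE /
MSR_KGSBASE definitions from machine/cpufunc.h and machine/specialreg.h:

	/*
	 * pcb_fsbase/pcb_gsbase double as a cache of what is currently
	 * in the MSRs, so the ~120 cycle wrmsr is only paid on a mismatch.
	 */
	static void
	switch_user_bases(struct pcb *oldpcb, struct pcb *newpcb, int is_kthread)
	{
		uint64_t fs = oldpcb->pcb_fsbase;	/* cached MSR contents */
		uint64_t gs = oldpcb->pcb_gsbase;

		if (is_kthread) {
			/*
			 * Kthreads never touch the user bases; just carry
			 * the cache forward for the next switch.
			 */
			newpcb->pcb_fsbase = fs;
			newpcb->pcb_gsbase = gs;
			return;
		}
		if (newpcb->pcb_fsbase != fs)
			wrmsr(MSR_FSBASE, newpcb->pcb_fsbase);
		if (newpcb->pcb_gsbase != gs)
			wrmsr(MSR_KGSBASE, newpcb->pcb_gsbase);
	}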
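
The memory-ordering item amounts to issuing the pcb stores lowest address
first so the prefetcher sees a single ascending stream.  A minimal sketch
of the same idea in C (the struct mirrors the register area of struct pcb;
names are illustrative):

	struct pcb_regs {		/* fields in increasing address order */
		uint64_t pcb_r15, pcb_r14, pcb_r13, pcb_r12;
		uint64_t pcb_rbp, pcb_rsp, pcb_rbx, pcb_rip;
	};

	static void
	save_regs(struct pcb_regs *p, const uint64_t r[8])
	{
		p->pcb_r15 = r[0];	/* lowest address first... */
		p->pcb_r14 = r[1];
		p->pcb_r13 = r[2];
		p->pcb_r12 = r[3];
		p->pcb_rbp = r[4];
		p->pcb_rsp = r[5];
		p->pcb_rbx = r[6];
		p->pcb_rip = r[7];	/* ...highest address last */
	}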
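
The branch layout items are the familiar "cold path out of line" pattern.
In C it would look roughly like this (unlikely() and store_dr() are
illustrative stand-ins; PCB_DBREGS is the real pcb flag):

	#define	unlikely(x)	__builtin_expect(!!(x), 0)

	void	store_dr(struct pcb *);	/* cold path, placed out of line */

	static void
	maybe_save_dbregs(struct pcb *pcb)
	{
		/*
		 * The rare case is a forward branch to out-of-line code,
		 * which the cpus statically predict as not taken, so the
		 * common case falls straight through with no taken branch.
		 */
		if (unlikely(pcb->pcb_flags & PCB_DBREGS))
			store_dr(pcb);
	}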

Microbenchmarking this code seems to be really sensitive to things like
scheduling luck, timing, cache behavior, tlb behavior, kernel options,
other random code changes, etc.

While it doesn't help heavy userland workloads much, it does help high
context switch loads a little, and should help workloads that switch
via kthreads a bit more.

A special thanks to Kris for the testing and reality checks, and Jeff for
tormenting me into doing this. :)

This is still work-in-progress.
Peter Wemm 2008-03-23 23:09:06 +00:00
parent 58680920e9
commit f001eabf3a


@@ -69,16 +69,20 @@
  * %rsi = newtd
  */
 ENTRY(cpu_throw)
-	testq	%rdi,%rdi
-	jnz	1f
-	movq	PCPU(IDLETHREAD),%rdi
-1:
 	movq	TD_PCB(%rdi),%r8		/* Old pcb */
 	movl	PCPU(CPUID), %eax
+	testq	%rdi,%rdi			/* no thread? */
+	jz	1f
+	movq	PCB_FSBASE(%r8),%r9
+	movq	PCB_GSBASE(%r8),%r10
 	/* release bit from old pm_active */
 	movq	TD_PROC(%rdi), %rdx		/* oldtd->td_proc */
 	movq	P_VMSPACE(%rdx), %rdx		/* proc->p_vmspace */
 	LK btrl	%eax, VM_PMAP+PM_ACTIVE(%rdx)	/* clear old */
-	movq	TD_PCB(%rsi),%rdx		/* newtd->td_proc */
-	movq	PCB_CR3(%rdx),%rdx
+1:
+	movq	TD_PCB(%rsi),%r8		/* newtd->td_proc */
+	movq	PCB_CR3(%r8),%rdx
 	movq	%rdx,%cr3			/* new address space */
 	jmp	swact
 END(cpu_throw)
@@ -97,43 +101,24 @@ ENTRY(cpu_switch)
 	movq	TD_PCB(%rdi),%r8
 	movq	(%rsp),%rax			/* Hardware registers */
-	movq	%rax,PCB_RIP(%r8)
-	movq	%rbx,PCB_RBX(%r8)
-	movq	%rsp,PCB_RSP(%r8)
-	movq	%rbp,PCB_RBP(%r8)
-	movq	%r12,PCB_R12(%r8)
-	movq	%r13,PCB_R13(%r8)
-	movq	%r14,PCB_R14(%r8)
 	movq	%r15,PCB_R15(%r8)
+	movq	%r14,PCB_R14(%r8)
+	movq	%r13,PCB_R13(%r8)
+	movq	%r12,PCB_R12(%r8)
+	movq	%rbp,PCB_RBP(%r8)
+	movq	%rsp,PCB_RSP(%r8)
+	movq	%rbx,PCB_RBX(%r8)
+	movq	%rax,PCB_RIP(%r8)
+	movq	PCB_FSBASE(%r8),%r9
+	movq	PCB_GSBASE(%r8),%r10
 	testl	$PCB_32BIT,PCB_FLAGS(%r8)
-	jz	1f				/* no, skip over */
-	/* Save userland %gs */
-	movl	%gs,PCB_GS(%r8)
-	movq	PCB_GS32P(%r8),%rax
-	movq	(%rax),%rax
-	movq	%rax,PCB_GS32SD(%r8)
-1:
+	jnz	store_gs			/* static predict not taken */
+done_store_gs:
 	/* Test if debug registers should be saved. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
-	jz	1f				/* no, skip over */
-	movq	%dr7,%rax			/* yes, do the save */
-	movq	%rax,PCB_DR7(%r8)
-	andq	$0x0000fc00, %rax		/* disable all watchpoints */
-	movq	%rax,%dr7
-	movq	%dr6,%rax
-	movq	%rax,PCB_DR6(%r8)
-	movq	%dr3,%rax
-	movq	%rax,PCB_DR3(%r8)
-	movq	%dr2,%rax
-	movq	%rax,PCB_DR2(%r8)
-	movq	%dr1,%rax
-	movq	%rax,PCB_DR1(%r8)
-	movq	%dr0,%rax
-	movq	%rax,PCB_DR0(%r8)
-1:
+	jnz	store_dr			/* static predict not taken */
+done_store_dr:
 	/* have we used fp, and need a save? */
 	cmpq	%rdi,PCPU(FPCURTHREAD)
@@ -181,82 +166,138 @@ sw1:
 	cmpq	%rcx, %rdx
 	pause
 	je	1b
 	lfence
 #endif
 	/*
 	 * At this point, we've switched address spaces and are ready
 	 * to load up the rest of the next context.
 	 */
 	movq	TD_PCB(%rsi),%r8
+	/* Skip loading user fsbase/gsbase for kthreads */
+	testl	$TDP_KTHREAD,TD_PFLAGS(%rsi)
+	jnz	do_kthread
+	cmpq	PCB_FSBASE(%r8),%r9
+	jz	1f
 	/* Restore userland %fs */
 	movl	$MSR_FSBASE,%ecx
 	movl	PCB_FSBASE(%r8),%eax
 	movl	PCB_FSBASE+4(%r8),%edx
 	wrmsr
+1:
+	cmpq	PCB_GSBASE(%r8),%r10
+	jz	2f
 	/* Restore userland %gs */
 	movl	$MSR_KGSBASE,%ecx
 	movl	PCB_GSBASE(%r8),%eax
 	movl	PCB_GSBASE+4(%r8),%edx
 	wrmsr
+2:
+do_tss:
 	/* Update the TSS_RSP0 pointer for the next interrupt */
 	movq	PCPU(TSSP), %rax
-	addq	$COMMON_TSS_RSP0, %rax
-	leaq	-16(%r8), %rbx
-	movq	%rbx, (%rax)
-	movq	%rbx, PCPU(RSP0)
+	movq	%r8, PCPU(RSP0)
 	movq	%r8, PCPU(CURPCB)
+	addq	$COMMON_TSS_RSP0, %rax
 	movq	%rsi, PCPU(CURTHREAD)		/* into next thread */
+	movq	%r8, (%rax)
+	/* Test if debug registers should be restored. */
+	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
+	jnz	load_dr				/* static predict not taken */
+done_load_dr:
 	testl	$PCB_32BIT,PCB_FLAGS(%r8)
-	jz	1f				/* no, skip over */
+	jnz	load_gs				/* static predict not taken */
+done_load_gs:
+	/* Restore context. */
+	movq	PCB_R15(%r8),%r15
+	movq	PCB_R14(%r8),%r14
+	movq	PCB_R13(%r8),%r13
+	movq	PCB_R12(%r8),%r12
+	movq	PCB_RBP(%r8),%rbp
+	movq	PCB_RSP(%r8),%rsp
+	movq	PCB_RBX(%r8),%rbx
+	movq	PCB_RIP(%r8),%rax
+	movq	%rax,(%rsp)
+	ret
+
+/*
+ * We order these strangely for several reasons.
+ * 1: I wanted to use static branch prediction hints
+ * 2: Most athlon64/opteron cpus don't have them.  They define
+ *    a forward branch as 'predict not taken'.  Intel cores have
+ *    the 'rep' prefix to invert this.
+ * So, to make it work on both forms of cpu we do the detour.
+ * We use jumps rather than call in order to avoid the stack.
+ */
+do_kthread:
+	/*
+	 * Copy old fs/gsbase to new kthread pcb for future switches
+	 * This maintains curpcb->pcb_[fg]sbase as caches of the MSR
+	 */
+	movq	%r9,PCB_FSBASE(%r8)
+	movq	%r10,PCB_GSBASE(%r8)
+	jmp	do_tss
+
+store_gs:
+	movl	%gs,PCB_GS(%r8)
+	movq	PCB_GS32P(%r8),%rax
+	movq	(%rax),%rax
+	movq	%rax,PCB_GS32SD(%r8)
+	jmp	done_store_gs
+
+load_gs:
 	/* Restore userland %gs while preserving kernel gsbase */
 	movq	PCB_GS32P(%r8),%rax
-	movq	PCB_GS32SD(%r8),%rbx
-	movq	%rbx,(%rax)
+	movq	PCB_GS32SD(%r8),%rcx
+	movq	%rcx,(%rax)
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
 	movl	PCB_GS(%r8),%gs
 	wrmsr
-1:
-	/* Restore context. */
-	movq	PCB_RBX(%r8),%rbx
-	movq	PCB_RSP(%r8),%rsp
-	movq	PCB_RBP(%r8),%rbp
-	movq	PCB_R12(%r8),%r12
-	movq	PCB_R13(%r8),%r13
-	movq	PCB_R14(%r8),%r14
-	movq	PCB_R15(%r8),%r15
-	movq	PCB_RIP(%r8),%rax
-	movq	%rax,(%rsp)
-	/* Test if debug registers should be restored. */
-	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
-	jz	1f
-	movq	PCB_DR6(%r8),%rax
-	movq	%rax,%dr6
-	movq	PCB_DR3(%r8),%rax
-	movq	%rax,%dr3
-	movq	PCB_DR2(%r8),%rax
-	movq	%rax,%dr2
-	movq	PCB_DR1(%r8),%rax
-	movq	%rax,%dr1
-	movq	PCB_DR0(%r8),%rax
-	movq	%rax,%dr0
-	/* But preserve reserved bits in %dr7 */
-	movq	%dr7,%rax
-	andq	$0x0000fc00,%rax
-	movq	PCB_DR7(%r8),%rcx
-	andq	$~0x0000fc00,%rcx
-	orq	%rcx,%rax
+	jmp	done_load_gs
+
+store_dr:
+	movq	%dr7,%rax			/* yes, do the save */
+	movq	%dr0,%r15
+	movq	%dr1,%r14
+	movq	%dr2,%r13
+	movq	%dr3,%r12
+	movq	%dr6,%r11
+	andq	$0x0000fc00, %rax		/* disable all watchpoints */
+	movq	%r15,PCB_DR0(%r8)
+	movq	%r14,PCB_DR1(%r8)
+	movq	%r13,PCB_DR2(%r8)
+	movq	%r12,PCB_DR3(%r8)
+	movq	%r11,PCB_DR6(%r8)
+	movq	%rax,PCB_DR7(%r8)
 	movq	%rax,%dr7
-1:
-	ret
+	jmp	done_store_dr
+
+load_dr:
+	movq	%dr7,%rax
+	movq	PCB_DR0(%r8),%r15
+	movq	PCB_DR1(%r8),%r14
+	movq	PCB_DR2(%r8),%r13
+	movq	PCB_DR3(%r8),%r12
+	movq	PCB_DR6(%r8),%r11
+	movq	PCB_DR7(%r8),%rcx
+	movq	%r15,%dr0
+	movq	%r14,%dr1
+	/* Preserve reserved bits in %dr7 */
+	andq	$0x0000fc00,%rax
+	andq	$~0x0000fc00,%rcx
+	movq	%r13,%dr2
+	movq	%r12,%dr3
+	orq	%rcx,%rax
+	movq	%r11,%dr6
+	movq	%rax,%dr7
+	jmp	done_load_dr
 END(cpu_switch)
/*