amd64: depessimize copyinstr_smap

The stac/clac pair around each byte copy is causing a measurable
slowdown in benchmarks. Instead, execute stac once before the copy
loop and clac once after all data is copied. While here, reorder the
code to avoid a forward branch in the common case.
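
In outline, the loop changes shape as sketched below. This is only a
simplified illustration of the idea (register setup, bounds checks and
the fault/toolong paths are omitted, local labels are illustrative),
not the committed code:

	/* before: PSL.AC is toggled around every byte */
1:	decq	%rdx		/* maxlen exhausted? */
	jz	2f
	stac			/* open the user access window */
	lodsb			/* load byte from (%rsi), advance %rsi */
	stosb			/* store it to (%rdi), advance %rdi */
	clac			/* close the window again */
	orb	%al,%al		/* terminating 0 byte? */
	jnz	1b
2:				/* PSL.AC is already clear on both exits */

	/* after: the window is opened once and closed once */
	stac
1:	decq	%rdx
	jz	2f
	lodsb
	stosb
	orb	%al,%al
	jnz	1b
2:	clac			/* clear PSL.AC on both exits */

stac and clac set and clear EFLAGS.AC, and doing that per byte is what
showed up in the benchmarks; hoisting them keeps a single SMAP window
open across the whole copy.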

Note the copying loop (originating from copyinstr) is avoidably slow
and will be fixed later.

Reviewed by:	kib
Approved by:	re (gjb)
Differential Revision:	https://reviews.freebsd.org/D17063
Author:	Mateusz Guzik
Date:	2018-09-06 19:42:40 +00:00
Parent:	23984ce5cd
Commit:	12360b3079


@@ -914,6 +914,8 @@ ENTRY(copyinstr_smap)
 	subq	%rsi,%rax
 	jbe	cpystrflt
 
+	stac
+
 	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
 	cmpq	%rdx,%rax
 	jae	1f
@@ -924,31 +926,19 @@ ENTRY(copyinstr_smap)
 
 2:
 	decq	%rdx
-	jz	copyinstr_toolong
+	jz	copyinstr_toolong_smap
 
-	stac
 	lodsb
 	stosb
-	clac
 	orb	%al,%al
 	jnz	2b
 
+	clac
+
 copyinstr_succ:
 	/* Success -- 0 byte reached */
 	decq	%rdx
 	xorl	%eax,%eax
-	jmp	cpystrflt_x
-
-copyinstr_toolong:
-	/* rdx is zero - return ENAMETOOLONG or EFAULT */
-	movq	$VM_MAXUSER_ADDRESS,%rax
-	cmpq	%rax,%rsi
-	jae	cpystrflt
-	movq	$ENAMETOOLONG,%rax
-	jmp	cpystrflt_x
-
-/* Fault entry clears PSL.AC */
-cpystrflt:
-	movq	$EFAULT,%rax
+
 cpystrflt_x:
 	/* set *lencopied and return %eax */
@@ -962,6 +952,21 @@ cpystrflt_x:
 1:
 	POP_FRAME_POINTER
 	ret
+
+/* Fault entry clears PSL.AC */
+cpystrflt:
+	movq	$EFAULT,%rax
+	jmp	cpystrflt_x
+
+copyinstr_toolong_smap:
+	clac
+copyinstr_toolong:
+	/* rdx is zero - return ENAMETOOLONG or EFAULT */
+	movq	$VM_MAXUSER_ADDRESS,%rax
+	cmpq	%rax,%rsi
+	jae	cpystrflt
+	movq	$ENAMETOOLONG,%rax
+	jmp	cpystrflt_x
 END(copyinstr_smap)
 
 /*