amd64: depessimize bcmp for small buffers
Adapt the assembly generated by clang for memcmp and use it for compares of at most 64 bytes (which are the vast majority).

Sample result of doing stats on Broadwell (% of samples):
before: 4.0 kernel bcmp cache_lookup
after : 0.7 kernel bcmp cache_lookup

The routine is most definitely still not optimal. Anyone interested in spending time improving it is welcome to take over.

Reviewed by: kib
This commit is contained in:
parent
55c9d75e6b
commit
20ca271fdd
@ -98,17 +98,40 @@ END(sse2_pagezero)
|
|||||||
|
|
||||||
ENTRY(bcmp)
|
ENTRY(bcmp)
|
||||||
PUSH_FRAME_POINTER
|
PUSH_FRAME_POINTER
|
||||||
|
test %rdx,%rdx
|
||||||
|
je 1f
|
||||||
|
cmpq $64,%rdx
|
||||||
|
jg 4f
|
||||||
|
|
||||||
|
xor %ecx,%ecx
|
||||||
|
2:
|
||||||
|
movzbl (%rdi,%rcx,1),%eax
|
||||||
|
movzbl (%rsi,%rcx,1),%r8d
|
||||||
|
cmp %r8b,%al
|
||||||
|
jne 3f
|
||||||
|
add $0x1,%rcx
|
||||||
|
cmp %rcx,%rdx
|
||||||
|
jne 2b
|
||||||
|
1:
|
||||||
|
xor %eax,%eax
|
||||||
|
POP_FRAME_POINTER
|
||||||
|
retq
|
||||||
|
3:
|
||||||
|
mov $1,%eax
|
||||||
|
POP_FRAME_POINTER
|
||||||
|
retq
|
||||||
|
4:
|
||||||
movq %rdx,%rcx
|
movq %rdx,%rcx
|
||||||
shrq $3,%rcx
|
shrq $3,%rcx
|
||||||
repe
|
repe
|
||||||
cmpsq
|
cmpsq
|
||||||
jne 1f
|
jne 5f
|
||||||
|
|
||||||
movq %rdx,%rcx
|
movq %rdx,%rcx
|
||||||
andq $7,%rcx
|
andq $7,%rcx
|
||||||
repe
|
repe
|
||||||
cmpsb
|
cmpsb
|
||||||
1:
|
5:
|
||||||
setne %al
|
setne %al
|
||||||
movsbl %al,%eax
|
movsbl %al,%eax
|
||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
|
Loading…
x
Reference in New Issue
Block a user