amd64: move memcmp checks upfront

This is a tradeoff: it saves jumps for the smaller sizes while making
the 8-16 byte range slower (bringing it roughly in line with the other cases).
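
To make the shape of the change concrete, here is a rough C rendering of
the new dispatch for lengths up to 16. It is only a sketch: the committed
code is assembly, ld() and small_memcmp() are made-up names, and the real
routine also computes the sign of the first differing byte in a shared tail
instead of just returning nonzero. The point is the branch order: previously
a small size had to hop through the checks of every larger size class before
reaching its own code, while now every size check happens before any data is
loaded and each class only does its loads and compares.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Load n <= 8 bytes, zero-extended; stands in for the overlapping
 * movq/movl/movzwl loads the assembly uses. */
static uint64_t
ld(const unsigned char *p, size_t n)
{
	uint64_t v = 0;

	memcpy(&v, p, n);
	return (v);
}

/* New branch order: all size-class checks run before any load. */
static int
small_memcmp(const unsigned char *a, const unsigned char *b, size_t len)
{
	if (len > 8)		/* 9..16: two overlapping 8-byte loads */
		return (ld(a, 8) != ld(b, 8) ||
		    ld(a + len - 8, 8) != ld(b + len - 8, 8));
	if (len > 4)		/* 5..8: two overlapping 4-byte loads */
		return (ld(a, 4) != ld(b, 4) ||
		    ld(a + len - 4, 4) != ld(b + len - 4, 4));
	if (len >= 2)		/* 2..4: two overlapping 2-byte loads */
		return (ld(a, 2) != ld(b, 2) ||
		    ld(a + len - 2, 2) != ld(b + len - 2, 2));
	if (len < 1)		/* 0 */
		return (0);
	return (a[0] != b[0]);	/* 1 */
}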

Tested with glibc test suite.

For example, size 3 (the most common size with the vfs namecache), in ops/s:
before:	407086026
after:	461391995

The regressed 8-16 range (with 8 as an example), in ops/s:
before:	540850489
after:	461671032
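
The commit does not say how these numbers were collected. A minimal
user-space loop along the following lines is one plausible way to get
per-size ops/s figures; the buffer contents, iteration count and the
volatile function pointer (used so the compiler cannot substitute its
builtin memcmp) are assumptions of this sketch, not taken from the commit.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int
main(void)
{
	/* Call through a volatile pointer so the real memcmp() is measured. */
	int (*volatile cmp)(const void *, const void *, size_t) = memcmp;
	static const char a[16] = "abcdefghijklmno";
	static const char b[16] = "abcdefghijklmno";
	uint64_t iters = 100000000, i;
	volatile int sink = 0;
	struct timespec t0, t1;
	double secs;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++)
		sink += cmp(a, b, 3);	/* size 3, as in the numbers above */
	clock_gettime(CLOCK_MONOTONIC, &t1);
	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%.0f ops/s\n", iters / secs);
	return (sink != 0);
}
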
Mateusz Guzik 2021-01-31 16:46:18 +01:00
parent 0db6aef407
commit f1be262ec1
2 changed files with 57 additions and 45 deletions


@@ -45,9 +45,25 @@ ENTRY(memcmp)
cmpq $16,%rdx
ja 101632f
100816:
cmpb $8,%dl
jl 100408f
jg 100816f
cmpb $4,%dl
jg 100408f
cmpb $2,%dl
jge 100204f
cmpb $1,%dl
jl 100000f
movzbl (%rdi),%eax
movzbl (%rsi),%r8d
subl %r8d,%eax
100000:
ret
ALIGN_TEXT
100816:
movq (%rdi),%r8
movq (%rsi),%r9
cmpq %r8,%r9
@@ -57,9 +73,8 @@ ENTRY(memcmp)
cmpq %r8,%r9
jne 10081608f
ret
ALIGN_TEXT
100408:
cmpb $4,%dl
jl 100204f
movl (%rdi),%r8d
movl (%rsi),%r9d
cmpl %r8d,%r9d
@@ -69,9 +84,8 @@ ENTRY(memcmp)
cmpl %r8d,%r9d
jne 10040804f
ret
ALIGN_TEXT
100204:
cmpb $2,%dl
jl 100001f
movzwl (%rdi),%r8d
movzwl (%rsi),%r9d
cmpl %r8d,%r9d
@@ -81,15 +95,7 @@ ENTRY(memcmp)
cmpl %r8d,%r9d
jne 1f
ret
100001:
cmpb $1,%dl
jl 100000f
movzbl (%rdi),%eax
movzbl (%rsi),%r8d
subl %r8d,%eax
100000:
ret
ALIGN_TEXT
ALIGN_TEXT
101632:
cmpq $32,%rdx
ja 103200f
@@ -110,7 +116,7 @@ ALIGN_TEXT
cmpq %r8,%r9
jne 10163224f
ret
ALIGN_TEXT
ALIGN_TEXT
103200:
movq (%rdi),%r8
movq 8(%rdi),%r9
@@ -140,7 +146,7 @@ ALIGN_TEXT
*
* Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
*/
ALIGN_TEXT
ALIGN_TEXT
10320016:
leaq 16(%rdi),%rdi
leaq 16(%rsi),%rsi
@@ -152,29 +158,29 @@ ALIGN_TEXT
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10081608:
10163224:
leaq -8(%rdi,%rdx),%rdi
leaq -8(%rsi,%rdx),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10163216:
leaq -16(%rdi,%rdx),%rdi
leaq -16(%rsi,%rdx),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10163208:
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10040804:
leaq -4(%rdi,%rdx),%rdi
leaq -4(%rsi,%rdx),%rsi
jmp 1f
ALIGN_TEXT
ALIGN_TEXT
80:
movl (%rdi),%r8d
movl (%rsi),%r9d


@@ -93,9 +93,26 @@ ENTRY(memcmp)
cmpq $16,%rdx
ja 101632f
100816:
cmpb $8,%dl
jl 100408f
jg 100816f
cmpb $4,%dl
jg 100408f
cmpb $2,%dl
jge 100204f
cmpb $1,%dl
jl 100000f
movzbl (%rdi),%eax
movzbl (%rsi),%r8d
subl %r8d,%eax
100000:
POP_FRAME_POINTER
ret
ALIGN_TEXT
100816:
movq (%rdi),%r8
movq (%rsi),%r9
cmpq %r8,%r9
@@ -106,9 +123,8 @@ ENTRY(memcmp)
jne 10081608f
POP_FRAME_POINTER
ret
ALIGN_TEXT
100408:
cmpb $4,%dl
jl 100204f
movl (%rdi),%r8d
movl (%rsi),%r9d
cmpl %r8d,%r9d
@@ -119,9 +135,8 @@ ENTRY(memcmp)
jne 10040804f
POP_FRAME_POINTER
ret
ALIGN_TEXT
100204:
cmpb $2,%dl
jl 100001f
movzwl (%rdi),%r8d
movzwl (%rsi),%r9d
cmpl %r8d,%r9d
@@ -132,16 +147,7 @@ ENTRY(memcmp)
jne 1f
POP_FRAME_POINTER
ret
100001:
cmpb $1,%dl
jl 100000f
movzbl (%rdi),%eax
movzbl (%rsi),%r8d
subl %r8d,%eax
100000:
POP_FRAME_POINTER
ret
ALIGN_TEXT
ALIGN_TEXT
101632:
cmpq $32,%rdx
ja 103200f
@@ -163,7 +169,7 @@ ALIGN_TEXT
jne 10163224f
POP_FRAME_POINTER
ret
ALIGN_TEXT
ALIGN_TEXT
103200:
movq (%rdi),%r8
movq 8(%rdi),%r9
@@ -194,7 +200,7 @@ ALIGN_TEXT
*
* Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
*/
ALIGN_TEXT
ALIGN_TEXT
10320016:
leaq 16(%rdi),%rdi
leaq 16(%rsi),%rsi
@@ -206,29 +212,29 @@ ALIGN_TEXT
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10081608:
10163224:
leaq -8(%rdi,%rdx),%rdi
leaq -8(%rsi,%rdx),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10163216:
leaq -16(%rdi,%rdx),%rdi
leaq -16(%rsi,%rdx),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10163208:
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jmp 80f
ALIGN_TEXT
ALIGN_TEXT
10040804:
leaq -4(%rdi,%rdx),%rdi
leaq -4(%rsi,%rdx),%rsi
jmp 1f
ALIGN_TEXT
ALIGN_TEXT
80:
movl (%rdi),%r8d
movl (%rsi),%r9d