amd64: speed up failing case for memcmp
Once a mismatch is found, instead of branching on up to 8 individual bytes to locate it, first narrow the range down to 4 bytes. Assorted cleanups while here. Validated with the glibc test suite.
This commit is contained in:
parent
7ab99925fd
commit
aa88cc44f3
@ -107,7 +107,7 @@ END(sse2_pagezero)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* memcmp(b1, b2, len)
|
* memcmp(b1, b2, len)
|
||||||
* rdi,rsi,len
|
* rdi,rsi,rdx
|
||||||
*/
|
*/
|
||||||
ENTRY(memcmp)
|
ENTRY(memcmp)
|
||||||
PUSH_FRAME_POINTER
|
PUSH_FRAME_POINTER
|
||||||
@ -123,7 +123,7 @@ ENTRY(memcmp)
|
|||||||
movq (%rdi),%r8
|
movq (%rdi),%r8
|
||||||
movq (%rsi),%r9
|
movq (%rsi),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
jne 1f
|
jne 80f
|
||||||
movq -8(%rdi,%rdx),%r8
|
movq -8(%rdi,%rdx),%r8
|
||||||
movq -8(%rsi,%rdx),%r9
|
movq -8(%rsi,%rdx),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
@ -133,25 +133,25 @@ ENTRY(memcmp)
|
|||||||
100408:
|
100408:
|
||||||
cmpb $4,%dl
|
cmpb $4,%dl
|
||||||
jl 100204f
|
jl 100204f
|
||||||
movl (%rsi),%r8d
|
movl (%rdi),%r8d
|
||||||
movl (%rdi),%r9d
|
movl (%rsi),%r9d
|
||||||
cmpl %r8d,%r9d
|
cmpl %r8d,%r9d
|
||||||
jne 1f
|
jne 80f
|
||||||
movl -4(%rsi,%rdx),%r8d
|
movl -4(%rdi,%rdx),%r8d
|
||||||
movl -4(%rdi,%rdx),%r9d
|
movl -4(%rsi,%rdx),%r9d
|
||||||
cmpl %r8d,%r9d
|
cmpl %r8d,%r9d
|
||||||
jne 1f
|
jne 10040804f
|
||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
ret
|
ret
|
||||||
100204:
|
100204:
|
||||||
cmpb $2,%dl
|
cmpb $2,%dl
|
||||||
jl 100001f
|
jl 100001f
|
||||||
movzwl (%rsi),%r8d
|
movzwl (%rdi),%r8d
|
||||||
movzwl (%rdi),%r9d
|
movzwl (%rsi),%r9d
|
||||||
cmpl %r8d,%r9d
|
cmpl %r8d,%r9d
|
||||||
jne 1f
|
jne 1f
|
||||||
movzwl -2(%rsi,%rdx),%r8d
|
movzwl -2(%rdi,%rdx),%r8d
|
||||||
movzwl -2(%rdi,%rdx),%r9d
|
movzwl -2(%rsi,%rdx),%r9d
|
||||||
cmpl %r8d,%r9d
|
cmpl %r8d,%r9d
|
||||||
jne 1f
|
jne 1f
|
||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
@ -159,10 +159,9 @@ ENTRY(memcmp)
|
|||||||
100001:
|
100001:
|
||||||
cmpb $1,%dl
|
cmpb $1,%dl
|
||||||
jl 100000f
|
jl 100000f
|
||||||
movzbl (%rdi),%r8d
|
movzbl (%rdi),%eax
|
||||||
movzbl (%rsi),%r9d
|
movzbl (%rsi),%r8d
|
||||||
cmpb %r8b,%r9b
|
subl %r8d,%eax
|
||||||
jne 1f
|
|
||||||
100000:
|
100000:
|
||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
ret
|
ret
|
||||||
@ -173,11 +172,11 @@ ALIGN_TEXT
|
|||||||
movq (%rdi),%r8
|
movq (%rdi),%r8
|
||||||
movq (%rsi),%r9
|
movq (%rsi),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
jne 1f
|
jne 80f
|
||||||
movq 8(%rdi),%r8
|
movq 8(%rdi),%r8
|
||||||
movq 8(%rsi),%r9
|
movq 8(%rsi),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
jne 10163208f
|
jne 10163208f
|
||||||
movq -16(%rdi,%rdx),%r8
|
movq -16(%rdi,%rdx),%r8
|
||||||
movq -16(%rsi,%rdx),%r9
|
movq -16(%rsi,%rdx),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
@ -194,14 +193,14 @@ ALIGN_TEXT
|
|||||||
movq 8(%rdi),%r9
|
movq 8(%rdi),%r9
|
||||||
subq (%rsi),%r8
|
subq (%rsi),%r8
|
||||||
subq 8(%rsi),%r9
|
subq 8(%rsi),%r9
|
||||||
or %r8,%r9
|
orq %r8,%r9
|
||||||
jnz 10320000f
|
jnz 10320000f
|
||||||
|
|
||||||
movq 16(%rdi),%r8
|
movq 16(%rdi),%r8
|
||||||
movq 24(%rdi),%r9
|
movq 24(%rdi),%r9
|
||||||
subq 16(%rsi),%r8
|
subq 16(%rsi),%r8
|
||||||
subq 24(%rsi),%r9
|
subq 24(%rsi),%r9
|
||||||
or %r8,%r9
|
orq %r8,%r9
|
||||||
jnz 10320016f
|
jnz 10320016f
|
||||||
|
|
||||||
leaq 32(%rdi),%rdi
|
leaq 32(%rdi),%rdi
|
||||||
@ -214,40 +213,57 @@ ALIGN_TEXT
|
|||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Mismatch was found.
|
||||||
|
*
|
||||||
|
* Before computing the result, narrow down the range containing the mismatch (16 -> 8 -> 4 bytes).
|
||||||
|
*/
|
||||||
|
ALIGN_TEXT
|
||||||
10320016:
|
10320016:
|
||||||
leaq 16(%rdi),%rdi
|
leaq 16(%rdi),%rdi
|
||||||
leaq 16(%rsi),%rsi
|
leaq 16(%rsi),%rsi
|
||||||
10320000:
|
10320000:
|
||||||
/*
|
|
||||||
* Mismatch was found within a 16 bytes range. The part of the routine
|
|
||||||
* which calculates it only operates on sizes up to 8 bytes. Find the
|
|
||||||
* right part.
|
|
||||||
*/
|
|
||||||
movq (%rdi),%r8
|
movq (%rdi),%r8
|
||||||
movq (%rsi),%r9
|
movq (%rsi),%r9
|
||||||
cmpq %r8,%r9
|
cmpq %r8,%r9
|
||||||
jne 1f
|
jne 80f
|
||||||
leaq 8(%rdi),%rdi
|
leaq 8(%rdi),%rdi
|
||||||
leaq 8(%rsi),%rsi
|
leaq 8(%rsi),%rsi
|
||||||
jmp 1f
|
jmp 80f
|
||||||
|
ALIGN_TEXT
|
||||||
|
10081608:
|
||||||
10163224:
|
10163224:
|
||||||
leaq -8(%rdi,%rdx),%rdi
|
leaq -8(%rdi,%rdx),%rdi
|
||||||
leaq -8(%rsi,%rdx),%rsi
|
leaq -8(%rsi,%rdx),%rsi
|
||||||
jmp 1f
|
jmp 80f
|
||||||
|
ALIGN_TEXT
|
||||||
10163216:
|
10163216:
|
||||||
leaq -16(%rdi,%rdx),%rdi
|
leaq -16(%rdi,%rdx),%rdi
|
||||||
leaq -16(%rsi,%rdx),%rsi
|
leaq -16(%rsi,%rdx),%rsi
|
||||||
jmp 1f
|
jmp 80f
|
||||||
|
ALIGN_TEXT
|
||||||
10163208:
|
10163208:
|
||||||
10081608:
|
|
||||||
leaq 8(%rdi),%rdi
|
leaq 8(%rdi),%rdi
|
||||||
leaq 8(%rsi),%rsi
|
leaq 8(%rsi),%rsi
|
||||||
|
jmp 80f
|
||||||
|
ALIGN_TEXT
|
||||||
|
10040804:
|
||||||
|
leaq -4(%rdi,%rdx),%rdi
|
||||||
|
leaq -4(%rsi,%rdx),%rsi
|
||||||
jmp 1f
|
jmp 1f
|
||||||
|
|
||||||
/*
|
|
||||||
* Mismatch was found. We have no more than 8 bytes to inspect.
|
|
||||||
*/
|
|
||||||
ALIGN_TEXT
|
ALIGN_TEXT
|
||||||
|
80:
|
||||||
|
movl (%rdi),%r8d
|
||||||
|
movl (%rsi),%r9d
|
||||||
|
cmpl %r8d,%r9d
|
||||||
|
jne 1f
|
||||||
|
leaq 4(%rdi),%rdi
|
||||||
|
leaq 4(%rsi),%rsi
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We have up to 4 bytes to inspect.
|
||||||
|
*/
|
||||||
1:
|
1:
|
||||||
movzbl (%rdi),%eax
|
movzbl (%rdi),%eax
|
||||||
movzbl (%rsi),%r8d
|
movzbl (%rsi),%r8d
|
||||||
@ -266,32 +282,6 @@ ALIGN_TEXT
|
|||||||
|
|
||||||
movzbl 3(%rdi),%eax
|
movzbl 3(%rdi),%eax
|
||||||
movzbl 3(%rsi),%r8d
|
movzbl 3(%rsi),%r8d
|
||||||
cmpb %r8b,%al
|
|
||||||
jne 2f
|
|
||||||
|
|
||||||
movzbl 4(%rdi),%eax
|
|
||||||
movzbl 4(%rsi),%r8d
|
|
||||||
cmpb %r8b,%al
|
|
||||||
jne 2f
|
|
||||||
|
|
||||||
movzbl 5(%rdi),%eax
|
|
||||||
movzbl 5(%rsi),%r8d
|
|
||||||
cmpb %r8b,%al
|
|
||||||
jne 2f
|
|
||||||
|
|
||||||
movzbl 6(%rdi),%eax
|
|
||||||
movzbl 6(%rsi),%r8d
|
|
||||||
cmpb %r8b,%al
|
|
||||||
jne 2f
|
|
||||||
|
|
||||||
movzbl 7(%rdi),%eax
|
|
||||||
movzbl 7(%rsi),%r8d
|
|
||||||
cmpb %r8b,%al
|
|
||||||
jne 2f
|
|
||||||
|
|
||||||
xorl %eax,%eax
|
|
||||||
POP_FRAME_POINTER
|
|
||||||
ret
|
|
||||||
2:
|
2:
|
||||||
subl %r8d,%eax
|
subl %r8d,%eax
|
||||||
POP_FRAME_POINTER
|
POP_FRAME_POINTER
|
||||||
|
Loading…
Reference in New Issue
Block a user