amd64: speed up failing case for memcmp

Instead of branching on up to 8 bytes, drop the size to 4.

Assorted cleanups while here.

Validated with glibc test suite.
This commit is contained in:
Mateusz Guzik 2020-01-30 19:56:22 +00:00
parent 7ab99925fd
commit aa88cc44f3

View File

@ -107,7 +107,7 @@ END(sse2_pagezero)
/* /*
* memcmpy(b1, b2, len) * memcmpy(b1, b2, len)
* rdi,rsi,len * rdi,rsi,rdx
*/ */
ENTRY(memcmp) ENTRY(memcmp)
PUSH_FRAME_POINTER PUSH_FRAME_POINTER
@ -123,7 +123,7 @@ ENTRY(memcmp)
movq (%rdi),%r8 movq (%rdi),%r8
movq (%rsi),%r9 movq (%rsi),%r9
cmpq %r8,%r9 cmpq %r8,%r9
jne 1f jne 80f
movq -8(%rdi,%rdx),%r8 movq -8(%rdi,%rdx),%r8
movq -8(%rsi,%rdx),%r9 movq -8(%rsi,%rdx),%r9
cmpq %r8,%r9 cmpq %r8,%r9
@ -133,25 +133,25 @@ ENTRY(memcmp)
100408: 100408:
cmpb $4,%dl cmpb $4,%dl
jl 100204f jl 100204f
movl (%rsi),%r8d movl (%rdi),%r8d
movl (%rdi),%r9d movl (%rsi),%r9d
cmpl %r8d,%r9d cmpl %r8d,%r9d
jne 1f jne 80f
movl -4(%rsi,%rdx),%r8d movl -4(%rdi,%rdx),%r8d
movl -4(%rdi,%rdx),%r9d movl -4(%rsi,%rdx),%r9d
cmpl %r8d,%r9d cmpl %r8d,%r9d
jne 1f jne 10040804f
POP_FRAME_POINTER POP_FRAME_POINTER
ret ret
100204: 100204:
cmpb $2,%dl cmpb $2,%dl
jl 100001f jl 100001f
movzwl (%rsi),%r8d movzwl (%rdi),%r8d
movzwl (%rdi),%r9d movzwl (%rsi),%r9d
cmpl %r8d,%r9d cmpl %r8d,%r9d
jne 1f jne 1f
movzwl -2(%rsi,%rdx),%r8d movzwl -2(%rdi,%rdx),%r8d
movzwl -2(%rdi,%rdx),%r9d movzwl -2(%rsi,%rdx),%r9d
cmpl %r8d,%r9d cmpl %r8d,%r9d
jne 1f jne 1f
POP_FRAME_POINTER POP_FRAME_POINTER
@ -159,10 +159,9 @@ ENTRY(memcmp)
100001: 100001:
cmpb $1,%dl cmpb $1,%dl
jl 100000f jl 100000f
movzbl (%rdi),%r8d movzbl (%rdi),%eax
movzbl (%rsi),%r9d movzbl (%rsi),%r8d
cmpb %r8b,%r9b subl %r8d,%eax
jne 1f
100000: 100000:
POP_FRAME_POINTER POP_FRAME_POINTER
ret ret
@ -173,11 +172,11 @@ ALIGN_TEXT
movq (%rdi),%r8 movq (%rdi),%r8
movq (%rsi),%r9 movq (%rsi),%r9
cmpq %r8,%r9 cmpq %r8,%r9
jne 1f jne 80f
movq 8(%rdi),%r8 movq 8(%rdi),%r8
movq 8(%rsi),%r9 movq 8(%rsi),%r9
cmpq %r8,%r9 cmpq %r8,%r9
jne 10163208f jne 10163208f
movq -16(%rdi,%rdx),%r8 movq -16(%rdi,%rdx),%r8
movq -16(%rsi,%rdx),%r9 movq -16(%rsi,%rdx),%r9
cmpq %r8,%r9 cmpq %r8,%r9
@ -194,14 +193,14 @@ ALIGN_TEXT
movq 8(%rdi),%r9 movq 8(%rdi),%r9
subq (%rsi),%r8 subq (%rsi),%r8
subq 8(%rsi),%r9 subq 8(%rsi),%r9
or %r8,%r9 orq %r8,%r9
jnz 10320000f jnz 10320000f
movq 16(%rdi),%r8 movq 16(%rdi),%r8
movq 24(%rdi),%r9 movq 24(%rdi),%r9
subq 16(%rsi),%r8 subq 16(%rsi),%r8
subq 24(%rsi),%r9 subq 24(%rsi),%r9
or %r8,%r9 orq %r8,%r9
jnz 10320016f jnz 10320016f
leaq 32(%rdi),%rdi leaq 32(%rdi),%rdi
@ -214,40 +213,57 @@ ALIGN_TEXT
POP_FRAME_POINTER POP_FRAME_POINTER
ret ret
/*
* Mismatch was found.
*
* Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
*/
ALIGN_TEXT
10320016: 10320016:
leaq 16(%rdi),%rdi leaq 16(%rdi),%rdi
leaq 16(%rsi),%rsi leaq 16(%rsi),%rsi
10320000: 10320000:
/*
* Mismatch was found within a 16 bytes range. The part of the routine
* which calculates it only operates on sizes up to 8 bytes. Find the
* right part.
*/
movq (%rdi),%r8 movq (%rdi),%r8
movq (%rsi),%r9 movq (%rsi),%r9
cmpq %r8,%r9 cmpq %r8,%r9
jne 1f jne 80f
leaq 8(%rdi),%rdi leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi leaq 8(%rsi),%rsi
jmp 1f jmp 80f
ALIGN_TEXT
10081608:
10163224: 10163224:
leaq -8(%rdi,%rdx),%rdi leaq -8(%rdi,%rdx),%rdi
leaq -8(%rsi,%rdx),%rsi leaq -8(%rsi,%rdx),%rsi
jmp 1f jmp 80f
ALIGN_TEXT
10163216: 10163216:
leaq -16(%rdi,%rdx),%rdi leaq -16(%rdi,%rdx),%rdi
leaq -16(%rsi,%rdx),%rsi leaq -16(%rsi,%rdx),%rsi
jmp 1f jmp 80f
ALIGN_TEXT
10163208: 10163208:
10081608:
leaq 8(%rdi),%rdi leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi leaq 8(%rsi),%rsi
jmp 80f
ALIGN_TEXT
10040804:
leaq -4(%rdi,%rdx),%rdi
leaq -4(%rsi,%rdx),%rsi
jmp 1f jmp 1f
/*
* Mismatch was found. We have no more than 8 bytes to inspect.
*/
ALIGN_TEXT ALIGN_TEXT
80:
movl (%rdi),%r8d
movl (%rsi),%r9d
cmpl %r8d,%r9d
jne 1f
leaq 4(%rdi),%rdi
leaq 4(%rsi),%rsi
/*
* We have up to 4 bytes to inspect.
*/
1: 1:
movzbl (%rdi),%eax movzbl (%rdi),%eax
movzbl (%rsi),%r8d movzbl (%rsi),%r8d
@ -266,32 +282,6 @@ ALIGN_TEXT
movzbl 3(%rdi),%eax movzbl 3(%rdi),%eax
movzbl 3(%rsi),%r8d movzbl 3(%rsi),%r8d
cmpb %r8b,%al
jne 2f
movzbl 4(%rdi),%eax
movzbl 4(%rsi),%r8d
cmpb %r8b,%al
jne 2f
movzbl 5(%rdi),%eax
movzbl 5(%rsi),%r8d
cmpb %r8b,%al
jne 2f
movzbl 6(%rdi),%eax
movzbl 6(%rsi),%r8d
cmpb %r8b,%al
jne 2f
movzbl 7(%rdi),%eax
movzbl 7(%rsi),%r8d
cmpb %r8b,%al
jne 2f
xorl %eax,%eax
POP_FRAME_POINTER
ret
2: 2:
subl %r8d,%eax subl %r8d,%eax
POP_FRAME_POINTER POP_FRAME_POINTER