From f1be262ec11c1c35e6485f432415b5b52adb505d Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 31 Jan 2021 16:46:18 +0100
Subject: [PATCH] amd64: move memcmp checks upfront

This is a tradeoff which saves jumps for smaller sizes while making
the 8-16 range slower (roughly in line with the other cases).

Tested with glibc test suite.

For example size 3 (most common with vfs namecache) (ops/s):
before:	407086026
after:	461391995

The regressed range of 8-16 (with 8 as example):
before:	540850489
after:	461671032
---
 lib/libc/amd64/string/memcmp.S | 50 ++++++++++++++++++--------------
 sys/amd64/amd64/support.S      | 52 +++++++++++++++++++---------------
 2 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S
index 67c7df280679..0c8121f9d885 100644
--- a/lib/libc/amd64/string/memcmp.S
+++ b/lib/libc/amd64/string/memcmp.S
@@ -45,9 +45,25 @@ ENTRY(memcmp)
 	cmpq	$16,%rdx
 	ja	101632f
 
-100816:
 	cmpb	$8,%dl
-	jl	100408f
+	jg	100816f
+
+	cmpb	$4,%dl
+	jg	100408f
+
+	cmpb	$2,%dl
+	jge	100204f
+
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	subl	%r8d,%eax
+100000:
+	ret
+
+	ALIGN_TEXT
+100816:
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
@@ -57,9 +73,8 @@ ENTRY(memcmp)
 	cmpq	%r8,%r9
 	jne	10081608f
 	ret
+	ALIGN_TEXT
 100408:
-	cmpb	$4,%dl
-	jl	100204f
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
@@ -69,9 +84,8 @@ ENTRY(memcmp)
 	cmpl	%r8d,%r9d
 	jne	10040804f
 	ret
+	ALIGN_TEXT
 100204:
-	cmpb	$2,%dl
-	jl	100001f
 	movzwl	(%rdi),%r8d
 	movzwl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
@@ -81,15 +95,7 @@ ENTRY(memcmp)
 	cmpl	%r8d,%r9d
 	jne	1f
 	ret
-100001:
-	cmpb	$1,%dl
-	jl	100000f
-	movzbl	(%rdi),%eax
-	movzbl	(%rsi),%r8d
-	subl	%r8d,%eax
-100000:
-	ret
-ALIGN_TEXT
+	ALIGN_TEXT
 101632:
 	cmpq	$32,%rdx
 	ja	103200f
@@ -110,7 +116,7 @@ ALIGN_TEXT
 	cmpq	%r8,%r9
 	jne	10163224f
 	ret
-ALIGN_TEXT
+	ALIGN_TEXT
 103200:
 	movq	(%rdi),%r8
 	movq	8(%rdi),%r9
@@ -140,7 +146,7 @@ ALIGN_TEXT
  *
  * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
  */
-ALIGN_TEXT
+	ALIGN_TEXT
 10320016:
 	leaq	16(%rdi),%rdi
 	leaq	16(%rsi),%rsi
@@ -152,29 +158,29 @@ ALIGN_TEXT
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10081608:
 10163224:
 	leaq	-8(%rdi,%rdx),%rdi
 	leaq	-8(%rsi,%rdx),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10163216:
 	leaq	-16(%rdi,%rdx),%rdi
 	leaq	-16(%rsi,%rdx),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10163208:
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10040804:
 	leaq	-4(%rdi,%rdx),%rdi
 	leaq	-4(%rsi,%rdx),%rsi
 	jmp	1f
-ALIGN_TEXT
+	ALIGN_TEXT
 80:
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 49baa50ac294..b623fba277db 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -93,9 +93,26 @@ ENTRY(memcmp)
 	cmpq	$16,%rdx
 	ja	101632f
 
-100816:
 	cmpb	$8,%dl
-	jl	100408f
+	jg	100816f
+
+	cmpb	$4,%dl
+	jg	100408f
+
+	cmpb	$2,%dl
+	jge	100204f
+
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	subl	%r8d,%eax
+100000:
+	POP_FRAME_POINTER
+	ret
+
+	ALIGN_TEXT
+100816:
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
@@ -106,9 +123,8 @@ ENTRY(memcmp)
 	jne	10081608f
 	POP_FRAME_POINTER
 	ret
+	ALIGN_TEXT
 100408:
-	cmpb	$4,%dl
-	jl	100204f
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
@@ -119,9 +135,8 @@ ENTRY(memcmp)
 	jne	10040804f
 	POP_FRAME_POINTER
 	ret
+	ALIGN_TEXT
 100204:
-	cmpb	$2,%dl
-	jl	100001f
 	movzwl	(%rdi),%r8d
 	movzwl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
@@ -132,16 +147,7 @@ ENTRY(memcmp)
 	jne	1f
 	POP_FRAME_POINTER
 	ret
-100001:
-	cmpb	$1,%dl
-	jl	100000f
-	movzbl	(%rdi),%eax
-	movzbl	(%rsi),%r8d
-	subl	%r8d,%eax
-100000:
-	POP_FRAME_POINTER
-	ret
-ALIGN_TEXT
+	ALIGN_TEXT
 101632:
 	cmpq	$32,%rdx
 	ja	103200f
@@ -163,7 +169,7 @@ ALIGN_TEXT
 	jne	10163224f
 	POP_FRAME_POINTER
 	ret
-ALIGN_TEXT
+	ALIGN_TEXT
 103200:
 	movq	(%rdi),%r8
 	movq	8(%rdi),%r9
@@ -194,7 +200,7 @@ ALIGN_TEXT
  *
  * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
  */
-ALIGN_TEXT
+	ALIGN_TEXT
 10320016:
 	leaq	16(%rdi),%rdi
 	leaq	16(%rsi),%rsi
@@ -206,29 +212,29 @@ ALIGN_TEXT
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10081608:
 10163224:
 	leaq	-8(%rdi,%rdx),%rdi
 	leaq	-8(%rsi,%rdx),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10163216:
 	leaq	-16(%rdi,%rdx),%rdi
 	leaq	-16(%rsi,%rdx),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10163208:
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
-ALIGN_TEXT
+	ALIGN_TEXT
 10040804:
 	leaq	-4(%rdi,%rdx),%rdi
 	leaq	-4(%rsi,%rdx),%rsi
 	jmp	1f
-ALIGN_TEXT
+	ALIGN_TEXT
 80:
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
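
Note (not part of the patch, purely for illustration): a minimal C sketch of the dispatch order the change moves upfront, i.e. all size checks done before any data is loaded. The names (memcmp_le16, rd64/rd32/rd16, bytediff) are invented for this note, and the byte-by-byte fallback stands in for the bswap-based difference computation the assembly performs once a mismatching word is found.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned loads, standing in for the movq/movl/movzwl in the assembly. */
static uint64_t rd64(const unsigned char *p) { uint64_t v; memcpy(&v, p, 8); return (v); }
static uint32_t rd32(const unsigned char *p) { uint32_t v; memcpy(&v, p, 4); return (v); }
static uint16_t rd16(const unsigned char *p) { uint16_t v; memcpy(&v, p, 2); return (v); }

/*
 * Byte-by-byte difference, used here as a simple stand-in for the
 * common "mismatch found" tail of the assembly.
 */
static int
bytediff(const unsigned char *a, const unsigned char *b, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (a[i] != b[i])
			return ((int)a[i] - (int)b[i]);
	return (0);
}

/*
 * All size checks happen up front, before touching the data, mirroring
 * the patched layout for sizes <= 16.
 */
static int
memcmp_le16(const unsigned char *a, const unsigned char *b, size_t len)
{
	if (len > 8)		/* 9-16 bytes: overlapping 8-byte words (100816:) */
		return (rd64(a) == rd64(b) &&
		    rd64(a + len - 8) == rd64(b + len - 8) ?
		    0 : bytediff(a, b, len));
	if (len > 4)		/* 5-8 bytes: overlapping 4-byte words (100408:) */
		return (rd32(a) == rd32(b) &&
		    rd32(a + len - 4) == rd32(b + len - 4) ?
		    0 : bytediff(a, b, len));
	if (len >= 2)		/* 2-4 bytes: overlapping 2-byte words (100204:) */
		return (rd16(a) == rd16(b) &&
		    rd16(a + len - 2) == rd16(b + len - 2) ?
		    0 : bytediff(a, b, len));
	if (len < 1)		/* 0 bytes (100000:) */
		return (0);
	return ((int)a[0] - (int)b[0]);	/* 1 byte: movzbl/movzbl/subl */
}

In the old layout a size-3 call took a jump at the 8-byte check and another at the 4-byte check before reaching its own block; with the checks done upfront it falls through both and takes a single branch to the 2-4 byte path. The quoted numbers work out to roughly a 13% gain for size 3 and a 15% loss for size 8.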