amd64: implement strlen in assembly

The C variant in libkern performs excessive branching to find the
non-zero byte instead of using the bsfq instruction. The same code
patched to use it is still slower than the routine implemented here
as the compiler keeps neglecting to perform certain optimizations
(like using leaq).

On top of that the routine can serve as a starting point for copyinstr
which operates on words instead of bytes.

Tested with glibc test suite.

Sample results (calls/s):

Haswell:
$(perl -e "print 'A' x 3"):
stock:	211198039
patched:338626619
asm:	465609618

$(perl -e "print 'A' x 100"):
stock:	 83151997
patched: 98285919
asm:	120719888

AMD EPYC 7R32:
$(perl -e "print 'A' x 3"):
stock:	282523617
asm:	491498172

$(perl -e "print 'A' x 100"):
stock:	114857172
asm:	112082057
This commit is contained in:
Mateusz Guzik 2021-02-08 18:01:48 +01:00
parent 3acea07c18
commit af366d353b
8 changed files with 72 additions and 1 deletions

View File

@ -697,6 +697,72 @@ ENTRY(fillw)
ret
END(fillw)
/*
 * strlen(string)
 *	  %rdi
 *
 * Returns in %rax the number of bytes preceding the first NUL byte of
 * the string pointed to by %rdi.
 *
 * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick: the lowest set bit
 * of that expression lies in the first zero byte of the 64-bit word x.
 *
 * 0x01....01 is replaced with 0 - 0x01....01 so that it can be added
 * with leaq.
 *
 * For a description see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
 *   by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
 *
 * Memory is only ever read as 8-byte aligned words, so the routine may
 * touch up to 7 bytes before the start of the string and some bytes past
 * the terminator, but always within an aligned word that contains at
 * least one byte of the string (or its terminator).
 *
 * Register usage:
 *	%rdi	address of the aligned word currently being inspected
 *	%r8	0 - 0x0101010101010101
 *	%r9	0x8080808080808080
 *	%r10	original string pointer
 *	%r11	word loaded from memory (inverted in place)
 *	%rcx	scratch; ends up holding the zero-byte mask
 *	%rdx	fill mask for the misaligned head
 *
 * Only the registers listed above and the flags are modified.
 */
ENTRY(strlen)
	PUSH_FRAME_POINTER
	movabsq	$0xfefefefefefefeff,%r8
	movabsq	$0x8080808080808080,%r9

	movq	%rdi,%r10		/* remember the start for the final subtraction */
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	2f

	/*
	 * Handle misaligned reads: align to 8 and fill
	 * the spurious bytes.
	 *
	 * %cl becomes (misalignment * 8); the bytes that precede the
	 * string within the first aligned word are forced to 0xff so that
	 * they can never be mistaken for the terminator.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	shlq	$3,%rcx
	movq	$-1,%rdx
	shlq	%cl,%rdx		/* shift count is taken modulo 64 */
	notq	%rdx			/* 0xff in every byte before the string */
	orq	%rdx,%r11

	leaq	(%r11,%r8),%rcx		/* x - 0x01....01 */
	notq	%r11			/* ~x */
	andq	%r11,%rcx
	andq	%r9,%rcx
	jnz	3f

	/*
	 * Main loop.
	 */
	ALIGN_TEXT
1:
	leaq	8(%rdi),%rdi
2:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx		/* x - 0x01....01 */
	notq	%r11			/* ~x */
	/*
	 * The ~x term has to be folded into %rcx (the register tested
	 * below); without it every byte >= 0x81 would look like a
	 * terminator.
	 */
	andq	%r11,%rcx
	andq	%r9,%rcx
	jz	1b
3:
	/*
	 * %rcx has bit 7 set in the first zero byte of the word at (%rdi);
	 * turn the bit index into a byte index and add the distance from
	 * the original pointer.
	 */
	bsfq	%rcx,%rcx
	shrq	$3,%rcx
	leaq	(%rcx,%rdi),%rax
	subq	%r10,%rax
	POP_FRAME_POINTER
	ret
END(strlen)
/*****************************************************************************/
/* copyout and fubyte family */
/*****************************************************************************/

View File

@ -4085,7 +4085,6 @@ libkern/strdup.c standard
libkern/strndup.c standard
libkern/strlcat.c standard
libkern/strlcpy.c standard
libkern/strlen.c standard
libkern/strncat.c standard
libkern/strncmp.c standard
libkern/strncpy.c standard

View File

@ -127,6 +127,7 @@ libkern/lshrdi3.c standard
libkern/memcmp.c standard
libkern/moddi3.c standard
libkern/qdivrem.c standard
libkern/strlen.c standard
libkern/ucmpdi2.c standard
libkern/udivdi3.c standard
libkern/umoddi3.c standard

View File

@ -432,6 +432,7 @@ libkern/memcmp.c standard \
compile-with "${NORMAL_C:N-fsanitize*}"
libkern/memset.c standard \
compile-with "${NORMAL_C:N-fsanitize*}"
libkern/strlen.c standard
libkern/arm64/crc32c_armv8.S standard
cddl/dev/dtrace/aarch64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}"
cddl/dev/dtrace/aarch64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}"

View File

@ -218,6 +218,7 @@ libkern/memcmp.c standard
libkern/memset.c standard
libkern/moddi3.c standard
libkern/qdivrem.c standard
libkern/strlen.c standard
libkern/ucmpdi2.c standard
libkern/udivdi3.c standard
libkern/umoddi3.c standard

View File

@ -66,6 +66,7 @@ libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf
libkern/ashldi3.c standard
libkern/ashrdi3.c standard
libkern/memcmp.c standard
libkern/strlen.c standard
# cfe support
dev/cfe/cfe_api.c optional cfe

View File

@ -129,6 +129,7 @@ libkern/memcmp.c standard
libkern/memset.c standard
libkern/moddi3.c optional powerpc | powerpcspe
libkern/qdivrem.c optional powerpc | powerpcspe
libkern/strlen.c standard
libkern/ucmpdi2.c optional powerpc | powerpcspe
libkern/udivdi3.c optional powerpc | powerpcspe
libkern/umoddi3.c optional powerpc | powerpcspe

View File

@ -29,6 +29,7 @@ libkern/flsl.c standard
libkern/flsll.c standard
libkern/memcmp.c standard
libkern/memset.c standard
libkern/strlen.c standard
riscv/riscv/autoconf.c standard
riscv/riscv/bus_machdep.c standard
riscv/riscv/bus_space_asm.S standard