amd64: implement strlen in assembly
The C variant in libkern performs excessive branching to find the non-zero byte instead of using the bsfq instruction. The same code patched to use it is still slower than the routine implemented here as the compiler keeps neglecting to perform certain optimizations (like using leaq). On top of that the routine can serve as a starting point for copyinstr which operates on words instead of bytes. Tested with glibc test suite. Sample results (calls/s): Haswell: $(perl -e "print 'A' x 3"): stock: 211198039 patched: 338626619 asm: 465609618 $(perl -e "print 'A' x 100"): stock: 83151997 patched: 98285919 asm: 120719888 AMD EPYC 7R32: $(perl -e "print 'A' x 3"): stock: 282523617 asm: 491498172 $(perl -e "print 'A' x 100"): stock: 114857172 asm: 112082057
This commit is contained in:
parent
3acea07c18
commit
af366d353b
@ -697,6 +697,72 @@ ENTRY(fillw)
|
||||
ret
|
||||
END(fillw)
|
||||
|
||||
/*
|
||||
* strlen(string)
|
||||
* %rdi
|
||||
*
|
||||
* Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
|
||||
*
|
||||
* 0x01....01 is replaced with 0 - 0x01....01 so that it can be added
|
||||
* with leaq.
|
||||
*
|
||||
* For a description see either:
|
||||
* - "Hacker's Delight" by Henry S. Warren, Jr.
|
||||
* - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
|
||||
* by Agner Fog
|
||||
*
|
||||
* The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
|
||||
*/
|
||||
ENTRY(strlen)
	PUSH_FRAME_POINTER
	/*
	 * Register roles:
	 *   %r8  = 0 - 0x01....01, so "leaq (x,%r8)" yields x - 0x01....01
	 *   %r9  = 0x80....80 mask
	 *   %r10 = original string pointer, subtracted at the end
	 *   %rdi = 8-byte aligned read cursor
	 *   %r11 = word under test (x), then ~x
	 *   %rcx = (x - 0x01....01) & ~x & 0x80....80; non-zero iff the
	 *          word contains a zero byte
	 */
	movabsq	$0xfefefefefefefeff,%r8
	movabsq	$0x8080808080808080,%r9

	movq	%rdi,%r10
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	2f

	/*
	 * Handle misaligned reads: align to 8 and fill
	 * the spurious bytes.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	/* %cl = misalignment in bits; the shift count is taken mod 64. */
	shlq	$3,%rcx
	movq	$-1,%rdx
	shlq	%cl,%rdx
	/* %rdx = 0xff in every byte that precedes the string. */
	notq	%rdx
	/* Make those bytes non-zero so they cannot be taken for the NUL. */
	orq	%rdx,%r11

	leaq	(%r11,%r8),%rcx
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jnz	3f

	/*
	 * Main loop.
	 */
	ALIGN_TEXT
1:
	leaq	8(%rdi),%rdi
2:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx
	notq	%r11
	/*
	 * The "& ~x" term has to land in %rcx, same as in the
	 * misaligned path above: %rcx is what the jz below tests and
	 * what bsfq consumes.  Without it every byte >= 0x81 is
	 * mistaken for the terminator.
	 */
	andq	%r11,%rcx
	andq	%r9,%rcx
	jz	1b
3:
	/*
	 * The lowest set bit in %rcx is the 0x80 bit of the first zero
	 * byte; convert the bit index into a byte index and add the
	 * distance of the current word from the start of the string.
	 */
	bsfq	%rcx,%rcx
	shrq	$3,%rcx
	leaq	(%rcx,%rdi),%rax
	subq	%r10,%rax
	POP_FRAME_POINTER
	ret
END(strlen)
|
||||
|
||||
/*****************************************************************************/
|
||||
/* copyout and fubyte family */
|
||||
/*****************************************************************************/
|
||||
|
@ -4085,7 +4085,6 @@ libkern/strdup.c standard
|
||||
libkern/strndup.c standard
|
||||
libkern/strlcat.c standard
|
||||
libkern/strlcpy.c standard
|
||||
libkern/strlen.c standard
|
||||
libkern/strncat.c standard
|
||||
libkern/strncmp.c standard
|
||||
libkern/strncpy.c standard
|
||||
|
@ -127,6 +127,7 @@ libkern/lshrdi3.c standard
|
||||
libkern/memcmp.c standard
|
||||
libkern/moddi3.c standard
|
||||
libkern/qdivrem.c standard
|
||||
libkern/strlen.c standard
|
||||
libkern/ucmpdi2.c standard
|
||||
libkern/udivdi3.c standard
|
||||
libkern/umoddi3.c standard
|
||||
|
@ -432,6 +432,7 @@ libkern/memcmp.c standard \
|
||||
compile-with "${NORMAL_C:N-fsanitize*}"
|
||||
libkern/memset.c standard \
|
||||
compile-with "${NORMAL_C:N-fsanitize*}"
|
||||
libkern/strlen.c standard
|
||||
libkern/arm64/crc32c_armv8.S standard
|
||||
cddl/dev/dtrace/aarch64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}"
|
||||
cddl/dev/dtrace/aarch64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}"
|
||||
|
@ -218,6 +218,7 @@ libkern/memcmp.c standard
|
||||
libkern/memset.c standard
|
||||
libkern/moddi3.c standard
|
||||
libkern/qdivrem.c standard
|
||||
libkern/strlen.c standard
|
||||
libkern/ucmpdi2.c standard
|
||||
libkern/udivdi3.c standard
|
||||
libkern/umoddi3.c standard
|
||||
|
@ -66,6 +66,7 @@ libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf
|
||||
libkern/ashldi3.c standard
|
||||
libkern/ashrdi3.c standard
|
||||
libkern/memcmp.c standard
|
||||
libkern/strlen.c standard
|
||||
|
||||
# cfe support
|
||||
dev/cfe/cfe_api.c optional cfe
|
||||
|
@ -129,6 +129,7 @@ libkern/memcmp.c standard
|
||||
libkern/memset.c standard
|
||||
libkern/moddi3.c optional powerpc | powerpcspe
|
||||
libkern/qdivrem.c optional powerpc | powerpcspe
|
||||
libkern/strlen.c standard
|
||||
libkern/ucmpdi2.c optional powerpc | powerpcspe
|
||||
libkern/udivdi3.c optional powerpc | powerpcspe
|
||||
libkern/umoddi3.c optional powerpc | powerpcspe
|
||||
|
@ -29,6 +29,7 @@ libkern/flsl.c standard
|
||||
libkern/flsll.c standard
|
||||
libkern/memcmp.c standard
|
||||
libkern/memset.c standard
|
||||
libkern/strlen.c standard
|
||||
riscv/riscv/autoconf.c standard
|
||||
riscv/riscv/bus_machdep.c standard
|
||||
riscv/riscv/bus_space_asm.S standard
|
||||
|
Loading…
x
Reference in New Issue
Block a user