lib/libc/amd64/string/strlen.S: add amd64 baseline kernel

This performs very well.  x86-64-v3 and x86-64-v4 kernels were written,
too, but performed worse than the baseline kernel on short strings.
These may be added at a future point in time if the performance issues
can be fixed.

os: FreeBSD
arch: amd64
cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
        │ strlen_scalar.out │          strlen_baseline.out          │
        │        B/s        │     B/s       vs base                 │
Short          1.667Gi ± 1%   2.676Gi ± 1%   +60.55% (p=0.000 n=20)
Mid            5.459Gi ± 1%   8.756Gi ± 1%   +60.39% (p=0.000 n=20)
Long           15.34Gi ± 0%   52.27Gi ± 0%  +240.64% (p=0.000 n=20)
geomean        5.188Gi        10.70Gi       +106.24%

Sponsored by:	The FreeBSD Foundation
Approved by:	kib
Reviewed by:	mjg jrtc27
Differential Revision:	https://reviews.freebsd.org/D40693
This commit is contained in:
Robert Clausecker 2023-08-04 01:48:32 +03:00
parent ad2fac552c
commit d8385768fb

View File

@ -1,11 +1,18 @@
/*
/*-
* Written by Mateusz Guzik <mjg@freebsd.org>
* Copyright (c) 2023 The FreeBSD Foundation
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Public domain.
*/
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
#include "amd64_archlevel.h"
/*
* Note: this routine was written with kernel use in mind (read: no SIMD);
* it is only present in userspace as a temporary measure until something
@ -14,6 +21,11 @@ __FBSDID("$FreeBSD$");
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
/*
 * List of the strlen implementations provided by this file: the
 * integer-only "scalar" routine and the SSE2 "baseline" routine.
 *
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined in this
 * file; presumably they come from amd64_archlevel.h and build the table
 * used to pick one variant at run time.  Whether the order of the
 * entries is significant should be confirmed there.
 */
ARCHFUNCS(strlen)
ARCHFUNC(strlen, scalar)
ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)
/*
* strlen(string)
* %rdi
@ -30,7 +42,7 @@ __FBSDID("$FreeBSD$");
*
* The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
*/
ENTRY(strlen)
ARCHENTRY(strlen, scalar)
movabsq $0xfefefefefefefeff,%r8
movabsq $0x8080808080808080,%r9
@ -76,6 +88,46 @@ ENTRY(strlen)
leaq (%rcx,%rdi),%rax
subq %r10,%rax
ret
END(strlen)
ARCHEND(strlen, scalar)
ARCHENTRY(strlen, baseline)
	/*
	 * strlen using 16-byte SSE compares.
	 *
	 * In:      %rdi = string
	 * Out:     %rax = length of string
	 * Scratch: %rcx, %rsi, %rdi, %xmm1, flags
	 *
	 * All loads are from 16-byte aligned addresses.  The first chunk may
	 * therefore begin before the string; matches in those leading bytes
	 * are shifted out of the mask.  Afterwards %rdi runs 32 bytes ahead
	 * of the first chunk so the twice-unrolled loop can address its two
	 * chunks as -16(%rdi) and (%rdi).
	 */
	movq	%rdi, %rcx
	pxor	%xmm1, %xmm1
	andq	$-16, %rdi		/* round down to a 16-byte boundary */
	pcmpeqb	(%rdi), %xmm1		/* flag NUL bytes in the first chunk */
	movq	%rcx, %rsi		/* %rsi = original string pointer */
	andl	$15, %ecx		/* %ecx = bytes in the chunk preceding the string */
	pmovmskb %xmm1, %eax		/* one mask bit per byte of the chunk */
	addq	$32, %rdi		/* set up the pointer for the loop */
	shrl	%cl, %eax		/* drop matches located before the string */
	testl	%eax, %eax		/* explicit test: SHR sets no flags when %cl is 0 */
	jnz	.Lstrlen_baseline_head

	ALIGN_TEXT
.Lstrlen_baseline_loop:
	/* first chunk of this iteration */
	pxor	%xmm1, %xmm1
	pcmpeqb	-16(%rdi), %xmm1	/* flag NUL bytes */
	pmovmskb %xmm1, %eax
	testl	%eax, %eax		/* any NUL byte in this chunk? */
	jnz	.Lstrlen_baseline_found

	/* second chunk of this iteration */
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	addq	$32, %rdi		/* step to the next pair of chunks */
	testl	%eax, %eax
	jz	.Lstrlen_baseline_loop

	/*
	 * NUL found in the second chunk.  %rdi has already been advanced,
	 * so pull it back far enough that the chunk is at -16(%rdi) again,
	 * exactly as it is for a hit in the first chunk.
	 */
	subq	$16, %rdi

.Lstrlen_baseline_found:
	/* %eax is nonzero on every path that reaches this point */
	tzcntl	%eax, %eax		/* offset of the first NUL within the chunk */
	subq	%rsi, %rdi		/* distance from string start to (%rdi) */
	leaq	-16(%rdi,%rax), %rax	/* (chunk address - string) + offset = length */
	ret

.Lstrlen_baseline_head:
	/*
	 * NUL found in the first chunk.  The mask was shifted so that bit 0
	 * corresponds to the first byte of the string, hence the index of
	 * the lowest set bit is the length.
	 */
	tzcntl	%eax, %eax
	ret
ARCHEND(strlen, baseline)
.section .note.GNU-stack,"",%progbits