lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)

Both are a lot better than the generic (pre) implementation.  We do not
beat glibc for long strings, likely because glibc switches to AVX once
the input is sufficiently long.  x86-64-v3 and v4 implementations may be
added at a future time.
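
For reference, a portable C sketch of the semantics both new versions
implement (for illustration only, not part of this commit):

	char *
	strchrnul(const char *str, int c)
	{
		while (*str != '\0' && *str != (char)c)
			str++;

		return ((char *)str);	/* cast away const, as libc interfaces do */
	}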

os: FreeBSD
arch: amd64
cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
        │ strchrnul_pre.out │         strchrnul_scalar.out         │       strchrnul_baseline.out        │
        │      sec/op       │    sec/op     vs base                │   sec/op     vs base                │
Short          129.68µ ± 3%    59.91µ ± 1%  -53.80% (p=0.000 n=20)   44.37µ ± 1%  -65.79% (p=0.000 n=20)
Mid             21.15µ ± 0%    19.30µ ± 0%   -8.76% (p=0.000 n=20)   12.30µ ± 0%  -41.85% (p=0.000 n=20)
Long           13.772µ ± 0%   11.028µ ± 0%  -19.92% (p=0.000 n=20)   3.285µ ± 0%  -76.15% (p=0.000 n=20)
geomean         33.55µ         23.36µ       -30.37%                  12.15µ       -63.80%

        │ strchrnul_pre.out │          strchrnul_scalar.out          │         strchrnul_baseline.out         │
        │        B/s        │      B/s       vs base                 │      B/s       vs base                 │
Short          919.3Mi ± 3%   1989.7Mi ± 1%  +116.45% (p=0.000 n=20)   2686.8Mi ± 1%  +192.28% (p=0.000 n=20)
Mid            5.505Gi ± 0%    6.033Gi ± 0%    +9.60% (p=0.000 n=20)    9.466Gi ± 0%   +71.97% (p=0.000 n=20)
Long           8.453Gi ± 0%   10.557Gi ± 0%   +24.88% (p=0.000 n=20)   35.441Gi ± 0%  +319.26% (p=0.000 n=20)
geomean        3.470Gi         4.983Gi        +43.62%                   9.584Gi       +176.22%

For comparison, glibc on the same machine:

        │ strchrnul_glibc.out │
        │       sec/op        │
Short             49.73µ ± 0%
Mid               14.60µ ± 0%
Long              1.237µ ± 0%
geomean           9.646µ

        │ strchrnul_glibc.out │
        │         B/s         │
Short            2.341Gi ± 0%
Mid              7.976Gi ± 0%
Long             94.14Gi ± 0%
geomean          12.07Gi

Sponsored by:	The FreeBSD Foundation
Approved by:	mjg
Differential Revision: https://reviews.freebsd.org/D41333
Robert Clausecker <fuz@FreeBSD.org>	2023-06-30 14:45:11 +00:00
commit 61f4c4d3dd (parent f1d955be2a)
2 changed files with 168 additions and 0 deletions

lib/libc/amd64/string/Makefile.inc

@@ -8,6 +8,7 @@ MDSRCS+= \
 memmove.S \
 memset.S \
 strcat.S \
+strchrnul.S \
 strcmp.S \
 strlen.S \
 stpcpy.S

lib/libc/amd64/string/strchrnul.S (new file)

@@ -0,0 +1,167 @@
/*-
* Copyright (c) 2023 The FreeBSD Foundation
*
* This software was developed by Robert Clausecker <fuz@FreeBSD.org>
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <machine/asm.h>
#include "amd64_archlevel.h"
#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled
.weak strchrnul
.set strchrnul, __strchrnul
ARCHFUNCS(__strchrnul)
ARCHFUNC(__strchrnul, scalar)
ARCHFUNC(__strchrnul, baseline)
ENDARCHFUNCS(__strchrnul)
/*
* strchrnul(str, c)
* This is implemented like strlen(str), but we check for the
* presence of both NUL and c in each iteration.
*/
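/*
 * The scalar version relies on the usual SWAR test for a zero byte:
 * (v - 0x01..01) & ~v & 0x80..80 sets the high bit of every byte of v
 * that is zero, so the least significant set bit of the result marks
 * the first NUL (or c) match, which tzcnt then locates.
 */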
ARCHENTRY(__strchrnul, scalar)
mov %edi, %ecx
and $~7, %rdi # align to 8 byte
movzbl %sil, %esi # clear stray high bits
movabs $0x0101010101010101, %r8
mov (%rdi), %rax # load first word
imul %r8, %rsi # replicate char 8 times
movabs $0x8080808080808080, %r9
/*
* Unaligned input: align to 8 bytes. Then proceed the same
* way as with aligned input, but ignore matches before the
* beginning of the string. This is achieved by shifting r9
* into r10 to have 0x00 bytes before the string begins.
*/
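/*
 * For example, if the string starts 3 bytes into its aligned word,
 * cl ends up as 24 and r10 as 0x8080808080000000, so any match in the
 * three garbage bytes below the start of the string is discarded.
 */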
shl $3, %ecx
mov %r9, %r10
add $8, %rdi
shl %cl, %r10 # 0x80 where the string is
neg %r8 # negate 01..01 so we can use lea
mov %rsi, %rcx
xor %rax, %rcx # str ^ c
lea (%rax, %r8, 1), %rdx # str - 0x01..01
lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
not %rax # ~str
not %rcx # ~(str ^ c)
and %rdx, %rax # (str - 0x01..01) & ~str
and %r11, %rcx # ((str ^ c) - 0x01..01) & ~(str ^ c)
or %rcx, %rax # matches for both
and %r10, %rax # not including junk bytes or bytes before the string
jnz 1f
/* main loop unrolled twice */
ALIGN_TEXT
0: mov (%rdi), %rax # str
mov %rsi, %rcx
xor %rax, %rcx # str ^ c
lea (%rax, %r8, 1), %rdx # str - 0x01..01
lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
not %rax # ~str
not %rcx # ~(str ^ c)
and %rdx, %rax # (str - 0x01..01) & ~str
and %r11, %rcx # ((str ^ c) - 0x01..01) & ~(str ^ c)
or %rcx, %rax # matches for both
and %r9, %rax # not including junk bits
jnz 2f
mov 8(%rdi), %rax # str
add $16, %rdi
mov %rsi, %rcx
xor %rax, %rcx # str ^ c
lea (%rax, %r8, 1), %rdx # str - 0x01..01
lea (%rcx, %r8, 1), %r11 # (str ^ c) - 0x01..01
not %rax # ~str
not %rcx # ~(str ^ c)
and %rdx, %rax # (str - 0x01..01) & ~str
and %r11, %rcx # ((str ^ c) - 0x01..01) & ~(str ^ c)
or %rcx, %rax # matches for both
and %r9, %rax # not including junk bits
jz 0b
/* NUL or c found */
1: sub $8, %rdi # undo advance past buffer
2: tzcnt %rax, %rax # first NUL or c byte match
shr $3, %eax # scale from bit to byte index
add %rdi, %rax # pointer to found c or NUL
ret
ARCHEND(__strchrnul, scalar)
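/*
 * The baseline version examines the string 16 bytes at a time using SSE2:
 * pcmpeqb compares each chunk against NUL and against c, por merges the
 * two results, pmovmskb condenses them into a bit mask, and tzcnt finds
 * the first matching byte.  As in the scalar code, matches before the
 * start of the string are masked off while processing the head.
 */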
ARCHENTRY(__strchrnul, baseline)
mov %edi, %ecx
and $~0xf, %rdi # align to 16 byte
movdqa (%rdi), %xmm1
movd %esi, %xmm0
and $0xf, %ecx # distance from (%rdi) to start of string
pxor %xmm2, %xmm2
mov $-1, %edx
punpcklbw %xmm0, %xmm0 # c -> cc
shl %cl, %edx # bits corresponding to bytes in the string
punpcklwd %xmm0, %xmm0 # cc -> cccc
add $16, %rdi
/* check for match in head */
pcmpeqb %xmm1, %xmm2 # NUL bytes present?
pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc
pcmpeqb %xmm0, %xmm1 # c present?
por %xmm2, %xmm1 # either present?
pmovmskb %xmm1, %eax
and %edx, %eax # match in the string?
jnz 1f
/* main loop unrolled twice */
ALIGN_TEXT
0: movdqa (%rdi), %xmm1
pxor %xmm2, %xmm2
pcmpeqb %xmm1, %xmm2 # NUL bytes present?
pcmpeqb %xmm0, %xmm1 # c present?
por %xmm2, %xmm1 # either present?
pmovmskb %xmm1, %eax
test %eax, %eax # match in the string?
jnz 2f
movdqa 16(%rdi), %xmm1
add $32, %rdi
pxor %xmm2, %xmm2
pcmpeqb %xmm1, %xmm2 # NUL bytes present?
pcmpeqb %xmm0, %xmm1 # c present?
por %xmm2, %xmm1 # either present?
pmovmskb %xmm1, %eax
test %eax, %eax # match in the string?
jz 0b
1: sub $16, %rdi # undo advance past buffer
2: tzcnt %eax, %eax # where is the match?
add %rdi, %rax # pointer to found c or NUL
ret
ARCHEND(__strchrnul, baseline)
.section .note.GNU-stack,"",%progbits