Import the updated Arm Optimized Routines

The main changes this brings in are:
 - Improves the performance of memcmp
 - Adds an SVE implementation of memcpy
 - Uses the MTE versions of some str* functions, as they are faster

Sponsored by:   The FreeBSD Foundation
Andrew Turner 2022-09-06 17:20:29 +01:00
commit d49ad20625
24 changed files with 1107 additions and 1305 deletions

View File

@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.
Regular quarterly releases are tagged as vYY.MM, the latest
release is v20.11.
release is v21.02.
Source code layout:

View File

@@ -22,7 +22,7 @@ cosf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
if (abstop12 (y) < abstop12 (pio4))
if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;

View File

@@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp)
int n;
const sincos_t *p = &__sincosf_table[0];
if (abstop12 (y) < abstop12 (pio4))
if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;

View File

@@ -12,7 +12,7 @@
/* 2PI * 2^-64. */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
static const double pio4 = 0x1.921FB54442D18p-1;
static const float pio4f = 0x1.921FB6p-1f;
/* The constants and polynomials for sine and cosine. */
typedef struct

View File

@@ -21,7 +21,7 @@ sinf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
if (abstop12 (y) < abstop12 (pio4))
if (abstop12 (y) < abstop12 (pio4f))
{
s = x * x;

View File

@@ -1,103 +1,84 @@
/* memcmp - compare memory
*
* Copyright (c) 2013-2020, Arm Limited.
* Copyright (c) 2013-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
#include "../asmdefs.h"
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define data3 x5
#define data3w w5
#define data4 x6
#define data4w w6
#define tmp x6
#define src1end x7
#define src2end x8
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data1h x4
#define data2 x5
#define data2w w5
#define data2h x6
#define tmp1 x7
#define tmp2 x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
ldp data2, data4, [src2]
ccmp data1, data2, 0, ne
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1end, src1, limit
add src2end, src2, limit
cmp limit, 32
b.ls L(last_bytes)
cmp limit, 160
b.hs L(loop_align)
sub limit, limit, 32
.p2align 4
L(loop32):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
b.ne L(return)
subs limit, limit, 8
b.gt L(more16)
ldr data1, [src1, limit]
ldr data2, [src2, limit]
b L(return)
L(more16):
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
bne L(return)
/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
strings. */
subs limit, limit, 16
ccmp data3, data4, 0, eq
b.ne L(return2)
cmp limit, 16
b.ls L(last_bytes)
/* We overlap loads between 0-32 bytes at either side of SRC1 when we
try to align, so limit it only to strings larger than 128 bytes. */
cmp limit, 96
b.ls L(loop16)
/* Align src1 and adjust src2 with bytes not yet done. */
and tmp1, src1, 15
add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
/* Loop performing 16 bytes per iteration using aligned src1.
Limit is pre-decremented by 16 and must be larger than zero.
Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
L(loop16):
ldp data1, data1h, [src1], 16
ldp data2, data2h, [src2], 16
subs limit, limit, 16
ccmp data1, data2, 0, hi
ccmp data1h, data2h, 0, eq
b.eq L(loop16)
ldp data1, data3, [src1, 32]
ldp data2, data4, [src2, 32]
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
cmp data1, data2
bne L(return)
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1, src1, 32
add src2, src2, 32
L(last64):
subs limit, limit, 32
b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
add src1, src1, limit
add src2, src2, limit
ldp data1, data1h, [src1]
ldp data2, data2h, [src2]
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
ldp data1, data3, [src1end, -16]
ldp data2, data4, [src2end, -16]
L(return2):
cmp data1, data2
csel data1, data1, data3, ne
csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -105,33 +86,105 @@ L(return):
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
L(ret_eq):
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less16):
add src1end, src1, limit
add src2end, src2, limit
tbz limit, 3, L(less8)
ldr data1, [src1]
ldr data2, [src2]
ldr data3, [src1end, -8]
ldr data4, [src2end, -8]
b L(return2)
.p2align 4
L(less8):
adds limit, limit, 4
b.lo L(less4)
ldr data1w, [src1], 4
ldr data2w, [src2], 4
tbz limit, 2, L(less4)
ldr data1w, [src1]
ldr data2w, [src2]
ldr data3w, [src1end, -4]
ldr data4w, [src2end, -4]
b L(return2)
L(less4):
tbz limit, 1, L(less2)
ldrh data1w, [src1]
ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
sub limit, limit, 4
L(less4):
adds limit, limit, 4
beq L(ret_eq)
L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq L(byte_loop)
L(less2):
mov result, 0
tbz limit, 0, L(return_zero)
ldrb data1w, [src1end, -1]
ldrb data2w, [src2end, -1]
sub result, data1w, data2w
L(return_zero):
ret
L(loop_align):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
/* Align src2 and adjust src1, src2 and limit. */
and tmp, src2, 15
sub tmp, tmp, 16
sub src2, src2, tmp
add limit, limit, tmp
sub src1, src1, tmp
sub limit, limit, 64 + 16
.p2align 4
L(loop64):
ldr q0, [src1, 16]
ldr q1, [src2, 16]
subs limit, limit, 64
ldr q2, [src1, 32]
ldr q3, [src2, 32]
eor v0.16b, v0.16b, v1.16b
eor v1.16b, v2.16b, v3.16b
ldr q2, [src1, 48]
ldr q3, [src2, 48]
umaxp v0.16b, v0.16b, v1.16b
ldr q4, [src1, 64]!
ldr q5, [src2, 64]!
eor v1.16b, v2.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
umaxp v1.16b, v1.16b, v2.16b
umaxp v0.16b, v0.16b, v1.16b
umaxp v0.16b, v0.16b, v0.16b
fmov tmp, d0
ccmp tmp, 0, 0, hi
b.eq L(loop64)
/* If equal, process last 1-64 bytes using scalar loop. */
add limit, limit, 64 + 16
cbz tmp, L(last64)
/* Determine the 8-byte aligned offset of the first difference. */
#ifdef __AARCH64EB__
rev16 tmp, tmp
#endif
rev tmp, tmp
clz tmp, tmp
bic tmp, tmp, 7
sub tmp, tmp, 48
ldr data1, [src1, tmp]
ldr data2, [src2, tmp]
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
mov result, 1
cmp data1, data2
cneg result, result, lo
ret
END (__memcmp_aarch64)
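For readers skimming the diff, here is a rough C sketch of the two ideas the new memcmp relies on: the Advanced SIMD loop XORs 16-byte blocks and reduces with a byte maximum to ask "did anything differ?", and the scalar tail byte-reverses the differing words on little-endian so a single unsigned comparison produces the -1/0/1 result. Illustrative only; the helper names and the single-step reduction are not taken from the source.
#include <arm_neon.h>
#include <stdint.h>
#include <string.h>
/* Any byte differs within a 16-byte block? Mirrors the eor + umaxp idea in
   L(loop64), collapsed here to a single max-across-vector reduction. */
static int differ16 (const unsigned char *p, const unsigned char *q)
{
  uint8x16_t x = veorq_u8 (vld1q_u8 (p), vld1q_u8 (q));
  return vmaxvq_u8 (x) != 0;
}
/* Scalar tail, as in L(return2)/L(return): on little-endian the words are
   byte-reversed first so that an unsigned word comparison orders the bytes
   exactly as memcmp() requires. */
static int cmp_word (const unsigned char *p, const unsigned char *q)
{
  uint64_t a, b;
  memcpy (&a, p, 8);
  memcpy (&b, q, 8);
#ifndef __AARCH64EB__
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
#endif
  if (a == b)
    return 0;
  return a < b ? -1 : 1;	/* cset/cneg pair in the assembly */
}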

View File

@@ -0,0 +1,180 @@
/*
* memcpy - copy memory area
*
* Copyright (c) 2019-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
*
*/
#if __ARM_FEATURE_SVE
#include "../asmdefs.h"
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp1 x6
#define vlen x6
#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
SVE vectors are used to speedup small copies.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The source pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
whilelo p0.b, xzr, count
cntb vlen
tbnz vlen, 4, L(vlen128)
ld1b z0.b, p0/z, [src]
st1b z0.b, p0, [dstin]
ret
/* Medium copies: 33..128 bytes. */
L(copy32_128):
add srcend, src, count
add dstend, dstin, count
ldp A_q, B_q, [src]
ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
stp A_q, B_q, [dstin]
stp C_q, D_q, [dstend, -32]
ret
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
ldp G_q, H_q, [srcend, -64]
stp G_q, H_q, [dstend, -64]
L(copy96):
stp A_q, B_q, [dstin]
stp E_q, F_q, [dstin, 32]
stp C_q, D_q, [dstend, -32]
ret
/* Copy more than 128 bytes. */
L(copy_long):
add srcend, src, count
add dstend, dstin, count
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align src to 16-byte alignment. */
ldr D_q, [src]
and tmp1, src, 15
bic src, src, 15
sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_q, B_q, [src, 16]
str D_q, [dstin]
ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [src, 80]
stp C_q, D_q, [dst, 48]
ldp C_q, D_q, [src, 112]
add src, src, 64
add dst, dst, 64
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_q, F_q, [srcend, -64]
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [srcend, -32]
stp C_q, D_q, [dst, 48]
stp E_q, F_q, [dstend, -64]
stp A_q, B_q, [dstend, -32]
ret
L(vlen128):
whilelo p1.b, vlen, count
ld1b z0.b, p0/z, [src, 0, mul vl]
ld1b z1.b, p1/z, [src, 1, mul vl]
st1b z0.b, p0, [dstin, 0, mul vl]
st1b z1.b, p1, [dstin, 1, mul vl]
ret
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
cbz tmp1, L(return)
ldr D_q, [srcend, -16]
and tmp1, srcend, 15
bic srcend, srcend, 15
sub count, count, tmp1
ldp A_q, B_q, [srcend, -32]
str D_q, [dstend, -16]
ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
str B_q, [dstend, -16]
str A_q, [dstend, -32]
ldp A_q, B_q, [srcend, -96]
str D_q, [dstend, -48]
str C_q, [dstend, -64]!
ldp C_q, D_q, [srcend, -128]
sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp E_q, F_q, [src, 32]
stp A_q, B_q, [dstend, -32]
ldp A_q, B_q, [src]
stp C_q, D_q, [dstend, -64]
stp E_q, F_q, [dstin, 32]
stp A_q, B_q, [dstin]
L(return):
ret
END (__memcpy_aarch64_sve)
#endif
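The comment block in the new SVE memcpy describes the strategy; two of its building blocks can be sketched with C intrinsics as below. Both helpers are illustrative assumptions rather than the routine itself, and the small-copy sketch simplifies the vector-length dispatch to a plain comparison.
#include <arm_sve.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
/* Small copies (count <= 32): one or two predicated vector copies, mirroring
   the whilelo/ld1b/st1b sequence at the top of the routine. With 256-bit or
   wider vectors one pair suffices; with 128-bit vectors a second predicate
   covers bytes 16..31 (cf. L(vlen128)). */
static void small_copy_sve (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint64_t vl = svcntb ();
  svbool_t p0 = svwhilelt_b8_u64 (0, count);
  svst1_u8 (p0, dst, svld1_u8 (p0, src));
  if (vl < 32)
    {
      svbool_t p1 = svwhilelt_b8_u64 (vl, count);
      svst1_u8 (p1, dst + vl, svld1_u8 (p1, src + vl));
    }
}
/* Overlap test from L(copy_long): a forward copy is unsafe only when the
   destination starts inside the source range, i.e. dst - src < count as an
   unsigned difference (which also handles dst < src for free). */
static bool use_backward_copy (const void *dst, const void *src, size_t count)
{
  return (uintptr_t) dst - (uintptr_t) src < count;
}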

View File

@@ -1,10 +0,0 @@
/*
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#define BUILD_STPCPY 1
#include "strcpy-mte.S"

View File

@@ -1,189 +0,0 @@
/*
* strcmp - compare two strings
*
* Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64.
* MTE compatible.
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define src1 x0
#define src2 x1
#define result x0
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10
/* On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes. */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word.
Since carry propagation makes 0x1 bytes before a NUL byte appear
NUL too in big-endian, byte-reverse the data before the NUL check. */
ENTRY (__strcmp_aarch64_mte)
PTR_ARG (0)
PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
tst off2, 7
b.ne L(misaligned8)
cbnz tmp, L(mutual_align)
.p2align 4
L(loop_aligned):
ldr data2, [src1, off2]
ldr data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
rev tmp, data1
sub has_nul, tmp, zeroones
orr tmp, tmp, REP8_7f
#else
sub has_nul, data1, zeroones
orr tmp, data1, REP8_7f
#endif
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
ccmp data1, data2, 0, eq
b.eq L(loop_aligned)
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
rev data2, data2
#endif
clz shift, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, shift
lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, 56
sub result, data1, data2, lsr 56
ret
.p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point. */
bic src1, src1, 7
ldr data2, [src1, off2]
ldr data1, [src1], 8
neg shift, src2, lsl 3 /* Bits to alignment -64. */
mov tmp, -1
LS_FW tmp, tmp, shift
orr data1, data1, tmp
orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
checking to make sure that we don't access beyond the end of SRC2. */
cbz tmp, L(src1_aligned)
L(do_misaligned):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
cmp data1w, 0
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
tst src1, 7
b.ne L(do_misaligned)
L(src1_aligned):
neg shift, src2, lsl 3
bic src2, src2, 7
ldr data3, [src2], 8
#ifdef __AARCH64EB__
rev data3, data3
#endif
lsr tmp, zeroones, shift
orr data3, data3, tmp
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
bics has_nul, has_nul, tmp
b.ne L(tail)
sub off1, src2, src1
.p2align 4
L(loop_unaligned):
ldr data3, [src1, off1]
ldr data2, [src1, off2]
#ifdef __AARCH64EB__
rev data3, data3
#endif
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
ldr data1, [src1], 8
bics has_nul, has_nul, tmp
ccmp data1, data2, 0, eq
b.eq L(loop_unaligned)
lsl tmp, has_nul, shift
#ifdef __AARCH64EB__
rev tmp, tmp
#endif
eor diff, data1, data2
orr syndrome, diff, tmp
cbnz syndrome, L(end)
L(tail):
ldr data1, [src1]
neg shift, shift
lsr data2, data3, shift
lsr has_nul, has_nul, shift
#ifdef __AARCH64EB__
rev data2, data2
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
b L(end)
L(done):
sub result, data1, data2
ret
END (__strcmp_aarch64_mte)
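The NUL-detection principle and the mutual-alignment masking described in the comments of these strcmp variants can be written out in plain C. This is a worked illustration only (the helpers are not part of the source); as the source comment notes, carry propagation can also make 0x01 bytes next to a real NUL look like NUL on big-endian, which is why those paths byte-reverse the data before locating the terminator.
#include <stdint.h>
#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
/* Non-zero iff some byte of x is zero: (X - 1) & ~(X | 0x7f..7f) sets bit 7
   of every byte that was zero, across all 8 bytes at once. */
static uint64_t has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}
/* L(mutual_align): after rounding the pointers down to 8 bytes, the bytes
   read from before the real string start are forced to 0xff so they can
   never look like a terminator or a difference (little-endian layout,
   misalign in 1..7; cf. mov tmp, -1 / LS_FW tmp, tmp, shift / orr). */
static uint64_t mask_before_start (uint64_t word, unsigned misalign)
{
  return word | (~0ULL >> (64 - 8 * misalign));
}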

View File

@@ -1,168 +1,184 @@
/*
* strcmp - compare two strings
*
* Copyright (c) 2012-2020, Arm Limited.
* Copyright (c) 2012-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64
* ARMv8-a, AArch64.
* MTE compatible.
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp1 x7
#define tmp2 x8
#define tmp3 x9
#define zeroones x10
#define pos x11
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10
/* On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes. */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word.
Since carry propagation makes 0x1 bytes before a NUL byte appear
NUL too in big-endian, byte-reverse the data before the NUL check. */
/* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
tst off2, 7
b.ne L(misaligned8)
ands tmp1, src1, #7
b.ne L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
cbz syndrome, L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
cbnz tmp, L(mutual_align)
.p2align 4
L(loop_aligned):
ldr data2, [src1, off2]
ldr data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
rev tmp, data1
sub has_nul, tmp, zeroones
orr tmp, tmp, REP8_7f
#else
sub has_nul, data1, zeroones
orr tmp, data1, REP8_7f
#endif
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
ccmp data1, data2, 0, eq
b.eq L(loop_aligned)
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#else
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
#endif
clz shift, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, pos
lsl data2, data2, pos
lsl data1, data1, shift
lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
lsr data1, data1, 56
sub result, data1, data2, lsr 56
ret
#endif
.p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that preceed the start point. */
bic src1, src1, #7
bic src2, src2, #7
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
ldr data1, [src1], #8
neg tmp1, tmp1 /* Bits to alignment -64. */
ldr data2, [src2], #8
mov tmp2, #~0
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#endif
orr data1, data1, tmp2
orr data2, data2, tmp2
the bytes that precede the start point. */
bic src1, src1, 7
ldr data2, [src1, off2]
ldr data1, [src1], 8
neg shift, src2, lsl 3 /* Bits to alignment -64. */
mov tmp, -1
LS_FW tmp, tmp, shift
orr data1, data1, tmp
orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
checking to make sure that we don't access beyond page boundary in
SRC2. */
tst src1, #7
b.eq L(loop_misaligned)
checking to make sure that we don't access beyond the end of SRC2. */
cbz tmp, L(src1_aligned)
L(do_misaligned):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
cmp data1w, 0
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
tst src1, #7
tst src1, 7
b.ne L(do_misaligned)
L(loop_misaligned):
/* Test if we are within the last dword of the end of a 4K page. If
yes then jump back to the misaligned loop to copy a byte at a time. */
and tmp1, src2, #0xff8
eor tmp1, tmp1, #0xff8
cbz tmp1, L(do_misaligned)
ldr data1, [src1], #8
ldr data2, [src2], #8
L(src1_aligned):
neg shift, src2, lsl 3
bic src2, src2, 7
ldr data3, [src2], 8
#ifdef __AARCH64EB__
rev data3, data3
#endif
lsr tmp, zeroones, shift
orr data3, data3, tmp
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
bics has_nul, has_nul, tmp
b.ne L(tail)
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
sub off1, src2, src1
.p2align 4
L(loop_unaligned):
ldr data3, [src1, off1]
ldr data2, [src1, off2]
#ifdef __AARCH64EB__
rev data3, data3
#endif
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
ldr data1, [src1], 8
bics has_nul, has_nul, tmp
ccmp data1, data2, 0, eq
b.eq L(loop_unaligned)
lsl tmp, has_nul, shift
#ifdef __AARCH64EB__
rev tmp, tmp
#endif
eor diff, data1, data2
orr syndrome, diff, tmp
cbnz syndrome, L(end)
L(tail):
ldr data1, [src1]
neg shift, shift
lsr data2, data3, shift
lsr has_nul, has_nul, shift
#ifdef __AARCH64EB__
rev data2, data2
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
cbz syndrome, L(loop_misaligned)
b L(end)
L(done):

View File

@@ -1,161 +0,0 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
* Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#include "../asmdefs.h"
#define dstin x0
#define srcin x1
#define result x0
#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vrepmask v2
#define vend v3
#define dend d3
#define dataq2 q1
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64_mte
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64_mte
# define IFSTPCPY(X,...)
#endif
/* Core algorithm:
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
set likewise for odd bytes so that adjacent bytes can be merged. Since the
bits in the syndrome reflect the order in which things occur in the original
string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(start_loop)
#ifndef __AARCH64EB__
rbit synd, synd
#endif
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
.p2align 4,,8
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
.p2align 4
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
str data2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
IFSTPCPY (add result, dstin, len)
ret
.p2align 4
L(start_loop):
sub len, src, srcin
ldr dataq2, [srcin]
add dst, dstin, len
str dataq2, [dstin]
.p2align 5
L(loop):
str dataq, [dst], 16
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
sub tmp, len, 15
ldr dataq, [src, tmp]
str dataq, [dst, tmp]
IFSTPCPY (add result, dst, len)
ret
END (STRCPY)
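The "core algorithm" comment above, a 64-bit syndrome with four bits per byte built from a 16-byte compare and a pairwise add, can be expressed with Advanced SIMD intrinsics roughly as follows. Little-endian path only; the helper name and the ctz builtin are assumptions, standing in for the rbit+clz in the assembly.
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>
/* Index of the first NUL byte in a 16-byte chunk, or 16 if there is none.
   cmeq marks NUL bytes, the 0xf00f mask keeps 4 bits per byte, and addp
   folds the 16 bytes into a 64-bit syndrome whose first set nibble marks
   the terminator. */
static size_t first_nul16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t is_nul = vceqq_u8 (data, vdupq_n_u8 (0));
  uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t nibbles = vandq_u8 (is_nul, mask);
  uint8x16_t folded = vpaddq_u8 (nibbles, nibbles);	/* 128 -> 64 bits */
  uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0);
  if (synd == 0)
    return 16;
  return (size_t) (__builtin_ctzll (synd) >> 2);	/* 4 bits per byte */
}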

View File

@@ -1,311 +1,161 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
* Copyright (c) 2013-2020, Arm Limited.
* Copyright (c) 2020-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#include "../asmdefs.h"
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
To test the page crossing code path more thoroughly, compile with
-DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
entry path. This option is not intended for production use. */
/* Arguments and results. */
#define dstin x0
#define srcin x1
#define result x0
/* Locals and temporaries. */
#define src x2
#define dst x3
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
#define len x16
#define to_align x17
#define len x4
#define synd x4
#define tmp x5
#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vrepmask v2
#define vend v3
#define dend d3
#define dataq2 q1
#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
#define STRCPY __strcpy_aarch64
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* Core algorithm:
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* AArch64 systems have a minimum page size of 4k. We can do a quick
page size check for crossing this boundary on entry and if we
do not, then we can short-circuit much of the entry code. We
expect early page-crossing strings to be rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
predictable, even with random strings.
We don't bother checking for larger page sizes, the cost of setting
up the correct page size is just not worth the extra gain from
a small reduction in the cases taking the slow path. Note that
we only care about whether the first fetch, which may be
misaligned, crosses a page boundary - after that we move to aligned
fetches for the remainder of the string. */
#ifdef STRCPY_TEST_PAGE_CROSS
/* Make everything that isn't Qword aligned look like a page cross. */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
set likewise for odd bytes so that adjacent bytes can be merged. Since the
bits in the syndrome reflect the order in which things occur in the original
string, counting trailing zeros identifies exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
multiple byte copies and further means that by the time we
reach the bulk copy loop we know we can always use DWord
accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
with the same source string, so branch prediction is likely to
always be difficult - we mitigate against this by preferring
conditional select operations over branches whenever this is
feasible. */
and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
mov zeroones, #REP8_01
and to_align, srcin, #15
cmp tmp2, #(MIN_PAGE_SIZE - 16)
neg tmp1, to_align
/* The first fetch will straddle a (possible) page boundary iff
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
aligned string will never fail the page align check, so will
always take the fast path. */
b.gt L(page_cross)
bic src, srcin, 15
mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
L(page_cross_ok):
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* Because we expect the end to be found within 16 characters
(profiling shows this is the most common case), it's worth
swapping the bytes now to save having to recalculate the
termination syndrome later. We preserve data1 and data2
so that we can re-use the values later on. */
rev tmp2, data1
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
bics has_nul1, tmp1, tmp2
b.ne L(fp_le8)
rev tmp4, data2
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
#else
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bics has_nul1, tmp1, tmp2
b.ne L(fp_le8)
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
bics has_nul2, tmp3, tmp4
b.eq L(bulk_entry)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(start_loop)
/* The string is short (<=16 bytes). We don't know exactly how
short though, yet. Work out the exact length so that we can
quickly select the optimal copy strategy. */
L(fp_gt8):
rev has_nul2, has_nul2
clz pos, has_nul2
mov tmp2, #56
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
sub pos, tmp2, pos
#ifdef __AARCH64EB__
lsr data2, data2, pos
#else
lsl data2, data2, pos
#ifndef __AARCH64EB__
rbit synd, synd
#endif
str data2, [dst, #1]
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
.p2align 4,,8
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
.p2align 4
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
#ifdef BUILD_STPCPY
add dstin, dst, #8
#endif
str data2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
L(fp_le8):
rev has_nul1, has_nul1
clz pos, has_nul1
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
subs tmp2, pos, #24 /* Pos in bits. */
b.lt L(fp_lt4)
#ifdef __AARCH64EB__
mov tmp2, #56
sub pos, tmp2, pos
lsr data2, data1, pos
lsr data1, data1, #32
#else
lsr data2, data1, tmp2
#endif
/* 4->7 bytes to copy. */
str data2w, [dst, #-3]
str data1w, [dstin]
#ifdef BUILD_STPCPY
mov dstin, dst
#endif
ret
L(fp_lt4):
cbz pos, L(fp_lt2)
/* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
lsr data1, data1, #48
#endif
strh data1w, [dstin]
/* Fall-through, one byte (max) to go. */
L(fp_lt2):
/* Null-terminated string. Last character must be zero! */
strb wzr, [dst]
#ifdef BUILD_STPCPY
mov dstin, dst
#endif
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
IFSTPCPY (add result, dstin, len)
ret
.p2align 6
/* Aligning here ensures that the entry code and main loop all lies
within one 64-byte cache line. */
L(bulk_entry):
sub to_align, to_align, #16
stp data1, data2, [dstin]
sub src, srcin, to_align
sub dst, dstin, to_align
b L(entry_no_page_cross)
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
L(main_loop):
stp data1, data2, [dst], #16
L(entry_no_page_cross):
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq L(main_loop)
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
trailing NUL, then (re)copy the 16 bytes leading up to that. */
cmp has_nul1, #0
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, ne
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, ne
#endif
rev has_nul1, has_nul1
clz pos, has_nul1
add tmp1, pos, #72
add pos, pos, #8
csel pos, pos, tmp1, ne
add src, src, pos, lsr #3
add dst, dst, pos, lsr #3
ldp data1, data2, [src, #-32]
stp data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
sub dstin, dst, #1
#endif
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
IFSTPCPY (add result, dstin, len)
ret
L(page_cross):
bic src, srcin, #15
/* Start by loading two words at [srcin & ~15], then forcing the
bytes that precede srcin to 0xff. This means they never look
like termination bytes. */
ldp data1, data2, [src]
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
tst to_align, #7
csetm tmp2, ne
#ifdef __AARCH64EB__
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
.p2align 4
L(start_loop):
sub len, src, srcin
ldr dataq2, [srcin]
add dst, dstin, len
str dataq2, [dstin]
.p2align 5
L(loop):
str dataq, [dst], 16
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
orr data1, data1, tmp2
orr data2a, data2, tmp2
cmp to_align, #8
csinv data1, data1, xzr, lt
csel data2, data2, data2a, lt
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq L(page_cross_ok)
/* We now need to make data1 and data2 look like they've been
loaded directly from srcin. Do a rotate on the 128-bit value. */
lsl tmp1, to_align, #3 /* Bytes->bits. */
neg tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
lsl data1a, data1, tmp1
lsr tmp4, data2, tmp2
lsl data2, data2, tmp1
orr tmp4, tmp4, data1a
cmp to_align, #8
csel data1, tmp4, data2, lt
rev tmp2, data1
rev tmp4, data2
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
#else
lsr data1a, data1, tmp1
lsl tmp4, data2, tmp2
lsr data2, data2, tmp1
orr tmp4, tmp4, data1a
cmp to_align, #8
csel data1, tmp4, data2, lt
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
bic has_nul1, tmp1, tmp2
cbnz has_nul1, L(fp_le8)
bic has_nul2, tmp3, tmp4
b L(fp_gt8)
clz len, synd
lsr len, len, 2
sub tmp, len, 15
ldr dataq, [src, tmp]
str dataq, [dst, tmp]
IFSTPCPY (add result, dst, len)
ret
END (STRCPY)
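Two small tricks in this diff are worth spelling out in C: the removed entry path's quick test for whether the first 16-byte fetch could cross a 4 KiB page, and the overlapping head/tail stores both versions use to finish short copies without a byte loop. Sketches only, with invented helper names; len denotes the index of the NUL terminator.
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Old entry path: does a 16-byte load at src straddle a 4 KiB page?
   Only the offset within the minimum page size matters. */
static bool may_cross_page (const void *src)
{
  return ((uintptr_t) src & 4095) > 4096 - 16;
}
/* Tail copy for a string whose NUL sits at index len, 16 <= len <= 31:
   copy the first 16 bytes and the 16 bytes ending at the NUL; the two
   stores may overlap, which is harmless and avoids a byte loop
   (cf. the ldr/str dataq pairs after the tbz len, 4 test). */
static void copy_overlapping16 (uint8_t *dst, const uint8_t *src, size_t len)
{
  memcpy (dst, src, 16);
  memcpy (dst + len - 15, src + len - 15, 16);
}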

View File

@@ -1,307 +0,0 @@
/*
* strncmp - compare two strings
*
* Copyright (c) 2013-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian dependent shift operations.
On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes.
LS_BK means shifting towards later bytes.
*/
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
ENTRY (__strncmp_aarch64_mte)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
.p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
L(full_check):
#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
/* Limit was reached. Check if the NUL byte or the difference
is before the limit. */
rev syndrome, syndrome
rev data1, data1
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
csel result, result, xzr, hi
ret
#else
/* Not reached the limit, must have found the end or a diff. */
tbz limit, #63, L(not_limit)
add tmp1, limit, 8
cbz limit, L(not_limit)
lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#endif
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
We also need to adjust the limit calculations, but without
overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
b L(start_realigned)
.p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
b.hs L(try_misaligned_words)
L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq L(byte_loop)
L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.ne L(done)
subs count, count, #1
b.hi L(page_end_loop)
/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
The bytes with "0" are eliminated from the syndrome via mask.
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate offset from 8 byte alignment to string start in bits. No
need to mask offset since shifts are ignoring upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)
L(loop_misaligned):
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)
ldr data1, [src1], #8
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)
#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif
L(ret0):
mov result, #0
ret
END(__strncmp_aarch64_mte)
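The diagram in the comments above shows how 8 comparable bytes of SRC2 are rebuilt from two aligned loads when the strings are mutually misaligned. On little-endian the LS_FW/LS_BK pair amounts to the following sketch, where offset_bits is the misalignment of src2 in bits and is assumed to be a multiple of 8 in the range 8..56.
#include <stdint.h>
/* STEP_A recombination: take the tail of the earlier aligned word and the
   head of the later one so the result lines up with the 8 bytes loaded from
   src1 (little-endian, so LS_FW is a right shift and LS_BK a left shift). */
static uint64_t recombine_src2 (uint64_t earlier, uint64_t later,
				unsigned offset_bits)
{
  return (earlier >> offset_bits) | (later << (64 - offset_bits));
}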

View File

@@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
* Copyright (c) 2013-2021, Arm Limited.
* Copyright (c) 2013-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64
* ARMv8-a, AArch64.
* MTE compatible.
*/
#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -35,10 +35,24 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
#define limit_wd x13
#define mask x14
#define endloop x15
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian dependent shift operations.
On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes.
LS_BK means shifting towards later bytes.
*/
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -63,30 +74,45 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
subs limit_wd, limit_wd, #1
subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
b.eq L(not_limit)
lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
#ifdef __AARCH64EB__
lsr mask, mask, limit
L(full_check):
#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
/* Limit was reached. Check if the NUL byte or the difference
is before the limit. */
rev syndrome, syndrome
rev data1, data1
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
csel result, result, xzr, hi
ret
#else
lsl mask, mask, limit
#endif
/* Not reached the limit, must have found the end or a diff. */
tbz limit, #63, L(not_limit)
add tmp1, limit, 8
cbz limit, L(not_limit)
lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask
@@ -94,25 +120,6 @@ L(start_realigned):
orr has_nul, has_nul, mask
L(not_limit):
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#else
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -133,10 +140,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +166,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
add limit, limit, count
add tmp3, tmp3, count
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -196,13 +194,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
lsr limit_wd, limit, #3
cbz count, L(do_misaligned)
cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -213,48 +209,100 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
b.lo L(done_loop)
/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
The bytes with "0" are eliminated from the syndrome via mask.
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate offset from 8 byte alignment to string start in bits. No
need to mask offset since shifts are ignoring upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)
L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
cbz tmp2, L(page_end_loop)
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)
ldr data1, [src1], #8
ldr data2, [src2], #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
b.ne L(not_limit)
subs limit_wd, limit_wd, #1
b.pl L(loop_misaligned)
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
ldr data1, [src1, limit]
ldr data2, [src2, limit]
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
b.ne L(not_limit)
/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)
#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif
L(ret0):
mov result, #0
ret
END ( __strncmp_aarch64)
END(__strncmp_aarch64)
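One easy-to-miss change in the new mutual-alignment code is the limit adjustment: the adds/csinv pair is a saturating add, so a limit near ULONG_MAX cannot wrap around. A C equivalent of the two-instruction idiom (sketch only):
#include <stdint.h>
/* adds limit, limit, count ; csinv limit, limit, xzr, lo
   i.e. a saturating unsigned add: on overflow the limit becomes ~0. */
static uint64_t adjust_limit (uint64_t limit, uint64_t count)
{
  uint64_t sum = limit + count;
  return sum < limit ? UINT64_MAX : sum;
}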

View File

@@ -1,7 +1,7 @@
/*
* memcpy benchmark.
*
* Copyright (c) 2020, Arm Limited.
* Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -13,14 +13,15 @@
#include "stringlib.h"
#include "benchlib.h"
#define ITERS 5000
#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 500000
#define MAX_COPIES 8192
#define SIZE (256*1024)
#define ITERS3 200000
#define NUM_TESTS 16384
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
#define F(x) {#x, x},
@@ -30,15 +31,18 @@ static const struct fun
void *(*fun)(void *, const void *, size_t);
} funtab[] =
{
F(memcpy)
#if __aarch64__
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve)
# endif
#elif __arm__
F(__memcpy_arm)
#endif
F(memcpy)
#undef F
{0, 0}
};
@@ -109,7 +113,7 @@ typedef struct
uint64_t len : 16;
} copy_t;
static copy_t copy[MAX_COPIES];
static copy_t test_arr[NUM_TESTS];
typedef char *(*proto_t) (char *, const char *, size_t);
@@ -140,14 +144,14 @@ init_copies (size_t max_size)
size_t total = 0;
/* Create a random set of copies with the given size and alignment
distributions. */
for (int i = 0; i < MAX_COPIES; i++)
for (int i = 0; i < NUM_TESTS; i++)
{
copy[i].dst = (rand32 (0) & (max_size - 1));
copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
copy[i].src = (rand32 (0) & (max_size - 1));
copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
total += copy[i].len;
test_arr[i].dst = (rand32 (0) & (max_size - 1));
test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].src = (rand32 (0) & (max_size - 1));
test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
total += test_arr[i].len;
}
return total;
@ -160,25 +164,27 @@ int main (void)
memset (a, 1, sizeof (a));
memset (b, 2, sizeof (b));
printf("Random memcpy:\n");
printf("Random memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total = 0;
uint64_t tsum = 0;
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
rand32 (0x12345678);
for (int size = 16384; size <= SIZE; size *= 2)
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;
for (int c = 0; c < MAX_COPIES; c++)
funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < MAX_COPIES; c++)
funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
@ -187,74 +193,147 @@ int main (void)
printf( "avg %.2f\n", (double)total / tsum);
}
printf ("\nMedium memcpy:\n");
size_t total = 0;
uint64_t tsum = 0;
printf ("%22s ", "memcpy_call");
rand32 (0x12345678);
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
}
printf( "avg %.2f\n", (double)total / tsum);
printf ("\nAligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
for (int size = 16; size <= 512; size *= 2)
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}
printf ("\nLarge memcpy:\n");
printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memcpy (b, a, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
printf ("\nUnaligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
for (int size = 1024; size <= 32768; size *= 2)
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b + 3, a + 1, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}
printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memcpy (b + 3, a + 1, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
printf ("\nLarge memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
printf ("\nUnaligned forwards memmove:\n");
printf ("%22s ", "memcpy_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
memcpy (b, a, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
printf ("\nUnaligned forwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
for (int size = 1024; size <= 32768; size *= 2)
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, a + 256 + (i & 31), size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
printf ("\nUnaligned backwards memmove:\n");
printf ("\nUnaligned backwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
for (int size = 1024; size <= 32768; size *= 2)
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a + 256 + (i & 31), a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
printf ("\n");
return 0;
}
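
For reference, every loop in this benchmark reports total bytes moved divided by elapsed nanoseconds, and 1 byte/ns is numerically the same as 1 GB/s (10^9 bytes per second). A condensed sketch of the fixed-size measurement pattern used above, assuming the a/b buffers, ITERS2, and benchlib.h's clock_get_ns() from this file:

/* Sketch of the per-size timing loop above: time ITERS2 calls and convert to
   bytes per nanosecond.  Relies on this file's a/b buffers and clock_get_ns().  */
static double
bytes_per_ns (void *(*fn) (void *, const void *, size_t), size_t size)
{
  uint64_t t = clock_get_ns ();
  for (int i = 0; i < ITERS2; i++)
    fn (b, a, size);
  t = clock_get_ns () - t;
  return (double) size * ITERS2 / t;    /* 1 byte/ns == 1 GB/s */
}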

View File

@ -0,0 +1,243 @@
/*
* memset benchmark.
*
* Copyright (c) 2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "stringlib.h"
#include "benchlib.h"
#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 1000000
#define NUM_TESTS 16384
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)
static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));
#define F(x) {#x, x},
static const struct fun
{
const char *name;
void *(*fun)(void *, int, size_t);
} funtab[] =
{
#if __aarch64__
F(__memset_aarch64)
#elif __arm__
F(__memset_arm)
#endif
F(memset)
#undef F
{0, 0}
};
typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
static memset_test_t test_arr[NUM_TESTS];
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
#define SIZE_NUM 65536
#define SIZE_MASK (SIZE_NUM-1)
static uint8_t len_arr[SIZE_NUM];
/* Frequency data for memset sizes up to 4096 based on SPEC2017. */
static freq_data_t memset_len_freq[] =
{
{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412},
{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192},
{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118},
{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74},
{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54},
{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33},
{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22},
{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15},
{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11},
{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6},
{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5},
{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3},
{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2},
{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2},
{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2},
{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1},
{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1},
{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1},
{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1},
{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1},
{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1},
{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1},
{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1},
{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1},
{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1},
{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1},
{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0}
};
#define ALIGN_NUM 1024
#define ALIGN_MASK (ALIGN_NUM-1)
static uint8_t align_arr[ALIGN_NUM];
/* Alignment data for memset based on SPEC2017. */
static align_data_t memset_align_freq[] =
{
{16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
};
static void
init_memset_distribution (void)
{
int i, j, freq, size, n;
for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
len_arr[n++] = size;
assert (n == SIZE_NUM);
for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
align_arr[n++] = size - 1;
assert (n == ALIGN_NUM);
}
static size_t
init_memset (size_t max_size)
{
size_t total = 0;
/* Create a random set of memsets with the given size and alignment
distributions. */
for (int i = 0; i < NUM_TESTS; i++)
{
test_arr[i].offset = (rand32 (0) & (max_size - 1));
test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
total += test_arr[i].len;
}
return total;
}
int main (void)
{
init_memset_distribution ();
memset (a, 1, sizeof (a));
printf("Random memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total_size = 0;
uint64_t tsum = 0;
printf ("%22s ", funtab[f].name);
rand32 (0x12345678);
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t memset_size = init_memset (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
}
printf( "avg %.2f\n", (double)total_size / tsum);
}
size_t total_size = 0;
uint64_t tsum = 0;
printf ("%22s ", "memset_call");
rand32 (0x12345678);
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t memset_size = init_memset (size) * ITERS;
for (int c = 0; c < NUM_TESTS; c++)
memset (a + test_arr[c].offset, 0, test_arr[c].len);
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
memset (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
}
printf( "avg %.2f\n", (double)total_size / tsum);
printf ("\nMedium memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (a, 0, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}
printf ("%22s ", "memset_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memset (a, 0, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\nLarge memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, 0, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
printf ("%22s ", "memset_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
memset (a, 0, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n\n");
return 0;
}
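
One detail of init_memset_distribution worth spelling out: align_arr stores align - 1, so clearing those bits with offset &= ~align_arr[...] rounds a random offset down to the sampled alignment (the memcpy benchmark's dst_align_arr/src_align_arr work the same way). A tiny standalone illustration:

/* Standalone illustration of the offset-alignment trick above: the stored
   mask is (align - 1) and clearing it rounds the offset down to that alignment.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t offset = 0x1237;
  uint8_t mask = 16 - 1;       /* table entry generated for a 16-byte alignment */
  printf ("0x%x -> 0x%x\n", offset, offset & ~(uint32_t) mask);  /* 0x1237 -> 0x1230 */
  return 0;
}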

View File

@ -1,7 +1,7 @@
/*
* strlen benchmark.
*
* Copyright (c) 2020, Arm Limited.
* Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@ -13,10 +13,10 @@
#include "stringlib.h"
#include "benchlib.h"
#define ITERS 2000
#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 2000000
#define NUM_STRLEN 16384
#define NUM_TESTS 16384
#define MAX_ALIGN 32
#define MAX_STRLEN 256
@ -49,7 +49,7 @@ static const struct fun
};
#undef F
static uint16_t strlen_tests[NUM_STRLEN];
static uint16_t strlen_tests[NUM_TESTS];
typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@ -117,7 +117,7 @@ init_strlen_tests (void)
/* Create a random set of strlen input strings using the string length
and alignment distributions. */
for (int n = 0; n < NUM_STRLEN; n++)
for (int n = 0; n < NUM_TESTS; n++)
{
int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@ -141,14 +141,14 @@ int main (void)
size_t res = 0, strlen_size = 0, mask = maskv;
printf ("%22s ", funtab[f].name);
for (int c = 0; c < NUM_STRLEN; c++)
for (int c = 0; c < NUM_TESTS; c++)
strlen_size += funtab[f].fun (a + strlen_tests[c]);
strlen_size *= ITERS;
/* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_STRLEN; c++)
for (int c = 0; c < NUM_TESTS; c++)
res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
t = clock_get_ns () - t;
printf ("%.2f\n", (double)strlen_size / t);

View File

@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
int __strcmp_aarch64_mte (const char *, const char *);
int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
#endif
# if __ARM_FEATURE_SVE
void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);
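
The added prototypes make the SVE copy routines directly callable from the tests and benchmarks. A minimal caller, assuming an SVE-capable toolchain and the stringlib.h header this hunk belongs to:

/* Minimal sketch: call the new SVE memcpy through the prototype above, with a
   plain memcpy fallback when SVE is not available at compile time.  */
#include <stdio.h>
#include <string.h>
#include "stringlib.h"

int
main (void)
{
  char src[32] = "hello from aarch64", dst[32];
#if __aarch64__ && __ARM_FEATURE_SVE
  __memcpy_aarch64_sve (dst, src, sizeof (src));
#else
  memcpy (dst, src, sizeof (src));
#endif
  printf ("%s\n", dst);
  return 0;
}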

View File

@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve, 1)
# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif

View File

@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
# endif
# if __ARM_FEATURE_SVE
F(__memmove_aarch64_sve, 1)
# endif
#endif
{0, 0, 0}
// clang-format on

View File

@ -28,8 +28,7 @@ static const struct fun
// clang-format off
F(stpcpy, 0)
#if __aarch64__
F(__stpcpy_aarch64, 0)
F(__stpcpy_aarch64_mte, 1)
F(__stpcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve, 1)
# endif

View File

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcmp, 0)
#if __aarch64__
F(__strcmp_aarch64, 0)
F(__strcmp_aarch64_mte, 1)
F(__strcmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve, 1)
# endif

View File

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcpy, 0)
#if __aarch64__
F(__strcpy_aarch64, 0)
F(__strcpy_aarch64_mte, 1)
F(__strcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve, 1)
# endif

View File

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strncmp, 0)
#if __aarch64__
F(__strncmp_aarch64, 0)
F(__strncmp_aarch64_mte, 1)
F(__strncmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve, 1)
# endif
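
A note on the 0/1 second argument that changes in these test tables: the F() macro and the struct it fills sit outside the shown hunks, but the three-field terminator {0, 0, 0} in the memmove hunk above shows each entry carries a flag alongside the name and function pointer, plausibly marking whether the implementation may be exercised under MTE tag checking; that would explain why the flag flips to 1 once the separate *_mte entries are folded into the default __str*_aarch64 ones. A hypothetical sketch of that table shape (the field name and its exact meaning are assumptions):

/* Hypothetical sketch of the test-table pattern in the hunks above; the flag
   name `mte_safe' and its meaning are assumptions, not taken from the source.  */
#include <string.h>

#define F(x, mte_safe) { #x, x, mte_safe },
static const struct fun
{
  const char *name;
  int (*fun) (const char *, const char *, size_t);
  int mte_safe;
} funtab[] = {
  F (strncmp, 0)
#if __aarch64__
  F (__strncmp_aarch64, 1)
#endif
#undef F
  { 0, 0, 0 }
};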