Import the updated Arm Optimized Routines

The main changes this brings in are:
- Improves the performance of memcmp
- Adds an SVE implementation of memcpy
- Uses the MTE versions of some str* functions, as they are faster

Sponsored by:	The FreeBSD Foundation

commit d49ad20625
@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.

Regular quarterly releases are tagged as vYY.MM, the latest
release is v20.11.
release is v21.02.

Source code layout:
@@ -22,7 +22,7 @@ cosf (float y)
  int n;
  const sincos_t *p = &__sincosf_table[0];

  if (abstop12 (y) < abstop12 (pio4))
  if (abstop12 (y) < abstop12 (pio4f))
    {
      double x2 = x * x;
@@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp)
  int n;
  const sincos_t *p = &__sincosf_table[0];

  if (abstop12 (y) < abstop12 (pio4))
  if (abstop12 (y) < abstop12 (pio4f))
    {
      double x2 = x * x;
@@ -12,7 +12,7 @@
/* 2PI * 2^-64. */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
static const double pio4 = 0x1.921FB54442D18p-1;
static const float pio4f = 0x1.921FB6p-1f;

/* The constants and polynomials for sine and cosine. */
typedef struct
@@ -21,7 +21,7 @@ sinf (float y)
  int n;
  const sincos_t *p = &__sincosf_table[0];

  if (abstop12 (y) < abstop12 (pio4))
  if (abstop12 (y) < abstop12 (pio4f))
    {
      s = x * x;
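These three hunks make the same fix: sinf, cosf and sincosf now compare against the new single-precision constant pio4f instead of the double pio4. For readers unfamiliar with abstop12, here is a sketch of what it plausibly does; the real definition lives in the library's sincosf.h, so treat this as an assumption rather than the source's code:

```c
#include <stdint.h>
#include <string.h>

/* Assumed shape of abstop12: extract the top 12 bits of the float's
   encoding with the sign bit masked off, so that magnitude comparisons
   like abstop12 (y) < abstop12 (pio4f) reduce to integer compares. */
static inline uint32_t
abstop12 (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);   /* Bit-cast without aliasing issues. */
  return (u >> 20) & 0x7ff;
}
```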
@@ -1,103 +1,84 @@
/* memcmp - compare memory
 *
 * Copyright (c) 2013-2020, Arm Limited.
 * Copyright (c) 2013-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "../asmdefs.h"

/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
#define src1 x0
#define src2 x1
#define limit x2
#define result w0

#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define data3 x5
#define data3w w5
#define data4 x6
#define data4w w6
#define tmp x6
#define src1end x7
#define src2end x8

/* Internal variables. */
#define data1 x3
#define data1w w3
#define data1h x4
#define data2 x5
#define data2w w5
#define data2h x6
#define tmp1 x7
#define tmp2 x8

ENTRY (__memcmp_aarch64)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)
        subs    limit, limit, 8
        b.lo    L(less8)

        ldr     data1, [src1], 8
        ldr     data2, [src2], 8
        cmp     limit, 16
        b.lo    L(less16)
        ldp     data1, data3, [src1]
        ldp     data2, data4, [src2]
        ccmp    data1, data2, 0, ne
        ccmp    data3, data4, 0, eq
        b.ne    L(return2)

        add     src1end, src1, limit
        add     src2end, src2, limit
        cmp     limit, 32
        b.ls    L(last_bytes)
        cmp     limit, 160
        b.hs    L(loop_align)
        sub     limit, limit, 32

        .p2align 4
L(loop32):
        ldp     data1, data3, [src1, 16]
        ldp     data2, data4, [src2, 16]
        cmp     data1, data2
        b.ne    L(return)

        subs    limit, limit, 8
        b.gt    L(more16)

        ldr     data1, [src1, limit]
        ldr     data2, [src2, limit]
        b       L(return)

L(more16):
        ldr     data1, [src1], 8
        ldr     data2, [src2], 8
        cmp     data1, data2
        b.ne    L(return)

        /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
           strings.  */
        subs    limit, limit, 16
        ccmp    data3, data4, 0, eq
        b.ne    L(return2)
        cmp     limit, 16
        b.ls    L(last_bytes)

        /* We overlap loads between 0-32 bytes at either side of SRC1 when we
           try to align, so limit it only to strings larger than 128 bytes.  */
        cmp     limit, 96
        b.ls    L(loop16)

        /* Align src1 and adjust src2 with bytes not yet done.  */
        and     tmp1, src1, 15
        add     limit, limit, tmp1
        sub     src1, src1, tmp1
        sub     src2, src2, tmp1

        /* Loop performing 16 bytes per iteration using aligned src1.
           Limit is pre-decremented by 16 and must be larger than zero.
           Exit if <= 16 bytes left to do or if the data is not equal.  */
        .p2align 4
L(loop16):
        ldp     data1, data1h, [src1], 16
        ldp     data2, data2h, [src2], 16
        subs    limit, limit, 16
        ccmp    data1, data2, 0, hi
        ccmp    data1h, data2h, 0, eq
        b.eq    L(loop16)

        ldp     data1, data3, [src1, 32]
        ldp     data2, data4, [src2, 32]
        cmp     data1, data2
        b.ne    L(return)
        mov     data1, data1h
        mov     data2, data2h
        cmp     data1, data2
        b.ne    L(return)
        ccmp    data3, data4, 0, eq
        b.ne    L(return2)
        add     src1, src1, 32
        add     src2, src2, 32
L(last64):
        subs    limit, limit, 32
        b.hi    L(loop32)

        /* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
        add     src1, src1, limit
        add     src2, src2, limit
        ldp     data1, data1h, [src1]
        ldp     data2, data2h, [src2]
        cmp     data1, data2
        b.ne    L(return)
        mov     data1, data1h
        mov     data2, data2h
        ldp     data1, data3, [src1end, -16]
        ldp     data2, data4, [src2end, -16]
L(return2):
        cmp     data1, data2
        csel    data1, data1, data3, ne
        csel    data2, data2, data4, ne

        /* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
@@ -105,33 +86,105 @@ L(return):
        rev     data1, data1
        rev     data2, data2
#endif
        cmp     data1, data2
L(ret_eq):
        cmp     data1, data2
        cset    result, ne
        cneg    result, result, lo
        ret

        .p2align 4
        /* Compare up to 8 bytes.  Limit is [-8..-1].  */
L(less16):
        add     src1end, src1, limit
        add     src2end, src2, limit
        tbz     limit, 3, L(less8)
        ldr     data1, [src1]
        ldr     data2, [src2]
        ldr     data3, [src1end, -8]
        ldr     data4, [src2end, -8]
        b       L(return2)

        .p2align 4
L(less8):
        adds    limit, limit, 4
        b.lo    L(less4)
        ldr     data1w, [src1], 4
        ldr     data2w, [src2], 4
        tbz     limit, 2, L(less4)
        ldr     data1w, [src1]
        ldr     data2w, [src2]
        ldr     data3w, [src1end, -4]
        ldr     data4w, [src2end, -4]
        b       L(return2)

L(less4):
        tbz     limit, 1, L(less2)
        ldrh    data1w, [src1]
        ldrh    data2w, [src2]
        cmp     data1w, data2w
        b.ne    L(return)
        sub     limit, limit, 4
L(less4):
        adds    limit, limit, 4
        b.eq    L(ret_eq)
L(byte_loop):
        ldrb    data1w, [src1], 1
        ldrb    data2w, [src2], 1
        subs    limit, limit, 1
        ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
        b.eq    L(byte_loop)
L(less2):
        mov     result, 0
        tbz     limit, 0, L(return_zero)
        ldrb    data1w, [src1end, -1]
        ldrb    data2w, [src2end, -1]
        sub     result, data1w, data2w
L(return_zero):
        ret

L(loop_align):
        ldp     data1, data3, [src1, 16]
        ldp     data2, data4, [src2, 16]
        cmp     data1, data2
        ccmp    data3, data4, 0, eq
        b.ne    L(return2)

        /* Align src2 and adjust src1, src2 and limit.  */
        and     tmp, src2, 15
        sub     tmp, tmp, 16
        sub     src2, src2, tmp
        add     limit, limit, tmp
        sub     src1, src1, tmp
        sub     limit, limit, 64 + 16

        .p2align 4
L(loop64):
        ldr     q0, [src1, 16]
        ldr     q1, [src2, 16]
        subs    limit, limit, 64
        ldr     q2, [src1, 32]
        ldr     q3, [src2, 32]
        eor     v0.16b, v0.16b, v1.16b
        eor     v1.16b, v2.16b, v3.16b
        ldr     q2, [src1, 48]
        ldr     q3, [src2, 48]
        umaxp   v0.16b, v0.16b, v1.16b
        ldr     q4, [src1, 64]!
        ldr     q5, [src2, 64]!
        eor     v1.16b, v2.16b, v3.16b
        eor     v2.16b, v4.16b, v5.16b
        umaxp   v1.16b, v1.16b, v2.16b
        umaxp   v0.16b, v0.16b, v1.16b
        umaxp   v0.16b, v0.16b, v0.16b
        fmov    tmp, d0
        ccmp    tmp, 0, 0, hi
        b.eq    L(loop64)

        /* If equal, process last 1-64 bytes using scalar loop.  */
        add     limit, limit, 64 + 16
        cbz     tmp, L(last64)

        /* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
        rev16   tmp, tmp
#endif
        rev     tmp, tmp
        clz     tmp, tmp
        bic     tmp, tmp, 7
        sub     tmp, tmp, 48
        ldr     data1, [src1, tmp]
        ldr     data2, [src2, tmp]
#ifndef __AARCH64EB__
        rev     data1, data1
        rev     data2, data2
#endif
        mov     result, 1
        cmp     data1, data2
        cneg    result, result, lo
        ret

END (__memcmp_aarch64)
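The new large-copy path in memcmp compares 64 bytes per iteration and folds the result into one scalar test: four XOR differences are funneled through pairwise byte maximums (UMAXP) until a single 64-bit lane says whether anything differed. A hedged C/NEON illustration of that reduction; the names are illustrative, not from the source:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Returns non-zero if any of the 64 bytes at a and b differ.  Mirrors
   the EOR + UMAXP funnel in L(loop64) above; a sketch, not the actual
   routine (the assembly also pipelines the loads across iterations). */
static int
chunks_differ (const uint8_t *a, const uint8_t *b)
{
  uint8x16_t d0 = veorq_u8 (vld1q_u8 (a),      vld1q_u8 (b));
  uint8x16_t d1 = veorq_u8 (vld1q_u8 (a + 16), vld1q_u8 (b + 16));
  uint8x16_t d2 = veorq_u8 (vld1q_u8 (a + 32), vld1q_u8 (b + 32));
  uint8x16_t d3 = veorq_u8 (vld1q_u8 (a + 48), vld1q_u8 (b + 48));
  /* Pairwise max keeps any non-zero byte alive while halving width.  */
  uint8x16_t m = vpmaxq_u8 (vpmaxq_u8 (d0, d1), vpmaxq_u8 (d2, d3));
  m = vpmaxq_u8 (m, m);   /* Only the low 64 bits remain interesting.  */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (m), 0) != 0;
}
```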
contrib/arm-optimized-routines/string/aarch64/memcpy-sve.S (new file, 180 lines)
@@ -0,0 +1,180 @@
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#if __ARM_FEATURE_SVE

#include "../asmdefs.h"

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define tmp1 x6
#define vlen x6

#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.
   SVE vectors are used to speedup small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        whilelo p0.b, xzr, count
        cntb    vlen
        tbnz    vlen, 4, L(vlen128)
        ld1b    z0.b, p0/z, [src]
        st1b    z0.b, p0, [dstin]
        ret

        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        add     srcend, src, count
        add     dstend, dstin, count
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy more than 128 bytes.  */
L(copy_long):
        add     srcend, src, count
        add     dstend, dstin, count

        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

L(vlen128):
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
        cbz     tmp1, L(return)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
L(return):
        ret

END (__memcpy_aarch64_sve)
#endif
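The small-copy entry above is the SVE win the commit message mentions: WHILELO builds a predicate covering min(count, VL) bytes, so one predicated load/store pair replaces a whole branch ladder. A rough equivalent with ACLE intrinsics, assuming count fits in a single vector as the tbnz/L(vlen128) split guarantees:

```c
#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of the predicated small copy: lanes past count are inactive,
   so nothing is loaded or stored beyond the buffers.  Assumes
   count <= vector length; larger counts take the two-vector path.  */
static void
small_copy (uint8_t *dst, const uint8_t *src, size_t count)
{
  svbool_t p = svwhilelt_b8 ((uint64_t) 0, (uint64_t) count);
  svuint8_t v = svld1_u8 (p, src);   /* Inactive lanes read as zero.  */
  svst1_u8 (p, dst, v);              /* Inactive lanes are not stored.  */
}
```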
@@ -1,10 +0,0 @@
/*
 * stpcpy - copy a string returning pointer to end.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#define BUILD_STPCPY 1

#include "strcpy-mte.S"
@@ -1,189 +0,0 @@
/*
 * strcmp - compare two strings
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

#define src1 x0
#define src2 x1
#define result x0

#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10

/* On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */

ENTRY (__strcmp_aarch64_mte)
        PTR_ARG (0)
        PTR_ARG (1)
        sub     off2, src2, src1
        mov     zeroones, REP8_01
        and     tmp, src1, 7
        tst     off2, 7
        b.ne    L(misaligned8)
        cbnz    tmp, L(mutual_align)

        .p2align 4

L(loop_aligned):
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
        rev     tmp, data1
        sub     has_nul, tmp, zeroones
        orr     tmp, tmp, REP8_7f
#else
        sub     has_nul, data1, zeroones
        orr     tmp, data1, REP8_7f
#endif
        bics    has_nul, has_nul, tmp   /* Non-zero if NUL terminator.  */
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_aligned)
#ifdef __AARCH64EB__
        rev     has_nul, has_nul
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
        rev     syndrome, syndrome
        rev     data1, data1
        rev     data2, data2
#endif
        clz     shift, syndrome
        /* The most-significant-non-zero bit of the syndrome marks either the
           first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits.  */
        lsl     data1, data1, shift
        lsl     data2, data2, shift
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction.  */
        lsr     data1, data1, 56
        sub     result, data1, data2, lsr 56
        ret

        .p2align 4

L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary.  Round down the addresses and then mask off
           the bytes that precede the start point.  */
        bic     src1, src1, 7
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
        neg     shift, src2, lsl 3      /* Bits to alignment -64.  */
        mov     tmp, -1
        LS_FW   tmp, tmp, shift
        orr     data1, data1, tmp
        orr     data2, data2, tmp
        b       L(start_realigned)

L(misaligned8):
        /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
           checking to make sure that we don't access beyond the end of SRC2.  */
        cbz     tmp, L(src1_aligned)
L(do_misaligned):
        ldrb    data1w, [src1], 1
        ldrb    data2w, [src2], 1
        cmp     data1w, 0
        ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
        b.ne    L(done)
        tst     src1, 7
        b.ne    L(do_misaligned)

L(src1_aligned):
        neg     shift, src2, lsl 3
        bic     src2, src2, 7
        ldr     data3, [src2], 8
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        lsr     tmp, zeroones, shift
        orr     data3, data3, tmp
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        bics    has_nul, has_nul, tmp
        b.ne    L(tail)

        sub     off1, src2, src1

        .p2align 4

L(loop_unaligned):
        ldr     data3, [src1, off1]
        ldr     data2, [src1, off2]
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        ldr     data1, [src1], 8
        bics    has_nul, has_nul, tmp
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_unaligned)

        lsl     tmp, has_nul, shift
#ifdef __AARCH64EB__
        rev     tmp, tmp
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, tmp
        cbnz    syndrome, L(end)
L(tail):
        ldr     data1, [src1]
        neg     shift, shift
        lsr     data2, data3, shift
        lsr     has_nul, has_nul, shift
#ifdef __AARCH64EB__
        rev     data2, data2
        rev     has_nul, has_nul
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
        b       L(end)

L(done):
        sub     result, data1, data2
        ret

END (__strcmp_aarch64_mte)
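The NUL-detection identity in the comment above is the heart of all these str* routines, so a C restatement may help when reading the SUB/ORR/BICS triples:

```c
#include <stdint.h>

/* (x - 0x01..01) & ~x & 0x80..80 is non-zero iff some byte of x is
   zero.  The assembly uses the equivalent form
   (x - 0x01..01) & ~(x | 0x7f..7f) so BICS sets the flags for free.  */
static int
has_zero_byte (uint64_t x)
{
  uint64_t ones  = 0x0101010101010101ULL;
  uint64_t highs = 0x8080808080808080ULL;
  return ((x - ones) & ~x & highs) != 0;
}
```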
@@ -1,168 +1,184 @@
/*
 * strcmp - compare two strings
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0

/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp1 x7
#define tmp2 x8
#define tmp3 x9
#define zeroones x10
#define pos x11
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10

/* On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */

/* Start of performance-critical section  -- one 64B cache line.  */
ENTRY (__strcmp_aarch64)
        PTR_ARG (0)
        PTR_ARG (1)
        eor     tmp1, src1, src2
        mov     zeroones, #REP8_01
        tst     tmp1, #7
        sub     off2, src2, src1
        mov     zeroones, REP8_01
        and     tmp, src1, 7
        tst     off2, 7
        b.ne    L(misaligned8)
        ands    tmp1, src1, #7
        b.ne    L(mutual_align)
        /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
           (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
           can be done in parallel across the entire word.  */
L(loop_aligned):
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(start_realigned):
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        eor     diff, data1, data2      /* Non-zero if differences found.  */
        bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
        orr     syndrome, diff, has_nul
        cbz     syndrome, L(loop_aligned)
        /* End of performance-critical section  -- one 64B cache line.  */
        cbnz    tmp, L(mutual_align)

        .p2align 4

L(loop_aligned):
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
        rev     tmp, data1
        sub     has_nul, tmp, zeroones
        orr     tmp, tmp, REP8_7f
#else
        sub     has_nul, data1, zeroones
        orr     tmp, data1, REP8_7f
#endif
        bics    has_nul, has_nul, tmp   /* Non-zero if NUL terminator.  */
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_aligned)
#ifdef __AARCH64EB__
        rev     has_nul, has_nul
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
#ifndef __AARCH64EB__
        rev     syndrome, syndrome
        rev     data1, data1
        /* The MS-non-zero bit of the syndrome marks either the first bit
           that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits.  */
        clz     pos, syndrome
        rev     data2, data2
        lsl     data1, data1, pos
        lsl     data2, data2, pos
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction.  */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        ret
#else
        /* For big-endian we cannot use the trick with the syndrome value
           as carry-propagation can corrupt the upper bits if the trailing
           bytes in the string contain 0x01.  */
        /* However, if there is no NUL byte in the dword, we can generate
           the result directly.  We can't just subtract the bytes as the
           MSB might be significant.  */
        cbnz    has_nul, 1f
        cmp     data1, data2
        cset    result, ne
        cneg    result, result, lo
        ret
1:
        /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
        rev     tmp3, data1
        sub     tmp1, tmp3, zeroones
        orr     tmp2, tmp3, #REP8_7f
        bic     has_nul, tmp1, tmp2
        rev     has_nul, has_nul
        orr     syndrome, diff, has_nul
        clz     pos, syndrome
        /* The MS-non-zero bit of the syndrome marks either the first bit
           that is different, or the top bit of the first zero byte.
#endif
        clz     shift, syndrome
        /* The most-significant-non-zero bit of the syndrome marks either the
           first bit that is different, or the top bit of the first zero byte.
           Shifting left now will bring the critical information into the
           top bits.  */
        lsl     data1, data1, pos
        lsl     data2, data2, pos
        lsl     data1, data1, shift
        lsl     data2, data2, shift
        /* But we need to zero-extend (char is unsigned) the value and then
           perform a signed 32-bit subtraction.  */
        lsr     data1, data1, #56
        sub     result, data1, data2, lsr #56
        lsr     data1, data1, 56
        sub     result, data1, data2, lsr 56
        ret
#endif

        .p2align 4

L(mutual_align):
        /* Sources are mutually aligned, but are not currently at an
           alignment boundary.  Round down the addresses and then mask off
           the bytes that preceed the start point.  */
        bic     src1, src1, #7
        bic     src2, src2, #7
        lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
        ldr     data1, [src1], #8
        neg     tmp1, tmp1              /* Bits to alignment -64.  */
        ldr     data2, [src2], #8
        mov     tmp2, #~0
#ifdef __AARCH64EB__
        /* Big-endian.  Early bytes are at MSB.  */
        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#else
        /* Little-endian.  Early bytes are at LSB.  */
        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#endif
        orr     data1, data1, tmp2
        orr     data2, data2, tmp2
           the bytes that precede the start point.  */
        bic     src1, src1, 7
        ldr     data2, [src1, off2]
        ldr     data1, [src1], 8
        neg     shift, src2, lsl 3      /* Bits to alignment -64.  */
        mov     tmp, -1
        LS_FW   tmp, tmp, shift
        orr     data1, data1, tmp
        orr     data2, data2, tmp
        b       L(start_realigned)

L(misaligned8):
        /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
           checking to make sure that we don't access beyond page boundary in
           SRC2.  */
        tst     src1, #7
        b.eq    L(loop_misaligned)
           checking to make sure that we don't access beyond the end of SRC2.  */
        cbz     tmp, L(src1_aligned)
L(do_misaligned):
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        cmp     data1w, #1
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
        ldrb    data1w, [src1], 1
        ldrb    data2w, [src2], 1
        cmp     data1w, 0
        ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
        b.ne    L(done)
        tst     src1, #7
        tst     src1, 7
        b.ne    L(do_misaligned)

L(loop_misaligned):
        /* Test if we are within the last dword of the end of a 4K page.  If
           yes then jump back to the misaligned loop to copy a byte at a time.  */
        and     tmp1, src2, #0xff8
        eor     tmp1, tmp1, #0xff8
        cbz     tmp1, L(do_misaligned)
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(src1_aligned):
        neg     shift, src2, lsl 3
        bic     src2, src2, 7
        ldr     data3, [src2], 8
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        lsr     tmp, zeroones, shift
        orr     data3, data3, tmp
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        bics    has_nul, has_nul, tmp
        b.ne    L(tail)

        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        eor     diff, data1, data2      /* Non-zero if differences found.  */
        bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
        sub     off1, src2, src1

        .p2align 4

L(loop_unaligned):
        ldr     data3, [src1, off1]
        ldr     data2, [src1, off2]
#ifdef __AARCH64EB__
        rev     data3, data3
#endif
        sub     has_nul, data3, zeroones
        orr     tmp, data3, REP8_7f
        ldr     data1, [src1], 8
        bics    has_nul, has_nul, tmp
        ccmp    data1, data2, 0, eq
        b.eq    L(loop_unaligned)

        lsl     tmp, has_nul, shift
#ifdef __AARCH64EB__
        rev     tmp, tmp
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, tmp
        cbnz    syndrome, L(end)
L(tail):
        ldr     data1, [src1]
        neg     shift, shift
        lsr     data2, data3, shift
        lsr     has_nul, has_nul, shift
#ifdef __AARCH64EB__
        rev     data2, data2
        rev     has_nul, has_nul
#endif
        eor     diff, data1, data2
        orr     syndrome, diff, has_nul
        cbz     syndrome, L(loop_misaligned)
        b       L(end)

L(done):
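The little-endian ordering step both versions share is worth restating: byte-reverse the syndrome so CLZ counts from the lowest-addressed byte, shift both data words so the first differing (or NUL) byte reaches the top, then subtract the zero-extended top bytes. A hedged C sketch, assuming a GCC/Clang-style compiler for the builtins and a non-zero syndrome, as holds on this path:

```c
#include <stdint.h>

/* Returns <0, 0-adjacent negative, or >0 ordering for the first byte at
   which data1 and data2 (little-endian loads) disagree or hit a NUL.  */
static int
order_words (uint64_t data1, uint64_t data2, uint64_t has_nul)
{
  uint64_t syndrome = __builtin_bswap64 ((data1 ^ data2) | has_nul);
  int shift = __builtin_clzll (syndrome);   /* Syndrome must be != 0.  */
  uint64_t d1 = __builtin_bswap64 (data1) << shift;
  uint64_t d2 = __builtin_bswap64 (data2) << shift;
  /* Zero-extend the top byte, then do a signed subtraction.  */
  return (int) (d1 >> 56) - (int) (d2 >> 56);
}
```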
@@ -1,161 +0,0 @@
/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define dstin x0
#define srcin x1
#define result x0

#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7

#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vrepmask v2
#define vend v3
#define dend d3
#define dataq2 q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64_mte
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64_mte
# define IFSTPCPY(X,...)
#endif

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte.  For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character or the byte is NUL.  Bits 4-7 must be zero.  Bits 4-7 are
   set likewise for odd bytes so that adjacent bytes can be merged.  Since the
   bits in the syndrome reflect the order in which things occur in the original
   string, counting trailing zeros identifies exactly which byte matched.  */

ENTRY (STRCPY)
        PTR_ARG (0)
        PTR_ARG (1)
        bic     src, srcin, 15
        mov     wtmp, 0xf00f
        ld1     {vdata.16b}, [src]
        dup     vrepmask.8h, wtmp
        cmeq    vhas_nul.16b, vdata.16b, 0
        lsl     shift, srcin, 2
        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        lsr     synd, synd, shift
        cbnz    synd, L(tail)

        ldr     dataq, [src, 16]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(start_loop)

#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        sub     tmp, src, srcin
        clz     len, synd
        add     len, tmp, len, lsr 2
        tbz     len, 4, L(less16)
        sub     tmp, len, 15
        ldr     dataq, [srcin]
        ldr     dataq2, [srcin, tmp]
        str     dataq, [dstin]
        str     dataq2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4,,8
L(tail):
        rbit    synd, synd
        clz     len, synd
        lsr     len, len, 2

        .p2align 4
L(less16):
        tbz     len, 3, L(less8)
        sub     tmp, len, 7
        ldr     data1, [srcin]
        ldr     data2, [srcin, tmp]
        str     data1, [dstin]
        str     data2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(less8):
        subs    tmp, len, 3
        b.lo    L(less4)
        ldr     dataw1, [srcin]
        ldr     dataw2, [srcin, tmp]
        str     dataw1, [dstin]
        str     dataw2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(less4):
        cbz     len, L(zerobyte)
        ldrh    dataw1, [srcin]
        strh    dataw1, [dstin]
L(zerobyte):
        strb    wzr, [dstin, len]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(start_loop):
        sub     len, src, srcin
        ldr     dataq2, [srcin]
        add     dst, dstin, len
        str     dataq2, [dstin]

        .p2align 5
L(loop):
        str     dataq, [dst], 16
        ldr     dataq, [src, 16]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(loop)

        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
        fmov    synd, dend
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        clz     len, synd
        lsr     len, len, 2
        sub     tmp, len, 15
        ldr     dataq, [src, tmp]
        str     dataq, [dst, tmp]
        IFSTPCPY (add result, dst, len)
        ret

END (STRCPY)
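The RBIT+CLZ pair in this MTE strcpy is effectively a count-trailing-zeros on the 4-bits-per-byte syndrome described in the "Core algorithm" comment: each matching byte contributes a nibble of set bits, so dividing the trailing-zero count by four yields the byte index. A one-line C restatement, assuming a GCC/Clang-style compiler:

```c
#include <stdint.h>

/* Byte index of the first match encoded in a 4-bit-per-byte syndrome.
   Mirrors the assembly's rbit; clz; lsr len, len, 2 sequence.  */
static unsigned
first_match_index (uint64_t synd)
{
  return (unsigned) (__builtin_ctzll (synd) >> 2);   /* Nibbles -> bytes. */
}
```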
@@ -1,311 +1,161 @@
/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2013-2020, Arm Limited.
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */

/* Arguments and results. */
#define dstin x0
#define srcin x1
#define result x0

/* Locals and temporaries. */
#define src x2
#define dst x3
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
#define len x16
#define to_align x17
#define len x4
#define synd x4
#define tmp x5
#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7

#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vrepmask v2
#define vend v3
#define dend d3
#define dataq2 q1

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
#define STRCPY __strcpy_aarch64
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.  */
/* Core algorithm:

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* AArch64 systems have a minimum page size of 4k.  We can do a quick
   page size check for crossing this boundary on entry and if we
   do not, then we can short-circuit much of the entry code.  We
   expect early page-crossing strings to be rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
   predictable, even with random strings.

   We don't bother checking for larger page sizes, the cost of setting
   up the correct page size is just not worth the extra gain from
   a small reduction in the cases taking the slow path.  Note that
   we only care about whether the first fetch, which may be
   misaligned, crosses a page boundary - after that we move to aligned
   fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte.  For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character or the byte is NUL.  Bits 4-7 must be zero.  Bits 4-7 are
   set likewise for odd bytes so that adjacent bytes can be merged.  Since the
   bits in the syndrome reflect the order in which things occur in the original
   string, counting trailing zeros identifies exactly which byte matched.  */

ENTRY (STRCPY)
        PTR_ARG (0)
        PTR_ARG (1)
        /* For moderately short strings, the fastest way to do the copy is to
           calculate the length of the string in the same way as strlen, then
           essentially do a memcpy of the result.  This avoids the need for
           multiple byte copies and further means that by the time we
           reach the bulk copy loop we know we can always use DWord
           accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
           with the same source string, so branch prediction is likely to
           always be difficult - we mitigate against this by preferring
           conditional select operations over branches whenever this is
           feasible.  */
        and     tmp2, srcin, #(MIN_PAGE_SIZE - 1)
        mov     zeroones, #REP8_01
        and     to_align, srcin, #15
        cmp     tmp2, #(MIN_PAGE_SIZE - 16)
        neg     tmp1, to_align
        /* The first fetch will straddle a (possible) page boundary iff
           srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
           aligned string will never fail the page align check, so will
           always take the fast path.  */
        b.gt    L(page_cross)
        bic     src, srcin, 15
        mov     wtmp, 0xf00f
        ld1     {vdata.16b}, [src]
        dup     vrepmask.8h, wtmp
        cmeq    vhas_nul.16b, vdata.16b, 0
        lsl     shift, srcin, 2
        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        lsr     synd, synd, shift
        cbnz    synd, L(tail)

L(page_cross_ok):
        ldp     data1, data2, [srcin]
#ifdef __AARCH64EB__
        /* Because we expect the end to be found within 16 characters
           (profiling shows this is the most common case), it's worth
           swapping the bytes now to save having to recalculate the
           termination syndrome later.  We preserve data1 and data2
           so that we can re-use the values later on.  */
        rev     tmp2, data1
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        rev     tmp4, data2
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bics    has_nul1, tmp1, tmp2
        b.ne    L(fp_le8)
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bics    has_nul2, tmp3, tmp4
        b.eq    L(bulk_entry)
        ldr     dataq, [src, 16]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(start_loop)

        /* The string is short (<=16 bytes).  We don't know exactly how
           short though, yet.  Work out the exact length so that we can
           quickly select the optimal copy strategy.  */
L(fp_gt8):
        rev     has_nul2, has_nul2
        clz     pos, has_nul2
        mov     tmp2, #56
        add     dst, dstin, pos, lsr #3 /* Bits to bytes.  */
        sub     pos, tmp2, pos
#ifdef __AARCH64EB__
        lsr     data2, data2, pos
#else
        lsl     data2, data2, pos
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        str     data2, [dst, #1]
        sub     tmp, src, srcin
        clz     len, synd
        add     len, tmp, len, lsr 2
        tbz     len, 4, L(less16)
        sub     tmp, len, 15
        ldr     dataq, [srcin]
        ldr     dataq2, [srcin, tmp]
        str     dataq, [dstin]
        str     dataq2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4,,8
L(tail):
        rbit    synd, synd
        clz     len, synd
        lsr     len, len, 2

        .p2align 4
L(less16):
        tbz     len, 3, L(less8)
        sub     tmp, len, 7
        ldr     data1, [srcin]
        ldr     data2, [srcin, tmp]
        str     data1, [dstin]
#ifdef BUILD_STPCPY
        add     dstin, dst, #8
#endif
        str     data2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(fp_le8):
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     dst, dstin, pos, lsr #3 /* Bits to bytes.  */
        subs    tmp2, pos, #24          /* Pos in bits. */
        b.lt    L(fp_lt4)
#ifdef __AARCH64EB__
        mov     tmp2, #56
        sub     pos, tmp2, pos
        lsr     data2, data1, pos
        lsr     data1, data1, #32
#else
        lsr     data2, data1, tmp2
#endif
        /* 4->7 bytes to copy.  */
        str     data2w, [dst, #-3]
        str     data1w, [dstin]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        ret
L(fp_lt4):
        cbz     pos, L(fp_lt2)
        /* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
        lsr     data1, data1, #48
#endif
        strh    data1w, [dstin]
        /* Fall-through, one byte (max) to go.  */
L(fp_lt2):
        /* Null-terminated string.  Last character must be zero!  */
        strb    wzr, [dst]
#ifdef BUILD_STPCPY
        mov     dstin, dst
#endif
        .p2align 4
L(less8):
        subs    tmp, len, 3
        b.lo    L(less4)
        ldr     dataw1, [srcin]
        ldr     dataw2, [srcin, tmp]
        str     dataw1, [dstin]
        str     dataw2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 6
        /* Aligning here ensures that the entry code and main loop all lies
           within one 64-byte cache line.  */
L(bulk_entry):
        sub     to_align, to_align, #16
        stp     data1, data2, [dstin]
        sub     src, srcin, to_align
        sub     dst, dstin, to_align
        b       L(entry_no_page_cross)

        /* The inner loop deals with two Dwords at a time.  This has a
           slightly higher start-up cost, but we should win quite quickly,
           especially on cores with a high number of issue slots per
           cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
        stp     data1, data2, [dst], #16
L(entry_no_page_cross):
        ldp     data1, data2, [src], #16
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        b.eq    L(main_loop)

        /* Since we know we are copying at least 16 bytes, the fastest way
           to deal with the tail is to determine the location of the
           trailing NUL, then (re)copy the 16 bytes leading up to that.  */
        cmp     has_nul1, #0
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul directly.  The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time.  */
        csel    data1, data1, data2, ne
        rev     data1, data1
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        bic     has_nul1, tmp1, tmp2
#else
        csel    has_nul1, has_nul1, has_nul2, ne
#endif
        rev     has_nul1, has_nul1
        clz     pos, has_nul1
        add     tmp1, pos, #72
        add     pos, pos, #8
        csel    pos, pos, tmp1, ne
        add     src, src, pos, lsr #3
        add     dst, dst, pos, lsr #3
        ldp     data1, data2, [src, #-32]
        stp     data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
        sub     dstin, dst, #1
#endif
L(less4):
        cbz     len, L(zerobyte)
        ldrh    dataw1, [srcin]
        strh    dataw1, [dstin]
L(zerobyte):
        strb    wzr, [dstin, len]
        IFSTPCPY (add result, dstin, len)
        ret

L(page_cross):
        bic     src, srcin, #15
        /* Start by loading two words at [srcin & ~15], then forcing the
           bytes that precede srcin to 0xff.  This means they never look
           like termination bytes.  */
        ldp     data1, data2, [src]
        lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
        tst     to_align, #7
        csetm   tmp2, ne
#ifdef __AARCH64EB__
        lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
#else
        lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
        .p2align 4
L(start_loop):
        sub     len, src, srcin
        ldr     dataq2, [srcin]
        add     dst, dstin, len
        str     dataq2, [dstin]

        .p2align 5
L(loop):
        str     dataq, [dst], 16
        ldr     dataq, [src, 16]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(loop)

        and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
        addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
        fmov    synd, dend
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        orr     data1, data1, tmp2
        orr     data2a, data2, tmp2
        cmp     to_align, #8
        csinv   data1, data1, xzr, lt
        csel    data2, data2, data2a, lt
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
        bic     has_nul1, tmp1, tmp2
        bics    has_nul2, tmp3, tmp4
        ccmp    has_nul1, #0, #0, eq    /* NZCV = 0000  */
        b.eq    L(page_cross_ok)
        /* We now need to make data1 and data2 look like they've been
           loaded directly from srcin.  Do a rotate on the 128-bit value.  */
        lsl     tmp1, to_align, #3      /* Bytes->bits.  */
        neg     tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
        lsl     data1a, data1, tmp1
        lsr     tmp4, data2, tmp2
        lsl     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        rev     tmp2, data1
        rev     tmp4, data2
        sub     tmp1, tmp2, zeroones
        orr     tmp2, tmp2, #REP8_7f
        sub     tmp3, tmp4, zeroones
        orr     tmp4, tmp4, #REP8_7f
#else
        lsr     data1a, data1, tmp1
        lsl     tmp4, data2, tmp2
        lsr     data2, data2, tmp1
        orr     tmp4, tmp4, data1a
        cmp     to_align, #8
        csel    data1, tmp4, data2, lt
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, #REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, #REP8_7f
#endif
        bic     has_nul1, tmp1, tmp2
        cbnz    has_nul1, L(fp_le8)
        bic     has_nul2, tmp3, tmp4
        b       L(fp_gt8)
        clz     len, synd
        lsr     len, len, 2
        sub     tmp, len, 15
        ldr     dataq, [src, tmp]
        str     dataq, [dst, tmp]
        IFSTPCPY (add result, dst, len)
        ret

END (STRCPY)
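The new strcpy tail handling is worth a note: once the NUL's index (len) is known, 16..31 byte strings are copied as one 16-byte chunk from the start and one from the end, and the two chunks may overlap in the middle. That is safe because both loads complete before either store. A hedged C sketch, assuming 16 <= len <= 31 as the tbz guard ensures:

```c
#include <stdint.h>
#include <string.h>

/* Copy a string whose NUL sits at index len using two possibly
   overlapping 16-byte chunks, mirroring the ldr/ldr/str/str pattern.  */
static void
copy_short (uint8_t *dst, const uint8_t *src, size_t len)
{
  uint8_t head[16], tail[16];
  memcpy (head, src, 16);
  memcpy (tail, src + len - 15, 16);   /* Covers bytes len-15 .. len.  */
  memcpy (dst, head, 16);
  memcpy (dst + len - 15, tail, 16);   /* Writes the NUL at dst[len].  */
}
```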
@ -1,307 +0,0 @@
|
||||
/*
|
||||
* strncmp - compare two strings
|
||||
*
|
||||
* Copyright (c) 2013-2021, Arm Limited.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
* ARMv8-a, AArch64
|
||||
*/
|
||||
|
||||
#include "../asmdefs.h"
|
||||
|
||||
#define REP8_01 0x0101010101010101
|
||||
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||
|
||||
/* Parameters and result. */
|
||||
#define src1 x0
|
||||
#define src2 x1
|
||||
#define limit x2
|
||||
#define result x0
|
||||
|
||||
/* Internal variables. */
|
||||
#define data1 x3
|
||||
#define data1w w3
|
||||
#define data2 x4
|
||||
#define data2w w4
|
||||
#define has_nul x5
|
||||
#define diff x6
|
||||
#define syndrome x7
|
||||
#define tmp1 x8
|
||||
#define tmp2 x9
|
||||
#define tmp3 x10
|
||||
#define zeroones x11
|
||||
#define pos x12
|
||||
#define mask x13
|
||||
#define endloop x14
|
||||
#define count mask
|
||||
#define offset pos
|
||||
#define neg_offset x15
|
||||
|
||||
/* Define endian dependent shift operations.
|
||||
On big-endian early bytes are at MSB and on little-endian LSB.
|
||||
LS_FW means shifting towards early bytes.
|
||||
LS_BK means shifting towards later bytes.
|
||||
*/
|
||||
#ifdef __AARCH64EB__
|
||||
#define LS_FW lsl
|
||||
#define LS_BK lsr
|
||||
#else
|
||||
#define LS_FW lsr
|
||||
#define LS_BK lsl
|
||||
#endif
|
||||
|
||||
ENTRY (__strncmp_aarch64_mte)
|
||||
PTR_ARG (0)
|
||||
PTR_ARG (1)
|
||||
SIZE_ARG (2)
|
||||
cbz limit, L(ret0)
|
||||
eor tmp1, src1, src2
|
||||
mov zeroones, #REP8_01
|
||||
tst tmp1, #7
|
||||
and count, src1, #7
|
||||
b.ne L(misaligned8)
|
||||
cbnz count, L(mutual_align)
|
||||
|
||||
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
|
||||
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
|
||||
can be done in parallel across the entire word. */
|
||||
.p2align 4
|
||||
L(loop_aligned):
|
||||
ldr data1, [src1], #8
|
||||
ldr data2, [src2], #8
|
||||
L(start_realigned):
|
||||
subs limit, limit, #8
|
||||
sub tmp1, data1, zeroones
|
||||
orr tmp2, data1, #REP8_7f
|
||||
eor diff, data1, data2 /* Non-zero if differences found. */
|
||||
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
|
||||
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
|
||||
ccmp endloop, #0, #0, eq
|
||||
b.eq L(loop_aligned)
|
||||
/* End of main loop */
|
||||
|
||||
L(full_check):
|
||||
#ifndef __AARCH64EB__
|
||||
orr syndrome, diff, has_nul
|
||||
add limit, limit, 8 /* Rewind limit to before last subs. */
|
||||
L(syndrome_check):
|
||||
/* Limit was reached. Check if the NUL byte or the difference
|
||||
is before the limit. */
|
||||
rev syndrome, syndrome
|
||||
rev data1, data1
|
||||
clz pos, syndrome
|
||||
rev data2, data2
|
||||
lsl data1, data1, pos
|
||||
cmp limit, pos, lsr #3
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
csel result, result, xzr, hi
|
||||
ret
|
||||
#else
|
||||
/* Not reached the limit, must have found the end or a diff. */
|
||||
tbz limit, #63, L(not_limit)
|
||||
add tmp1, limit, 8
|
||||
cbz limit, L(not_limit)
|
||||
|
||||
lsl limit, tmp1, #3 /* Bits -> bytes. */
|
||||
mov mask, #~0
|
||||
lsr mask, mask, limit
|
||||
bic data1, data1, mask
|
||||
bic data2, data2, mask
|
||||
|
||||
/* Make sure that the NUL byte is marked in the syndrome. */
|
||||
orr has_nul, has_nul, mask
|
||||
|
||||
L(not_limit):
|
||||
/* For big-endian we cannot use the trick with the syndrome value
|
||||
as carry-propagation can corrupt the upper bits if the trailing
|
||||
bytes in the string contain 0x01. */
|
||||
/* However, if there is no NUL byte in the dword, we can generate
|
||||
the result directly. We can't just subtract the bytes as the
|
||||
MSB might be significant. */
|
||||
cbnz has_nul, 1f
|
||||
cmp data1, data2
|
||||
cset result, ne
|
||||
cneg result, result, lo
|
||||
ret
|
||||
1:
|
||||
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
|
||||
rev tmp3, data1
|
||||
sub tmp1, tmp3, zeroones
|
||||
orr tmp2, tmp3, #REP8_7f
|
||||
bic has_nul, tmp1, tmp2
|
||||
rev has_nul, has_nul
|
||||
orr syndrome, diff, has_nul
|
||||
clz pos, syndrome
|
||||
/* The most-significant-non-zero bit of the syndrome marks either the
|
||||
first bit that is different, or the top bit of the first zero byte.
|
||||
Shifting left now will bring the critical information into the
|
||||
top bits. */
|
||||
L(end_quick):
|
||||
lsl data1, data1, pos
|
||||
lsl data2, data2, pos
|
||||
/* But we need to zero-extend (char is unsigned) the value and then
|
||||
perform a signed 32-bit subtraction. */
|
||||
lsr data1, data1, #56
|
||||
sub result, data1, data2, lsr #56
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(mutual_align):
|
||||
/* Sources are mutually aligned, but are not currently at an
|
||||
alignment boundary. Round down the addresses and then mask off
|
||||
the bytes that precede the start point.
|
||||
We also need to adjust the limit calculations, but without
|
||||
overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
b L(start_realigned)

.p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
b.hs L(try_misaligned_words)

L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq L(byte_loop)
L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
cbz count, L(src1_aligned)

neg count, count
and count, count, #7
sub limit, limit, count

L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.ne L(done)
subs count, count, #1
b.hi L(page_end_loop)

/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c

The bytes with "0" are eliminated from the syndrome via mask.

Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate offset from 8 byte alignment to string start in bits. No
need to mask offset since shifts are ignoring upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs. */
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)

ldr data1, [src1], #8
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)

/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)

ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif

L(ret0):
mov result, #0
ret
END(__strncmp_aarch64_mte)
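
Note: the STEP_A sequence above (LS_FW/LS_BK followed by orr) rebuilds an unaligned 8-byte window of SRC2 from two aligned loads. A minimal C sketch of the little-endian case, assuming a misalignment of 1-7 bytes (unaligned_window is a hypothetical name, for illustration only):

#include <stdint.h>

/* Combine two aligned 64-bit loads into the 8 bytes that start
   "offset" bits into lo_word. offset must be 8..56 here, since a
   shift by 64 is undefined in C; the assembly handles the aligned
   case on a separate path and masks shift amounts to 6 bits. */
static uint64_t
unaligned_window (uint64_t lo_word, uint64_t hi_word, unsigned offset)
{
  uint64_t lo = lo_word >> offset;        /* LS_FW data2, tmp1, offset */
  uint64_t hi = hi_word << (64 - offset); /* LS_BK tmp1, tmp2, neg_offset */
  return lo | hi;                         /* orr data2, data2, tmp1 */
}
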
@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
* Copyright (c) 2013-2021, Arm Limited.
* Copyright (c) 2013-2022, Arm Limited.
* SPDX-License-Identifier: MIT
*/

/* Assumptions:
*
* ARMv8-a, AArch64
* ARMv8-a, AArch64.
* MTE compatible.
*/

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result. */
#define src1 x0
@ -35,10 +35,24 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
#define limit_wd x13
#define mask x14
#define endloop x15
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15

/* Define endian dependent shift operations.
On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes.
LS_BK means shifting towards later bytes.
*/
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@ -63,30 +74,45 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
subs limit_wd, limit_wd, #1
subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */

/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, L(not_limit)

/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
b.eq L(not_limit)

lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
#ifdef __AARCH64EB__
lsr mask, mask, limit
L(full_check):
#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
/* Limit was reached. Check if the NUL byte or the difference
is before the limit. */
rev syndrome, syndrome
rev data1, data1
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
csel result, result, xzr, hi
ret
#else
lsl mask, mask, limit
#endif
/* Not reached the limit, must have found the end or a diff. */
tbz limit, #63, L(not_limit)
add tmp1, limit, 8
cbz limit, L(not_limit)

lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask

@ -94,25 +120,6 @@ L(start_realigned):
orr has_nul, has_nul, mask

L(not_limit):
orr syndrome, diff, has_nul

#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#else
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@ -133,10 +140,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@ -158,22 +166,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
#endif
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
add limit, limit, count
add tmp3, tmp3, count
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)

.p2align 4
@ -196,13 +194,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
lsr limit_wd, limit, #3
cbz count, L(do_misaligned)
cbz count, L(src1_aligned)

neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3

L(page_end_loop):
ldrb data1w, [src1], #1
@ -213,48 +209,100 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)

L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
b.lo L(done_loop)
/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c

The bytes with "0" are eliminated from the syndrome via mask.

Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate offset from 8 byte alignment to string start in bits. No
need to mask offset since shifts are ignoring upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
cbz tmp2, L(page_end_loop)
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2. */
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs. */
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)

ldr data1, [src1], #8
ldr data2, [src2], #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
b.ne L(not_limit)
subs limit_wd, limit_wd, #1
b.pl L(loop_misaligned)
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)

L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
ldr data1, [src1, limit]
ldr data2, [src2, limit]
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
b.ne L(not_limit)
/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)

ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif

L(ret0):
mov result, #0
ret

END ( __strncmp_aarch64)
END(__strncmp_aarch64)
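
Both strncmp variants above rest on two bit tricks: the NUL-detection identity quoted in the header comment ((X - 1) & ~(X | 0x7f) is non-zero iff some byte of X is zero) and the syndrome/rev/clz sequence that locates the first difference or NUL. A rough little-endian C equivalent (illustrative only; has_zero_byte and first_mismatch_index are hypothetical names, and the second helper assumes the syndrome is non-zero):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero; evaluated bytewise in
   parallel across the whole 64-bit word. */
static uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Byte index of the first differing or NUL byte (little-endian).
   rev + clz in the assembly; bswap + clz here. Assumes data1 != data2
   or data1 contains a NUL, so the syndrome is non-zero. */
static unsigned
first_mismatch_index (uint64_t data1, uint64_t data2)
{
  uint64_t syndrome = (data1 ^ data2) | has_zero_byte (data1);
  return __builtin_clzll (__builtin_bswap64 (syndrome)) / 8;
}
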
@ -1,7 +1,7 @@
/*
* memcpy benchmark.
*
* Copyright (c) 2020, Arm Limited.
* Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/

@ -13,14 +13,15 @@
#include "stringlib.h"
#include "benchlib.h"

#define ITERS 5000
#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 500000
#define MAX_COPIES 8192
#define SIZE (256*1024)
#define ITERS3 200000
#define NUM_TESTS 16384
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)

static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));
static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64)));

#define F(x) {#x, x},

@ -30,15 +31,18 @@ static const struct fun
void *(*fun)(void *, const void *, size_t);
} funtab[] =
{
F(memcpy)
#if __aarch64__
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve)
# endif
#elif __arm__
F(__memcpy_arm)
#endif
F(memcpy)
#undef F
{0, 0}
};
@ -109,7 +113,7 @@ typedef struct
uint64_t len : 16;
} copy_t;

static copy_t copy[MAX_COPIES];
static copy_t test_arr[NUM_TESTS];

typedef char *(*proto_t) (char *, const char *, size_t);

@ -140,14 +144,14 @@ init_copies (size_t max_size)
size_t total = 0;
/* Create a random set of copies with the given size and alignment
distributions. */
for (int i = 0; i < MAX_COPIES; i++)
for (int i = 0; i < NUM_TESTS; i++)
{
copy[i].dst = (rand32 (0) & (max_size - 1));
copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
copy[i].src = (rand32 (0) & (max_size - 1));
copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
total += copy[i].len;
test_arr[i].dst = (rand32 (0) & (max_size - 1));
test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].src = (rand32 (0) & (max_size - 1));
test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
total += test_arr[i].len;
}

return total;
@ -160,25 +164,27 @@ int main (void)
memset (a, 1, sizeof (a));
memset (b, 2, sizeof (b));

printf("Random memcpy:\n");
printf("Random memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total = 0;
uint64_t tsum = 0;
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);
rand32 (0x12345678);

for (int size = 16384; size <= SIZE; size *= 2)
for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;

for (int c = 0; c < MAX_COPIES; c++)
funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
test_arr[c].len);

uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < MAX_COPIES; c++)
funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
@ -187,74 +193,147 @@ int main (void)
printf( "avg %.2f\n", (double)total / tsum);
}

printf ("\nMedium memcpy:\n");
size_t total = 0;
uint64_t tsum = 0;
printf ("%22s ", "memcpy_call");
rand32 (0x12345678);

for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t copy_size = init_copies (size) * ITERS;

for (int c = 0; c < NUM_TESTS; c++)
memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);

uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
t = clock_get_ns () - t;
total += copy_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
}
printf( "avg %.2f\n", (double)total / tsum);


printf ("\nAligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);

for (int size = 16; size <= 512; size *= 2)
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}

printf ("\nLarge memcpy:\n");
printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memcpy (b, a, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");


printf ("\nUnaligned medium memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);

for (int size = 1024; size <= 32768; size *= 2)
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (b + 3, a + 1, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}

printf ("%22s ", "memcpy_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memcpy (b + 3, a + 1, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");


printf ("\nLarge memcpy (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);

for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (b, a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}

printf ("\nUnaligned forwards memmove:\n");
printf ("%22s ", "memcpy_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
memcpy (b, a, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");


printf ("\nUnaligned forwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);

for (int size = 1024; size <= 32768; size *= 2)
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, a + 256 + (i & 31), size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}


printf ("\nUnaligned backwards memmove:\n");
printf ("\nUnaligned backwards memmove (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s (B/ns) ", funtab[f].name);
printf ("%22s ", funtab[f].name);

for (int size = 1024; size <= 32768; size *= 2)
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a + 256 + (i & 31), a, size);
t = clock_get_ns () - t;
printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}
printf ("\n");

return 0;
}
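
The benchmark tables above all follow the same shape: run the copies once untimed to warm caches and branch predictors, then time ITERS passes and report bytes per nanosecond. In outline (a sketch, not the file's code; report_throughput is a hypothetical helper, and clock_get_ns is assumed to behave like the benchlib routine used above):

#include <stdint.h>
#include <stdio.h>

extern uint64_t clock_get_ns (void);  /* assumed benchlib-style clock */

static void
report_throughput (const char *name,
                   void *(*fn)(void *, const void *, size_t),
                   uint8_t *dst, const uint8_t *src, size_t size, int iters)
{
  fn (dst, src, size);                  /* untimed warm-up pass */
  uint64_t t = clock_get_ns ();
  for (int i = 0; i < iters; i++)
    fn (dst, src, size);
  t = clock_get_ns () - t;
  /* bytes/ns = total bytes copied / elapsed nanoseconds */
  printf ("%s %zuB: %.2f\n", name, size, (double) size * iters / t);
}
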
contrib/arm-optimized-routines/string/bench/memset.c (new file, 243 lines)
@ -0,0 +1,243 @@
/*
* memset benchmark.
*
* Copyright (c) 2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "stringlib.h"
#include "benchlib.h"

#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 1000000
#define NUM_TESTS 16384
#define MIN_SIZE 32768
#define MAX_SIZE (1024 * 1024)

static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64)));

#define F(x) {#x, x},

static const struct fun
{
const char *name;
void *(*fun)(void *, int, size_t);
} funtab[] =
{
#if __aarch64__
F(__memset_aarch64)
#elif __arm__
F(__memset_arm)
#endif
F(memset)
#undef F
{0, 0}
};

typedef struct { uint32_t offset : 20, len : 12; } memset_test_t;
static memset_test_t test_arr[NUM_TESTS];

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;

#define SIZE_NUM 65536
#define SIZE_MASK (SIZE_NUM-1)
static uint8_t len_arr[SIZE_NUM];

/* Frequency data for memset sizes up to 4096 based on SPEC2017. */
static freq_data_t memset_len_freq[] =
{
{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412},
{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414},
{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192},
{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140},
{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118},
{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74},
{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54},
{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33},
{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22},
{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15},
{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11},
{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6},
{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5},
{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3},
{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2},
{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2},
{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2},
{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1},
{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1},
{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1},
{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1},
{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1},
{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1},
{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1},
{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1},
{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1},
{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1},
{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1},
{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0}
};

#define ALIGN_NUM 1024
#define ALIGN_MASK (ALIGN_NUM-1)
static uint8_t align_arr[ALIGN_NUM];

/* Alignment data for memset based on SPEC2017. */
static align_data_t memset_align_freq[] =
{
{16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0}
};

static void
init_memset_distribution (void)
{
int i, j, freq, size, n;

for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++)
for (j = 0, size = memset_len_freq[i].size; j < freq; j++)
len_arr[n++] = size;
assert (n == SIZE_NUM);

for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++)
for (j = 0, size = memset_align_freq[i].align; j < freq; j++)
align_arr[n++] = size - 1;
assert (n == ALIGN_NUM);
}

static size_t
init_memset (size_t max_size)
{
size_t total = 0;
/* Create a random set of memsets with the given size and alignment
distributions. */
for (int i = 0; i < NUM_TESTS; i++)
{
test_arr[i].offset = (rand32 (0) & (max_size - 1));
test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK];
test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK];
total += test_arr[i].len;
}

return total;
}


int main (void)
{
init_memset_distribution ();

memset (a, 1, sizeof (a));

printf("Random memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
size_t total_size = 0;
uint64_t tsum = 0;
printf ("%22s ", funtab[f].name);
rand32 (0x12345678);

for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t memset_size = init_memset (size) * ITERS;

for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);

uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
}
printf( "avg %.2f\n", (double)total_size / tsum);
}

size_t total_size = 0;
uint64_t tsum = 0;
printf ("%22s ", "memset_call");
rand32 (0x12345678);

for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
{
size_t memset_size = init_memset (size) * ITERS;

for (int c = 0; c < NUM_TESTS; c++)
memset (a + test_arr[c].offset, 0, test_arr[c].len);

uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_TESTS; c++)
memset (a + test_arr[c].offset, 0, test_arr[c].len);
t = clock_get_ns () - t;
total_size += memset_size;
tsum += t;
printf ("%dK: %.2f ", size / 1024, (double)memset_size / t);
}
printf( "avg %.2f\n", (double)total_size / tsum);


printf ("\nMedium memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);

for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
funtab[f].fun (a, 0, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}
printf ("\n");
}

printf ("%22s ", "memset_call");
for (int size = 8; size <= 512; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS2; i++)
memset (a, 0, size);
t = clock_get_ns () - t;
printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
}


printf ("\nLarge memset (bytes/ns):\n");
for (int f = 0; funtab[f].name != 0; f++)
{
printf ("%22s ", funtab[f].name);

for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
funtab[f].fun (a, 0, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n");
}

printf ("%22s ", "memset_call");
for (int size = 1024; size <= 65536; size *= 2)
{
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS3; i++)
memset (a, 0, size);
t = clock_get_ns () - t;
printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
}
printf ("\n\n");

return 0;
}
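
The distribution machinery above is worth calling out: each {size, freq} pair is expanded into a power-of-two lookup table, so a single masked random index (len_arr[rand32 (0) & SIZE_MASK]) samples the SPEC2017 size distribution in O(1). A condensed sketch of the same expansion (expand_freq_table is a hypothetical name, for illustration only):

#include <stdint.h>

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

/* Repeat each size freq times; with the frequencies summing to the
   lookup length (the asserts above check exactly this), a uniform
   random index into lookup reproduces the weighted distribution. */
static unsigned
expand_freq_table (const freq_data_t *tab, uint16_t *lookup, unsigned len)
{
  unsigned n = 0;
  for (unsigned i = 0; tab[i].freq != 0; i++)
    for (unsigned j = 0; j < tab[i].freq && n < len; j++)
      lookup[n++] = tab[i].size;
  return n;  /* expected to equal len */
}
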
@ -1,7 +1,7 @@
/*
* strlen benchmark.
*
* Copyright (c) 2020, Arm Limited.
* Copyright (c) 2020-2021, Arm Limited.
* SPDX-License-Identifier: MIT
*/

@ -13,10 +13,10 @@
#include "stringlib.h"
#include "benchlib.h"

#define ITERS 2000
#define ITERS 5000
#define ITERS2 20000000
#define ITERS3 2000000
#define NUM_STRLEN 16384
#define NUM_TESTS 16384

#define MAX_ALIGN 32
#define MAX_STRLEN 256
@ -49,7 +49,7 @@ static const struct fun
};
#undef F

static uint16_t strlen_tests[NUM_STRLEN];
static uint16_t strlen_tests[NUM_TESTS];

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
typedef struct { uint8_t align; uint16_t freq; } align_data_t;
@ -117,7 +117,7 @@ init_strlen_tests (void)

/* Create a random set of strlen input strings using the string length
and alignment distributions. */
for (int n = 0; n < NUM_STRLEN; n++)
for (int n = 0; n < NUM_TESTS; n++)
{
int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
@ -141,14 +141,14 @@ int main (void)
size_t res = 0, strlen_size = 0, mask = maskv;
printf ("%22s ", funtab[f].name);

for (int c = 0; c < NUM_STRLEN; c++)
for (int c = 0; c < NUM_TESTS; c++)
strlen_size += funtab[f].fun (a + strlen_tests[c]);
strlen_size *= ITERS;

/* Measure latency of strlen result with (res & mask). */
uint64_t t = clock_get_ns ();
for (int i = 0; i < ITERS; i++)
for (int c = 0; c < NUM_STRLEN; c++)
for (int c = 0; c < NUM_TESTS; c++)
res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
t = clock_get_ns () - t;
printf ("%.2f\n", (double)strlen_size / t);
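
The (res & mask) term above is the interesting part of this loop: the previous result feeds into the next call's argument, so the calls form a dependency chain and the loop measures strlen latency rather than throughput. A simplified sketch of the same idea (strlen_latency_ns is a hypothetical helper; mask is kept opaque as a parameter so the compiler cannot fold the term away):

#include <stdint.h>
#include <string.h>

extern uint64_t clock_get_ns (void);  /* assumed benchlib-style clock */

static double
strlen_latency_ns (const char *buf, const uint16_t *offs, int n,
                   int iters, size_t mask)
{
  size_t res = 0;
  uint64_t t = clock_get_ns ();
  for (int i = 0; i < iters; i++)
    for (int c = 0; c < n; c++)
      /* res & mask evaluates to 0 at run time, but each call still
         has to wait for the previous result. */
      res = strlen (buf + offs[c] + (res & mask));
  t = clock_get_ns () - t;
  (void) res;
  return (double) t / ((double) iters * n);
}
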
@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
char *__strrchr_aarch64_mte (const char *, int);
int __strcmp_aarch64_mte (const char *, const char *);
int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
#endif
# if __ARM_FEATURE_SVE
void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t);
void *__memchr_aarch64_sve (const void *, int, size_t);
int __memcmp_aarch64_sve (const void *, const void *, size_t);
char *__strchr_aarch64_sve (const char *, int);

@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memcpy_aarch64_simd, 1)
# endif
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve, 1)
# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif

@ -28,6 +28,9 @@ static const struct fun
# if __ARM_NEON
F(__memmove_aarch64_simd, 1)
# endif
# if __ARM_FEATURE_SVE
F(__memmove_aarch64_sve, 1)
# endif
#endif
{0, 0, 0}
// clang-format on

@ -28,8 +28,7 @@ static const struct fun
// clang-format off
F(stpcpy, 0)
#if __aarch64__
F(__stpcpy_aarch64, 0)
F(__stpcpy_aarch64_mte, 1)
F(__stpcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve, 1)
# endif

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcmp, 0)
#if __aarch64__
F(__strcmp_aarch64, 0)
F(__strcmp_aarch64_mte, 1)
F(__strcmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcmp_aarch64_sve, 1)
# endif

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strcpy, 0)
#if __aarch64__
F(__strcpy_aarch64, 0)
F(__strcpy_aarch64_mte, 1)
F(__strcpy_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve, 1)
# endif

@ -24,8 +24,7 @@ static const struct fun
// clang-format off
F(strncmp, 0)
#if __aarch64__
F(__strncmp_aarch64, 0)
F(__strncmp_aarch64_mte, 1)
F(__strncmp_aarch64, 1)
# if __ARM_FEATURE_SVE
F(__strncmp_aarch64_sve, 1)
# endif