Import the Linaro Cortex Strings library into contrib.

Sponsored by:	The FreeBSD Foundation
This commit is contained in:
Andrew Turner 2016-09-19 13:12:09 +00:00
commit 09a53ad8f1
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=305972
39 changed files with 7670 additions and 0 deletions

11
contrib/cortex-strings/.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
*.a
*.o
*.la
*.lo
*.png
*.pyc
.deps
.dirstamp
.libs
try-*
cache.txt

View File

@ -0,0 +1,327 @@
# Copyright (c) 2011, Linaro Limited
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Linaro nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Top level Makefile for cortex-strings
# Used to record the compiler version in the executables
COMPILER = $(shell $(CC) --version 2>&1 | head -n1)
# The main library
lib_LTLIBRARIES = \
libcortex-strings.la
## Test suite
check_PROGRAMS = \
tests/test-memchr \
tests/test-memcmp \
tests/test-memcpy \
tests/test-memmove \
tests/test-memset \
tests/test-strchr \
tests/test-strcmp \
tests/test-strcpy \
tests/test-strlen \
tests/test-strncmp \
tests/test-strnlen
# Options for the tests
#
# Every program listed in check_PROGRAMS gets the same two per-target
# settings: it is linked against the library under test and compiled with
# the shared test headers on the include path.  Without the per-target
# LDADD a test would be linked only against the system C library and would
# not exercise the routines built here.
tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
tests_ldadd = libcortex-strings.la
tests_test_memchr_LDADD = $(tests_ldadd)
tests_test_memchr_CFLAGS = $(tests_cflags)
tests_test_memcmp_LDADD = $(tests_ldadd)
tests_test_memcmp_CFLAGS = $(tests_cflags)
tests_test_memcpy_LDADD = $(tests_ldadd)
tests_test_memcpy_CFLAGS = $(tests_cflags)
tests_test_memmove_LDADD = $(tests_ldadd)
tests_test_memmove_CFLAGS = $(tests_cflags)
tests_test_memset_LDADD = $(tests_ldadd)
tests_test_memset_CFLAGS = $(tests_cflags)
tests_test_strchr_LDADD = $(tests_ldadd)
tests_test_strchr_CFLAGS = $(tests_cflags)
tests_test_strcmp_LDADD = $(tests_ldadd)
tests_test_strcmp_CFLAGS = $(tests_cflags)
tests_test_strcpy_LDADD = $(tests_ldadd)
tests_test_strcpy_CFLAGS = $(tests_cflags)
tests_test_strlen_LDADD = $(tests_ldadd)
tests_test_strlen_CFLAGS = $(tests_cflags)
tests_test_strncmp_LDADD = $(tests_ldadd)
tests_test_strncmp_CFLAGS = $(tests_cflags)
# tests/test-strnlen is listed in check_PROGRAMS, so it needs the same
# settings as the other tests.
tests_test_strnlen_LDADD = $(tests_ldadd)
tests_test_strnlen_CFLAGS = $(tests_cflags)
# "make check" runs every test program.
TESTS = $(check_PROGRAMS)
## Benchmarks
noinst_PROGRAMS = \
dhry \
dhry-native \
try-none \
try-this \
try-plain \
try-newlib-c \
try-bionic-c \
try-glibc-c
# Good 'ol Dhrystone
dhry_SOURCES = \
benchmarks/dhry/dhry_1.c \
benchmarks/dhry/dhry_2.c \
benchmarks/dhry/dhry.h
dhry_CFLAGS = -Dcompiler="\"$(COMPILER)\"" -Doptions="\"$(CFLAGS)\""
dhry_LDADD = libcortex-strings.la
dhry_native_SOURCES = $(dhry_SOURCES)
dhry_native_CFLAGS = $(dhry_CFLAGS)
# Benchmark harness
noinst_LIBRARIES = \
libmulti.a \
libbionic-c.a \
libglibc-c.a \
libnewlib-c.a \
libplain.a
libmulti_a_SOURCES = \
benchmarks/multi/harness.c
libmulti_a_CFLAGS = -DVERSION=\"$(VERSION)\" $(AM_CFLAGS)
## Other architecture independent implementations
libbionic_c_a_SOURCES = \
reference/bionic-c/bcopy.c \
reference/bionic-c/memchr.c \
reference/bionic-c/memcmp.c \
reference/bionic-c/memcpy.c \
reference/bionic-c/memset.c \
reference/bionic-c/strchr.c \
reference/bionic-c/strcmp.c \
reference/bionic-c/strcpy.c \
reference/bionic-c/strlen.c
libglibc_c_a_SOURCES = \
reference/glibc-c/memchr.c \
reference/glibc-c/memcmp.c \
reference/glibc-c/memcpy.c \
reference/glibc-c/memset.c \
reference/glibc-c/strchr.c \
reference/glibc-c/strcmp.c \
reference/glibc-c/strcpy.c \
reference/glibc-c/strlen.c \
reference/glibc-c/wordcopy.c \
reference/glibc-c/memcopy.h \
reference/glibc-c/pagecopy.h
libnewlib_c_a_SOURCES = \
reference/newlib-c/memchr.c \
reference/newlib-c/memcmp.c \
reference/newlib-c/memcpy.c \
reference/newlib-c/memset.c \
reference/newlib-c/strchr.c \
reference/newlib-c/strcmp.c \
reference/newlib-c/strcpy.c \
reference/newlib-c/strlen.c \
reference/newlib-c/shim.h
libplain_a_SOURCES = \
reference/plain/memset.c \
reference/plain/memcpy.c \
reference/plain/strcmp.c \
reference/plain/strcpy.c
try_none_SOURCES =
try_none_LDADD = libmulti.a -lrt
try_this_SOURCES =
try_this_LDADD = libmulti.a libcortex-strings.la -lrt
try_bionic_c_SOURCES =
try_bionic_c_LDADD = libmulti.a libbionic-c.a -lrt
try_glibc_c_SOURCES =
try_glibc_c_LDADD = libmulti.a libglibc-c.a -lrt
try_newlib_c_SOURCES =
try_newlib_c_LDADD = libmulti.a libnewlib-c.a -lrt
try_plain_SOURCES =
try_plain_LDADD = libmulti.a libplain.a -lrt
# Architecture specific
if HOST_AARCH32
if WITH_NEON
# Pull in the NEON specific files
neon_bionic_a9_sources = \
reference/bionic-a9/memcpy.S \
reference/bionic-a9/memset.S
neon_bionic_a15_sources = \
reference/bionic-a15/memcpy.S \
reference/bionic-a15/memset.S
fpu_flags = -mfpu=neon
else
if WITH_VFP
fpu_flags = -mfpu=vfp
else
fpu_flags = -msoft-float
endif
endif
# Benchmarks and example programs
noinst_PROGRAMS += \
try-bionic-a9 \
try-bionic-a15 \
try-csl \
try-glibc \
try-newlib \
try-newlib-xscale
# Libraries used in the benchmarks and examples
noinst_LIBRARIES += \
libbionic-a9.a \
libbionic-a15.a \
libcsl.a \
libglibc.a \
libnewlib.a \
libnewlib-xscale.a
# Main library
libcortex_strings_la_SOURCES = \
src/thumb-2/strcpy.c \
src/arm/memchr.S \
src/arm/strchr.S \
src/thumb-2/strlen.S \
src/arm/memset.S \
src/arm/memcpy.S \
src/arm/strcmp.S
# Libraries containing the different reference versions
libbionic_a9_a_SOURCES = \
$(neon_bionic_a9_sources) \
reference/bionic-a9/memcmp.S \
reference/bionic-a9/strcmp.S \
reference/bionic-a9/strcpy.S \
reference/bionic-a9/strlen.c
libbionic_a9_a_CFLAGS = -Wa,-mimplicit-it=thumb
libbionic_a15_a_SOURCES = \
$(neon_bionic_a15_sources) \
reference/bionic-a15/memcmp.S \
reference/bionic-a15/strcmp.S \
reference/bionic-a15/strcpy.S \
reference/bionic-a15/strlen.c
libbionic_a15_a_CFLAGS = -Wa,-mimplicit-it=thumb
libcsl_a_SOURCES = \
reference/csl/memcpy.c \
reference/csl/memset.c \
reference/csl/arm_asm.h
libglibc_a_SOURCES = \
reference/glibc/memcpy.S \
reference/glibc/memset.S \
reference/glibc/strchr.S \
reference/glibc/strlen.S
libnewlib_a_SOURCES = \
reference/newlib/memcpy.S \
reference/newlib/strcmp.S \
reference/newlib/strcpy.c \
reference/newlib/strlen.c \
reference/newlib/arm_asm.h \
reference/newlib/shim.h
libnewlib_xscale_a_SOURCES = \
reference/newlib-xscale/memchr.c \
reference/newlib-xscale/memcpy.c \
reference/newlib-xscale/memset.c \
reference/newlib-xscale/strchr.c \
reference/newlib-xscale/strcmp.c \
reference/newlib-xscale/strcpy.c \
reference/newlib-xscale/strlen.c \
reference/newlib-xscale/xscale.h
# Flags for the benchmark helpers
try_bionic_a9_SOURCES =
try_bionic_a9_LDADD = libmulti.a libbionic-a9.a -lrt
try_bionic_a15_SOURCES =
try_bionic_a15_LDADD = libmulti.a libbionic-a15.a -lrt
try_csl_SOURCES =
try_csl_LDADD = libmulti.a libcsl.a -lrt
try_glibc_SOURCES =
try_glibc_LDADD = libmulti.a libglibc.a -lrt
try_newlib_SOURCES =
try_newlib_LDADD = libmulti.a libnewlib.a -lrt
try_newlib_xscale_SOURCES =
try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt
AM_CPPFLAGS = $(fpu_flags)
AM_LDFLAGS = $(fpu_flags)
endif
# aarch64 specific
if HOST_AARCH64
libcortex_strings_la_SOURCES = \
src/aarch64/memchr.S \
src/aarch64/memcmp.S \
src/aarch64/memcpy.S \
src/aarch64/memmove.S \
src/aarch64/memset.S \
src/aarch64/strchr.S \
src/aarch64/strchrnul.S \
src/aarch64/strcmp.S \
src/aarch64/strcpy.S \
src/aarch64/strlen.S \
src/aarch64/strncmp.S \
src/aarch64/strnlen.S
endif
libcortex_strings_la_LDFLAGS = -version-info 1:0:0
AM_CFLAGS = \
-std=gnu99 -Wall \
-fno-builtin -fno-stack-protector -U_FORTIFY_SOURCE \
$(AM_CPPFLAGS)
if WITH_SUBMACHINE
AM_CFLAGS += \
-mtune=$(submachine)
endif
EXTRA_DIST = \
tests/hp-timing.h \
tests/test-string.h \
tests/test-skeleton.c \
scripts/add-license.sh \
scripts/bench.py \
scripts/fixup.py \
scripts/libplot.py \
scripts/plot-align.py \
scripts/plot.py \
scripts/plot-sizes.py \
scripts/plot-top.py \
scripts/trim.sh \
autogen.sh

View File

@ -0,0 +1,111 @@
= Cortex-A String Routines =
This package contains optimised string routines including memcpy(), memset(),
strcpy(), strlen() for the ARM Cortex-A series of cores.
Various implementations of these routines are provided, including generic
implementations for ARMv7-A cores with/without Neon, Thumb2 implementations
and generic implementations for cores supporting AArch64.
== Getting started ==
First configure and then install libcortex-strings.so. To make other
applications use this library, either add -lcortex-strings to the link
command or use LD_PRELOAD to load the library into existing applications.
Our intent is to get these routines into the common C libraries such
as GLIBC, Bionic, and Newlib. Your system may already include them!
== Contents ==
* src/ contains the routines themselves
* tests/ contains the unit tests
* reference/ contains reference copies of other ARM-focused
implementations gathered from around the Internet
* benchmarks/ contains various benchmarks, tools, and scripts used to
check and report on the different implementations.
The src directory contains different variants organised by the
implementation they run on and optional features used. For example:
* src/thumb-2 contains generic non-NEON routines for AArch32 (with Thumb-2).
* src/arm contains tuned routines for Cortex-A class processors.
* src/aarch64 contains generic routines for AArch64.
* src/thumb contains generic routines for armv6-M (with Thumb).
== Reference versions ==
reference/ contains versions collected from various popular Open
Source libraries. These have been modified for use in benchmarking.
Please refer to the individual files for any licensing terms.
The routines were collected from the following releases:
* EGLIBC 2.13
* Newlib 1.19.0
* Bionic android-2.3.5_r1
== Licensing ==
All Linaro-authored routines are under the modified BSD license:
Copyright (c) 2011, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
All ARM-authored routines are under the modified BSD license:
Copyright (c) 2014 ARM Ltd
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
All third party routines are under a GPL compatible license.
== Notes and Limitations ==
Some of the implementations have been collected from other
projects and have a variety of licenses and copyright holders.
== Style ==
Assembly code attempts to follow the GLIBC coding conventions. They
are:
* Copyright headers in C style comment blocks
* Instructions indented with one tab
* Operands indented with one tab
* Text is wrapped at 70 characters
* End of line comments are fine

View File

@ -0,0 +1,69 @@
#!/bin/sh
#
# autogen.sh glue for hplip
#
# NOTE(review): the header text below talks about HPLIP although this copy
# ships with cortex-strings -- presumably inherited from the script this
# file was derived from; confirm before relying on it.
#
# HPLIP used to have five or so different autotools trees. Upstream
# has reduced it to two. Still, this script is capable of cleaning
# just about any possible mess of autoconf files.
#
# BE CAREFUL with trees that are not completely automake-generated,
# this script deletes all Makefile.in files it can find.
#
# Requires: automake 1.9, autoconf 2.57+
# Conflicts: autoconf 2.13
#
# Usage: autogen.sh [updateexec]
#   updateexec  (only honoured when ./debian exists) regenerate
#               debian/executable.files.

# Abort on the first failing command.
set -e

# Refresh GNU autotools toolchain.
echo Cleaning autotools files...

# Remove every autom4te cache directory below the current directory.
# The start directory is given explicitly for portability, and no stray
# ';' operand is handed to rm (that terminator belongs to find -exec,
# not to xargs).
find . -type d -name autom4te.cache -print0 | xargs -0 rm -rf

# Remove every generated autotools helper so autoreconf reinstalls them.
find . -type f \( -name missing -o -name install-sh -o -name mkinstalldirs \
	-o -name depcomp -o -name ltmain.sh -o -name configure \
	-o -name config.sub -o -name config.guess \
	-o -name Makefile.in \) -print0 | xargs -0 rm -f

echo Running autoreconf...
autoreconf --force --install

# For the Debian package build
test -d debian && {
	# link these in Debian builds
	rm -f config.sub config.guess
	ln -s /usr/share/misc/config.sub .
	ln -s /usr/share/misc/config.guess .

	# refresh list of executable scripts, to avoid possible breakage if
	# upstream tarball does not include the file or if it is mispackaged
	# for whatever reason.
	[ "$1" = "updateexec" ] && {
		echo Generating list of executable files...
		rm -f debian/executable.files
		# "-perm /111" matches files with any execute bit set; the older
		# "+111" spelling is rejected by current GNU findutils.
		find . -type f -perm /111 ! -name '.*' -fprint debian/executable.files
	}

	# Remove any files in upstream tarball that we don't have in the Debian
	# package (because diff cannot remove files)
	version=$(dpkg-parsechangelog | awk '/Version:/ { print $2 }' | sed -e 's/-[^-]\+$//')
	source=$(dpkg-parsechangelog | awk '/Source:/ { print $2 }' | tr -d ' ')
	if test -r ../${source}_${version}.orig.tar.gz ; then
		echo Generating list of files that should be removed...
		rm -f debian/deletable.files
		touch debian/deletable.files
		[ -e debian/tmp ] && rm -rf debian/tmp
		mkdir debian/tmp
		( cd debian/tmp ; tar -zxf ../../../${source}_${version}.orig.tar.gz )
		# List every non-hidden file of the unpacked tarball; any file
		# that has no counterpart in the working tree is recorded as
		# deletable.
		find debian/tmp/ -type f ! -name '.*' -print0 | xargs -0 -r -I '{}' echo '{}' | \
		while read -r i ; do
			if test -e "${i}" ; then
				# Strip "debian/tmp/<top-level dir>/" to get the
				# path relative to the source tree.
				filename=$(echo "${i}" | sed -e 's#.*debian/tmp/[^/]\+/##')
				test -e "${filename}" || echo "${filename}" >>debian/deletable.files
			fi
		done
		rm -fr debian/tmp
	else
		echo Emptying list of files that should be deleted...
		rm -f debian/deletable.files
		touch debian/deletable.files
	fi
}

exit 0

View File

@ -0,0 +1,311 @@
/*
**************************************************************************
* DHRYSTONE 2.1 BENCHMARK PC VERSION
**************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry.h (part 1 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
* Siemens AG, AUT E 51
* Postfach 3220
* 8520 Erlangen
* Germany (West)
* Phone: [+49]-9131-7-20330
* (8-17 Central European Time)
* Usenet: ..!mcsun!unido!estevax!weicker
*
* Original Version (in Ada) published in
* "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
* pp. 1013 - 1030, together with the statistics
* on which the distribution of statements etc. is based.
*
* In this C version, the following C library functions are used:
* - strcpy, strcmp (inside the measurement loop)
* - printf, scanf (outside the measurement loop)
* In addition, Berkeley UNIX system calls "times ()" or "time ()"
* are used for execution time measurement. For measurements
* on other systems, these calls have to be changed.
*
* Collection of Results:
* Reinhold Weicker (address see above) and
*
* Rick Richardson
* PC Research. Inc.
* 94 Apple Orchard Drive
* Tinton Falls, NJ 07724
* Phone: (201) 389-8963 (9-17 EST)
* Usenet: ...!uunet!pcrat!rick
*
* Please send results to Rick Richardson and/or Reinhold Weicker.
* Complete information should be given on hardware and software used.
* Hardware information includes: Machine type, CPU, type and size
* of caches; for microprocessors: clock frequency, memory speed
* (number of wait states).
* Software information includes: Compiler (and runtime library)
* manufacturer and version, compilation switches, OS version.
* The Operating System version may give an indication about the
* compiler; Dhrystone itself performs no OS calls in the measurement
* loop.
*
* The complete output generated by the program should be mailed
* such that at least some checks for correctness can be made.
*
**************************************************************************
*
* This version has changes made by Roy Longbottom to conform to a common
* format for a series of standard benchmarks for PCs:
*
* Running time greater than 5 seconds due to inaccuracy of the PC clock.
*
* Automatic adjustment of run time, no manually inserted parameters.
*
* Initial display of calibration times to confirm linearity.
*
* Display of results within one screen (or at a slow speed as the test
* progresses) so that it can be seen to have run successfully.
*
* Facilities to type in details of system used etc.
*
* All results and details appended to a results file.
*
*
* Roy Longbottom
* 101323.2241@compuserve.com
*
**************************************************************************
*
* For details of history, changes, other defines, benchmark construction
* statistics see official versions from ftp.nosc.mil/pub/aburto where
* the latest table of results (dhry.tbl) are available. See also
* netlib@ornl.gov
*
**************************************************************************
*
* Defines: The following "Defines" are possible:
* -DREG=register (default: Not defined)
* As an approximation to what an average C programmer
* might do, the "register" storage class is applied
* (if enabled by -DREG=register)
* - for local variables, if they are used (dynamically)
* five or more times
* - for parameters if they are used (dynamically)
* six or more times
* Note that an optimal "register" strategy is
* compiler-dependent, and that "register" declarations
* do not necessarily lead to faster execution.
* -DNOSTRUCTASSIGN (default: Not defined)
* Define if the C compiler does not support
* assignment of structures.
* -DNOENUMS (default: Not defined)
* Define if the C compiler does not support
* enumeration types.
***************************************************************************
*
* Compilation model and measurement (IMPORTANT):
*
* This C version of Dhrystone consists of three files:
* - dhry.h (this file, containing global definitions and comments)
* - dhry_1.c (containing the code corresponding to Ada package Pack_1)
* - dhry_2.c (containing the code corresponding to Ada package Pack_2)
*
* The following "ground rules" apply for measurements:
* - Separate compilation
* - No procedure merging
* - Otherwise, compiler optimizations are allowed but should be indicated
* - Default results are those without register declarations
* See the companion paper "Rationale for Dhrystone Version 2" for a more
* detailed discussion of these ground rules.
*
* For 16-Bit processors (e.g. 80186, 80286), times for all compilation
* models ("small", "medium", "large" etc.) should be given if possible,
* together with a definition of these models for the compiler system used.
*
**************************************************************************
* Examples of Pentium Results
*
* Dhrystone Benchmark Version 2.1 (Language: C)
*
* Month run 4/1996
* PC model Escom
* CPU Pentium
* Clock MHz 100
* Cache 256K
* Options Neptune chipset
* OS/DOS Windows 95
* Compiler Watcom C/ C++ 10.5 Win386
* OptLevel -otexan -zp8 -fp5 -5r
* Run by Roy Longbottom
* From UK
* Mail 101323.2241@compuserve.com
*
* Final values (* implementation-dependent):
*
* Int_Glob: O.K. 5
* Bool_Glob: O.K. 1
* Ch_1_Glob: O.K. A
* Ch_2_Glob: O.K. B
* Arr_1_Glob[8]: O.K. 7
* Arr_2_Glob8/7: O.K. 1600010
* Ptr_Glob->
* Ptr_Comp: * 98008
* Discr: O.K. 0
* Enum_Comp: O.K. 2
* Int_Comp: O.K. 17
* Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
* Next_Ptr_Glob->
* Ptr_Comp: * 98008 same as above
* Discr: O.K. 0
* Enum_Comp: O.K. 1
* Int_Comp: O.K. 18
* Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
* Int_1_Loc: O.K. 5
* Int_2_Loc: O.K. 13
* Int_3_Loc: O.K. 7
* Enum_Loc: O.K. 1
* Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING
* Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING
*
* Register option Selected.
*
* Microseconds 1 loop: 4.53
* Dhrystones / second: 220690
* VAX MIPS rating: 125.61
*
*
* Dhrystone Benchmark Version 2.1 (Language: C)
*
* Month run 4/1996
* PC model Escom
* CPU Pentium
* Clock MHz 100
* Cache 256K
* Options Neptune chipset
* OS/DOS Windows 95
* Compiler Watcom C/ C++ 10.5 Win386
* OptLevel No optimisation
* Run by Roy Longbottom
* From UK
* Mail 101323.2241@compuserve.com
*
* Final values (* implementation-dependent):
*
* Int_Glob: O.K. 5
* Bool_Glob: O.K. 1
* Ch_1_Glob: O.K. A
* Ch_2_Glob: O.K. B
* Arr_1_Glob[8]: O.K. 7
* Arr_2_Glob8/7: O.K. 320010
* Ptr_Glob->
* Ptr_Comp: * 98004
* Discr: O.K. 0
* Enum_Comp: O.K. 2
* Int_Comp: O.K. 17
* Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
* Next_Ptr_Glob->
* Ptr_Comp: * 98004 same as above
* Discr: O.K. 0
* Enum_Comp: O.K. 1
* Int_Comp: O.K. 18
* Str_Comp: O.K. DHRYSTONE PROGRAM, SOME STRING
* Int_1_Loc: O.K. 5
* Int_2_Loc: O.K. 13
* Int_3_Loc: O.K. 7
* Enum_Loc: O.K. 1
* Str_1_Loc: O.K. DHRYSTONE PROGRAM, 1'ST STRING
* Str_2_Loc: O.K. DHRYSTONE PROGRAM, 2'ND STRING
*
* Register option Not selected.
*
* Microseconds 1 loop: 20.06
* Dhrystones / second: 49844
* VAX MIPS rating: 28.37
*
**************************************************************************
*/
/* Compiler and system dependent definitions: */

#ifndef TIME
#define TIMES
#endif
                /* Use times(2) time function unless    */
                /* explicitly defined otherwise         */

#ifdef TIMES
/* #include <sys/types.h>
   #include <sys/times.h> */
                /* for "times" */
#endif

#define Mic_secs_Per_Second     1000000.0
                /* Berkeley UNIX C returns process times in seconds/HZ */

/*
 * structassign(d, s): copy structure s into d.
 * With -DNOSTRUCTASSIGN the copy is done through memcpy(); otherwise a
 * plain assignment is used.  The assignment form is fully parenthesised
 * so that it expands safely whatever expressions are passed as d and s.
 */
#ifdef  NOSTRUCTASSIGN
#define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
#else
#define structassign(d, s)      ((d) = (s))
#endif

/*
 * Enumeration: the five Ident_N values, either as plain int constants or
 * as a real enum.  The option list at the top of this file documents the
 * switch as -DNOENUMS while the code historically tested NOENUM; both
 * spellings are accepted here.
 */
#if defined(NOENUM) || defined(NOENUMS)
#define Ident_1 0
#define Ident_2 1
#define Ident_3 2
#define Ident_4 3
#define Ident_5 4
  typedef int   Enumeration;
#else
  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
                Enumeration;
#endif
        /* for boolean and enumeration types in Ada, Pascal */

/* General definitions: */

#include <stdio.h>
#include <string.h>
                /* for strcpy, strcmp */

#define Null 0
                /* Value of a Null pointer */
#define true  1
#define false 0

/* Scalar and array type aliases used throughout the benchmark. */
typedef int     One_Thirty;
typedef int     One_Fifty;
typedef char    Capital_Letter;
typedef int     Boolean;
typedef char    Str_30 [31];            /* 30 characters plus the NUL */
typedef int     Arr_1_Dim [50];
typedef int     Arr_2_Dim [50] [50];

/*
 * Linked record type.  Ptr_Comp points at another record, Discr is an
 * Enumeration tag stored next to the union, and "variant" overlays three
 * alternative layouts.
 */
typedef struct record
    {
    struct record *Ptr_Comp;
    Enumeration    Discr;
    union {
          struct {
                  Enumeration Enum_Comp;
                  int         Int_Comp;
                  char        Str_Comp [31];
                  } var_1;
          struct {
                  Enumeration E_Comp_2;
                  char        Str_2_Comp [31];
                  } var_2;
          struct {
                  char        Ch_1_Comp;
                  char        Ch_2_Comp;
                  } var_3;
          } variant;
      } Rec_Type, *Rec_Pointer;

View File

@ -0,0 +1,778 @@
/*
*************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry_1.c (part 2 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
*
*************************************************************************
*/
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include "dhry.h"

/*COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER*/

/*
 * "compiler" and "options" are the strings main() prints in its
 * "Compiler" / "Optimisation" banner lines.  Each build flag below selects
 * one predefined pair; they may also be supplied directly on the compiler
 * command line (-Dcompiler=... -Doptions=...).
 */
#ifdef COW
#define compiler  "Watcom C/C++ 10.5 Win386"
#define options   "  -otexan -zp8 -5r -ms"
#endif
#ifdef CNW
#define compiler  "Watcom C/C++ 10.5 Win386"
#define options   "   No optimisation"
#endif
#ifdef COD
#define compiler  "Watcom C/C++ 10.5 Dos4GW"
#define options   "  -otexan -zp8 -5r -ms"
#endif
#ifdef CND
#define compiler  "Watcom C/C++ 10.5 Dos4GW"
#define options   "   No optimisation"
#endif
#ifdef CONT
#define compiler  "Watcom C/C++ 10.5 Win32NT"
#define options   "  -otexan -zp8 -5r -ms"
#endif
#ifdef CNNT
#define compiler  "Watcom C/C++ 10.5 Win32NT"
#define options   "   No optimisation"
#endif
#ifdef COO2
#define compiler  "Watcom C/C++ 10.5 OS/2-32"
#define options   "  -otexan -zp8 -5r -ms"
#endif
#ifdef CNO2
#define compiler  "Watcom C/C++ 10.5 OS/2-32"
#define options   "   No optimisation"
#endif

/*
 * Fallbacks: when none of the flags above is set and nothing was passed
 * with -D, the banner strings would otherwise be undefined and the file
 * would not compile.
 */
#ifndef compiler
#define compiler  "unknown"
#endif
#ifndef options
#define options   "unknown"
#endif

/* Global Variables: */

Rec_Pointer     Ptr_Glob,
                Next_Ptr_Glob;
int             Int_Glob;
Boolean         Bool_Glob;
char            Ch_1_Glob,
                Ch_2_Glob;
int             Arr_1_Glob [50];
int             Arr_2_Glob [50] [50];
int             getinput = 1;   /* main() clears this when argv[1] starts with 'N' or 'n' */

/* Replaced by main() with the "Not selected" text when ROPT is undefined. */
char Reg_Define[100] = "Register option      Selected.";

Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
                    Capital_Letter Ch_2_Par_Val);
  /*
  forward declaration necessary since Enumeration may not simply be int
  */

#ifndef ROPT
#define REG
        /* REG becomes defined as empty */
        /* i.e. no register variables   */
#else
#define REG register
#endif

/*
 * Forward declarations of the benchmark procedures.  Proc_4 and Proc_5
 * are called without arguments, so they are declared with a real (void)
 * prototype rather than an empty, unchecked parameter list.
 */
void Proc_1 (REG Rec_Pointer Ptr_Val_Par);
void Proc_2 (One_Fifty *Int_Par_Ref);
void Proc_3 (Rec_Pointer *Ptr_Ref_Par);
void Proc_4 (void);
void Proc_5 (void);
void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par);
void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
             One_Fifty *Int_Par_Ref);
void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
             int Int_1_Par_Val, int Int_2_Par_Val);

Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);

/* variables for time measurement: */

#define Too_Small_Time 2
                /* Measurements should last at least 2 seconds */

/* Begin_Time and End_Time receive dtime() readings around the timed loop;
   User_Time is their difference. */
double          Begin_Time,
                End_Time,
                User_Time;

double          Microseconds,
                Dhrystones_Per_Second,
                Vax_Mips;

/* end of variables for time measurement */
void main (int argc, char *argv[])
/*****/
/* main program, corresponds to procedures */
/* Main and Proc_0 in the Ada version */
{
double dtime();
One_Fifty Int_1_Loc;
REG One_Fifty Int_2_Loc;
One_Fifty Int_3_Loc;
REG char Ch_Index;
Enumeration Enum_Loc;
Str_30 Str_1_Loc;
Str_30 Str_2_Loc;
REG int Run_Index;
REG int Number_Of_Runs;
int endit, count = 10;
FILE *Ap;
char general[9][80] = {" "};
/* Initializations */
if (argc > 1)
{
switch (argv[1][0])
{
case 'N':
getinput = 0;
break;
case 'n':
getinput = 0;
break;
}
}
if ((Ap = fopen("Dhry.txt","a+")) == NULL)
{
printf("Can not open Dhry.txt\n\n");
printf("Press any key\n");
exit(1);
}
/***********************************************************************
* Change for compiler and optimisation used *
***********************************************************************/
Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
Ptr_Glob->Ptr_Comp = Next_Ptr_Glob;
Ptr_Glob->Discr = Ident_1;
Ptr_Glob->variant.var_1.Enum_Comp = Ident_3;
Ptr_Glob->variant.var_1.Int_Comp = 40;
strcpy (Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING");
strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
Arr_2_Glob [8][7] = 10;
/* Was missing in published program. Without this statement, */
/* Arr_2_Glob [8][7] would have an undefined value. */
/* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
/* overflow may occur for this array element. */
printf ("\n");
printf ("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n");
printf ("\n");
if (getinput == 0)
{
printf ("No run time input data\n\n");
}
else
{
printf ("With run time input data\n\n");
}
printf ("Compiler %s\n", compiler);
printf ("Optimisation %s\n", options);
#ifdef ROPT
printf ("Register option selected\n\n");
#else
printf ("Register option not selected\n\n");
strcpy(Reg_Define, "Register option Not selected.");
#endif
/*
if (Reg)
{
printf ("Program compiled with 'register' attribute\n");
printf ("\n");
}
else
{
printf ("Program compiled without 'register' attribute\n");
printf ("\n");
}
printf ("Please give the number of runs through the benchmark: ");
{
int n;
scanf ("%d", &n);
Number_Of_Runs = n;
}
printf ("\n");
printf ("Execution starts, %d runs through Dhrystone\n",
Number_Of_Runs);
*/
Number_Of_Runs = 5000;
do
{
Number_Of_Runs = Number_Of_Runs * 2;
count = count - 1;
Arr_2_Glob [8][7] = 10;
/***************/
/* Start timer */
/***************/
Begin_Time = dtime();
for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
{
Proc_5();
Proc_4();
/* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
Int_1_Loc = 2;
Int_2_Loc = 3;
strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
Enum_Loc = Ident_2;
Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
/* Bool_Glob == 1 */
while (Int_1_Loc < Int_2_Loc) /* loop body executed once */
{
Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
/* Int_3_Loc == 7 */
Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
/* Int_3_Loc == 7 */
Int_1_Loc += 1;
} /* while */
/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
/* Int_Glob == 5 */
Proc_1 (Ptr_Glob);
for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
/* loop body executed twice */
{
if (Enum_Loc == Func_1 (Ch_Index, 'C'))
/* then, not executed */
{
Proc_6 (Ident_1, &Enum_Loc);
strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
Int_2_Loc = Run_Index;
Int_Glob = Run_Index;
}
}
/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
Int_2_Loc = Int_2_Loc * Int_1_Loc;
Int_1_Loc = Int_2_Loc / Int_3_Loc;
Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
/* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
Proc_2 (&Int_1_Loc);
/* Int_1_Loc == 5 */
} /* loop "for Run_Index" */
/**************/
/* Stop timer */
/**************/
End_Time = dtime();
User_Time = End_Time - Begin_Time;
printf ("%12.0f runs %6.2f seconds \n",(double) Number_Of_Runs, User_Time);
if (User_Time > 5)
{
count = 0;
}
else
{
if (User_Time < 0.1)
{
Number_Of_Runs = Number_Of_Runs * 5;
}
}
} /* calibrate/run do while */
while (count >0);
printf ("\n");
printf ("Final values (* implementation-dependent):\n");
printf ("\n");
printf ("Int_Glob: ");
if (Int_Glob == 5) printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Int_Glob);
printf ("Bool_Glob: ");
if (Bool_Glob == 1) printf ("O.K. ");
else printf ("WRONG ");
printf ("%d\n", Bool_Glob);
printf ("Ch_1_Glob: ");
if (Ch_1_Glob == 'A') printf ("O.K. ");
else printf ("WRONG ");
printf ("%c ", Ch_1_Glob);
printf ("Ch_2_Glob: ");
if (Ch_2_Glob == 'B') printf ("O.K. ");
else printf ("WRONG ");
printf ("%c\n", Ch_2_Glob);
printf ("Arr_1_Glob[8]: ");
if (Arr_1_Glob[8] == 7) printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Arr_1_Glob[8]);
printf ("Arr_2_Glob8/7: ");
if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%10d\n", Arr_2_Glob[8][7]);
printf ("Ptr_Glob-> ");
printf (" Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp);
printf (" Discr: ");
if (Ptr_Glob->Discr == 0) printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Ptr_Glob->Discr);
printf ("Enum_Comp: ");
if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
printf (" Int_Comp: ");
if (Ptr_Glob->variant.var_1.Int_Comp == 17) printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Ptr_Glob->variant.var_1.Int_Comp);
printf ("Str_Comp: ");
if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING") == 0)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%s\n", Ptr_Glob->variant.var_1.Str_Comp);
printf ("Next_Ptr_Glob-> ");
printf (" Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp);
printf (" same as above\n");
printf (" Discr: ");
if (Next_Ptr_Glob->Discr == 0)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Next_Ptr_Glob->Discr);
printf ("Enum_Comp: ");
if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
printf (" Int_Comp: ");
if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp);
printf ("Str_Comp: ");
if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING") == 0)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
printf ("Int_1_Loc: ");
if (Int_1_Loc == 5)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Int_1_Loc);
printf ("Int_2_Loc: ");
if (Int_2_Loc == 13)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d\n", Int_2_Loc);
printf ("Int_3_Loc: ");
if (Int_3_Loc == 7)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d ", Int_3_Loc);
printf ("Enum_Loc: ");
if (Enum_Loc == 1)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%d\n", Enum_Loc);
printf ("Str_1_Loc: ");
if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%s\n", Str_1_Loc);
printf ("Str_2_Loc: ");
if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
printf ("O.K. ");
else printf ("WRONG ");
printf ("%s\n", Str_2_Loc);
printf ("\n");
if (User_Time < Too_Small_Time)
{
printf ("Measured time too small to obtain meaningful results\n");
printf ("Please increase number of runs\n");
printf ("\n");
}
else
{
Microseconds = User_Time * Mic_secs_Per_Second
/ (double) Number_Of_Runs;
Dhrystones_Per_Second = (double) Number_Of_Runs / User_Time;
Vax_Mips = Dhrystones_Per_Second / 1757.0;
printf ("Microseconds for one run through Dhrystone: ");
printf ("%12.2lf \n", Microseconds);
printf ("Dhrystones per Second: ");
printf ("%10.0lf \n", Dhrystones_Per_Second);
printf ("VAX MIPS rating = ");
printf ("%12.2lf \n",Vax_Mips);
printf ("\n");
/************************************************************************
* Type details of hardware, software etc. *
************************************************************************/
if (getinput == 1)
{
printf ("Enter the following which will be added with results to file DHRY.TXT\n");
printf ("When submitting a number of results you need only provide details once\n");
printf ("but a cross reference such as an abbreviated CPU type would be useful.\n");
printf ("You can kill (exit or close) the program now and no data will be added.\n\n");
printf ("PC Supplier/model ? ");
gets(general[1]);
printf ("CPU chip ? ");
gets(general[2]);
printf ("Clock MHz ? ");
gets(general[3]);
printf ("Cache size ? ");
gets(general[4]);
printf ("Chipset & H/W options ? ");
gets(general[5]);
printf ("OS/DOS version ? ");
gets(general[6]);
printf ("Your name ? ");
gets(general[7]);
printf ("Company/Location ? ");
gets(general[8]);
printf ("E-mail address ? ");
gets(general[0]);
}
/************************************************************************
* Add results to output file Dhry.txt *
************************************************************************/
fprintf (Ap, "-------------------- -----------------------------------"
"\n");
fprintf (Ap, "Dhrystone Benchmark Version 2.1 (Language: C++)\n\n");
fprintf (Ap, "PC model %s\n", general[1]);
fprintf (Ap, "CPU %s\n", general[2]);
fprintf (Ap, "Clock MHz %s\n", general[3]);
fprintf (Ap, "Cache %s\n", general[4]);
fprintf (Ap, "Options %s\n", general[5]);
fprintf (Ap, "OS/DOS %s\n", general[6]);
fprintf (Ap, "Compiler %s\n", compiler);
fprintf (Ap, "OptLevel %s\n", options);
fprintf (Ap, "Run by %s\n", general[7]);
fprintf (Ap, "From %s\n", general[8]);
fprintf (Ap, "Mail %s\n\n", general[0]);
fprintf (Ap, "Final values (* implementation-dependent):\n");
fprintf (Ap, "\n");
fprintf (Ap, "Int_Glob: ");
if (Int_Glob == 5) fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Int_Glob);
fprintf (Ap, "Bool_Glob: ");
if (Bool_Glob == 1) fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Bool_Glob);
fprintf (Ap, "Ch_1_Glob: ");
if (Ch_1_Glob == 'A') fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%c\n", Ch_1_Glob);
fprintf (Ap, "Ch_2_Glob: ");
if (Ch_2_Glob == 'B') fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%c\n", Ch_2_Glob);
fprintf (Ap, "Arr_1_Glob[8]: ");
if (Arr_1_Glob[8] == 7) fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Arr_1_Glob[8]);
fprintf (Ap, "Arr_2_Glob8/7: ");
if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%10d\n", Arr_2_Glob[8][7]);
fprintf (Ap, "Ptr_Glob-> \n");
fprintf (Ap, " Ptr_Comp: * %d\n", (int) Ptr_Glob->Ptr_Comp);
fprintf (Ap, " Discr: ");
if (Ptr_Glob->Discr == 0) fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Ptr_Glob->Discr);
fprintf (Ap, " Enum_Comp: ");
if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
fprintf (Ap, " Int_Comp: ");
if (Ptr_Glob->variant.var_1.Int_Comp == 17) fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Int_Comp);
fprintf (Ap, " Str_Comp: ");
if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING") == 0)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%s\n", Ptr_Glob->variant.var_1.Str_Comp);
fprintf (Ap, "Next_Ptr_Glob-> \n");
fprintf (Ap, " Ptr_Comp: * %d", (int) Next_Ptr_Glob->Ptr_Comp);
fprintf (Ap, " same as above\n");
fprintf (Ap, " Discr: ");
if (Next_Ptr_Glob->Discr == 0)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Next_Ptr_Glob->Discr);
fprintf (Ap, " Enum_Comp: ");
if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
fprintf (Ap, " Int_Comp: ");
if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
fprintf (Ap, " Str_Comp: ");
if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
"DHRYSTONE PROGRAM, SOME STRING") == 0)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
fprintf (Ap, "Int_1_Loc: ");
if (Int_1_Loc == 5)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Int_1_Loc);
fprintf (Ap, "Int_2_Loc: ");
if (Int_2_Loc == 13)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Int_2_Loc);
fprintf (Ap, "Int_3_Loc: ");
if (Int_3_Loc == 7)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Int_3_Loc);
fprintf (Ap, "Enum_Loc: ");
if (Enum_Loc == 1)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%d\n", Enum_Loc);
fprintf (Ap, "Str_1_Loc: ");
if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%s\n", Str_1_Loc);
fprintf (Ap, "Str_2_Loc: ");
if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
fprintf (Ap, "O.K. ");
else fprintf (Ap, "WRONG ");
fprintf (Ap, "%s\n", Str_2_Loc);
fprintf (Ap, "\n");
fprintf(Ap,"%s\n",Reg_Define);
fprintf (Ap, "\n");
fprintf(Ap,"Microseconds 1 loop: %12.2lf\n",Microseconds);
fprintf(Ap,"Dhrystones / second: %10.0lf\n",Dhrystones_Per_Second);
fprintf(Ap,"VAX MIPS rating: %12.2lf\n\n",Vax_Mips);
fclose(Ap);
}
printf ("\n");
printf ("A new results file will have been created in the same directory as the\n");
printf (".EXE files if one did not already exist. If you made a mistake on input, \n");
printf ("you can use a text editor to correct it, delete the results or copy \n");
printf ("them to a different file name. If you intend to run multiple tests you\n");
printf ("you may wish to rename DHRY.TXT with a more informative title.\n\n");
printf ("Please submit feedback and results files as a posting in Section 12\n");
printf ("or to Roy_Longbottom@compuserve.com\n\n");
if (getinput == 1)
{
printf("Press any key to exit\n");
printf ("\nIf this is displayed you must close the window in the normal way\n");
}
}
/*
 * Proc_1: record (struct) assignment and pointer-chasing workload.
 * Ptr_Val_Par is the record to operate on; main() passes Ptr_Glob, whose
 * Ptr_Comp field was set to Next_Ptr_Glob during initialization.
 * Calls Proc_3, Proc_6 and Proc_7.  Returns nothing.
 * NOTE(review): structassign is defined outside this view; presumably it
 * is a plain struct assignment (or the memcpy fallback) - confirm in dhry.h.
 */
void Proc_1 (REG Rec_Pointer Ptr_Val_Par)
/******************/
/* executed once per pass of the timed loop */
{
REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
/* == Next_Ptr_Glob */
/* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */
/* corresponds to "rename" in Ada, "with" in Pascal */
/* Copy the whole global record into the successor record. */
structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
Ptr_Val_Par->variant.var_1.Int_Comp = 5;
Next_Record->variant.var_1.Int_Comp
= Ptr_Val_Par->variant.var_1.Int_Comp;
Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
/* Proc_3 overwrites Next_Record->Ptr_Comp and, via Proc_7, */
/* stores into Ptr_Glob->variant.var_1.Int_Comp. */
Proc_3 (&Next_Record->Ptr_Comp);
/* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
== Ptr_Glob->Ptr_Comp */
if (Next_Record->Discr == Ident_1)
/* then, executed */
{
Next_Record->variant.var_1.Int_Comp = 6;
Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
&Next_Record->variant.var_1.Enum_Comp);
Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
/* 6 + 2 + 10: leaves Int_Comp == 18, the value main() checks for. */
Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
&Next_Record->variant.var_1.Int_Comp);
}
else /* not executed */
structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
} /* Proc_1 */
/*
 * Proc_2: rewrite *Int_Par_Ref as (*Int_Par_Ref + 10 - 1) - Int_Glob,
 * provided the global Ch_1_Glob is 'A'.
 * Int_Par_Ref: in/out integer, read once on entry and written in the loop.
 * NOTE(review): if Ch_1_Glob is not 'A', Enum_Loc is tested while still
 * uninitialized and the loop has no other exit; the benchmark relies on
 * Proc_5 having set Ch_1_Glob to 'A' beforehand.
 */
void Proc_2 (One_Fifty *Int_Par_Ref)
/******************/
/* executed once per pass of the timed loop */
/* *Int_Par_Ref == 1, becomes 5 (1 + 10 - 1 - Int_Glob, Int_Glob == 5) */
{
One_Fifty Int_Loc;
Enumeration Enum_Loc;
Int_Loc = *Int_Par_Ref + 10;
do /* executed once */
if (Ch_1_Glob == 'A')
/* then, executed */
{
Int_Loc -= 1;
*Int_Par_Ref = Int_Loc - Int_Glob;
Enum_Loc = Ident_1;
} /* if */
while (Enum_Loc != Ident_1); /* false after the assignment above: loop ends */
} /* Proc_2 */
/*
 * Proc_3: store Ptr_Glob->Ptr_Comp through Ptr_Ref_Par (when Ptr_Glob is
 * not Null), then set Ptr_Glob->variant.var_1.Int_Comp via Proc_7.
 * Ptr_Ref_Par: out parameter receiving a record pointer.
 * NOTE(review): only the pointer store is guarded; the Proc_7 call
 * dereferences Ptr_Glob unconditionally.
 */
void Proc_3 (Rec_Pointer *Ptr_Ref_Par)
/******************/
/* executed once per pass of the timed loop */
/* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
{
if (Ptr_Glob != Null)
/* then, executed */
*Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
/* 10 + 2 + Int_Glob; with Int_Glob == 5 this yields 17. */
Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
} /* Proc_3 */
/*
 * Proc_4: OR the test (Ch_1_Glob == 'A') into the global Bool_Glob and
 * set the global Ch_2_Glob to 'B'.  Takes no arguments, returns nothing.
 */
void Proc_4 () /* without parameters */
/*******/
/* executed once per pass of the timed loop */
{
Boolean Bool_Loc;
Bool_Loc = Ch_1_Glob == 'A';
/* Bitwise OR on Boolean values, as in the original benchmark text. */
Bool_Glob = Bool_Loc | Bool_Glob;
Ch_2_Glob = 'B';
} /* Proc_4 */
/*
 * Proc_5: reset the globals Ch_1_Glob to 'A' and Bool_Glob to false.
 * main() calls it first in every pass, so later procedures see these values.
 */
void Proc_5 () /* without parameters */
/*******/
/* executed once per pass of the timed loop */
{
Ch_1_Glob = 'A';
Bool_Glob = false;
} /* Proc_5 */
/* Procedure for the assignment of structures, */
/* if the C compiler doesn't support this feature */
/* Only compiled when NOSTRUCTASSIGN is defined.  Copies l bytes from s */
/* to d one byte at a time, front to back.                              */
/* NOTE(review): K&R-style definition with implicit int return type and */
/* no return statement; it also reuses the name of the standard library */
/* memcpy, whose prototype differs - confirm before enabling.           */
#ifdef NOSTRUCTASSIGN
memcpy (d, s, l)
register char *d;
register char *s;
register int l;
{
while (l--) *d++ = *s++;
}
#endif
/* Return the value of clock() converted from ticks to seconds. */
double dtime()
{
#define HZ CLOCKS_PER_SEC
    /* HZ ticks of clock() correspond to one second. */
    const clock_t ticks = clock();

    return (double)ticks / (double)HZ;
}

View File

@ -0,0 +1,186 @@
/*
*************************************************************************
*
* "DHRYSTONE" Benchmark Program
* -----------------------------
*
* Version: C, Version 2.1
*
* File: dhry_2.c (part 3 of 3)
*
* Date: May 25, 1988
*
* Author: Reinhold P. Weicker
*
*************************************************************************
*/
#include "dhry.h"
#ifndef REG
#define REG
/* REG becomes defined as empty */
/* i.e. no register variables */
#else
#define REG register
#endif
extern int Int_Glob;
extern char Ch_1_Glob;
Boolean Func_3 (Enumeration Enum_Par_Val);
/*
 * Proc_6: map Enum_Val_Par to a result stored through Enum_Ref_Par.
 * Enum_Val_Par: input enumeration value.
 * Enum_Ref_Par: out parameter; written up to three times, the switch
 *               providing the final value for every case except Ident_4.
 * Reads the global Int_Glob in the Ident_2 case.
 */
void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par)
/*********************************/
/* executed once per pass of the timed loop */
/* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
{
*Enum_Ref_Par = Enum_Val_Par;
/* Func_3 is true only for Ident_3. */
if (! Func_3 (Enum_Val_Par))
/* then, not executed */
*Enum_Ref_Par = Ident_4;
switch (Enum_Val_Par)
{
case Ident_1:
*Enum_Ref_Par = Ident_1;
break;
case Ident_2:
if (Int_Glob > 100)
/* then */
*Enum_Ref_Par = Ident_1;
else *Enum_Ref_Par = Ident_4;
break;
case Ident_3: /* executed */
*Enum_Ref_Par = Ident_2;
break;
case Ident_4: break; /* keeps the Ident_4 stored above */
case Ident_5:
*Enum_Ref_Par = Ident_3;
break;
} /* switch */
} /* Proc_6 */
/*
 * Proc_7: store Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref.
 * Int_1_Par_Val, Int_2_Par_Val: input summands.
 * Int_Par_Ref: out parameter receiving the sum.
 */
void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
One_Fifty *Int_Par_Ref)
/**********************************************/
/* executed three times per pass of the timed loop */
/* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */
/* Int_Par_Ref becomes 7 */
/* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
/* Int_Par_Ref becomes 17 */
/* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
/* Int_Par_Ref becomes 18 */
{
One_Fifty Int_Loc;
Int_Loc = Int_1_Par_Val + 2;
*Int_Par_Ref = Int_2_Par_Val + Int_Loc;
} /* Proc_7 */
/*
 * Proc_8: array-indexing workload.
 * Arr_1_Par_Ref: one-dimensional array, written at Int_Loc, Int_Loc+1
 *                and Int_Loc+30 where Int_Loc = Int_1_Par_Val + 5.
 * Arr_2_Par_Ref: two-dimensional array, written in row Int_Loc and at
 *                [Int_Loc+20][Int_Loc].
 * Int_1_Par_Val: index base.
 * Int_2_Par_Val: value stored into Arr_1_Par_Ref.
 * Also sets the global Int_Glob to 5.
 */
void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
int Int_1_Par_Val, int Int_2_Par_Val)
/*********************************************************************/
/* executed once per pass of the timed loop */
/* Int_1_Par_Val == 3 */
/* Int_2_Par_Val == 7 */
{
REG One_Fifty Int_Index;
REG One_Fifty Int_Loc;
Int_Loc = Int_1_Par_Val + 5;
Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
/* This element grows by one per call; main() compares it against */
/* Number_Of_Runs + 10 after the timed loop. */
Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
Int_Glob = 5;
} /* Proc_8 */
/*
 * Func_1: compare two characters.
 * Returns Ident_1 when they differ.  When they are equal, stores the
 * character in the global Ch_1_Glob and returns Ident_2.
 */
Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
Capital_Letter Ch_2_Par_Val)
/*************************************************/
/* executed three times per pass of the timed loop */
/* first call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */
/* NOTE(review): as called from Func_2 in this C version (string */
/* indices 2 and 3 of "DHRYSTONE...") the first call receives 'R' */
/* and 'Y'; the result (Ident_1) is the same either way. */
/* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */
/* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */
{
Capital_Letter Ch_1_Loc;
Capital_Letter Ch_2_Loc;
Ch_1_Loc = Ch_1_Par_Val;
Ch_2_Loc = Ch_1_Loc;
if (Ch_2_Loc != Ch_2_Par_Val)
/* then, executed */
return (Ident_1);
else /* not executed */
{
Ch_1_Glob = Ch_1_Loc;
return (Ident_2);
}
} /* Func_1 */
/*
 * Func_2: character and string comparison workload.
 * Returns true when Ch_Loc ends up as 'R', or when Str_1_Par_Ref compares
 * greater than Str_2_Par_Ref with strcmp (in which case Int_Glob is also
 * overwritten); returns false otherwise.
 * NOTE(review): the while loop only advances when Func_1 returns Ident_1;
 * if the two compared characters were equal the loop would not terminate,
 * and Ch_Loc would be read uninitialized if the loop body's if never fired.
 */
Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref)
/*************************************************/
/* executed once per pass of the timed loop */
/* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
/* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
{
REG One_Thirty Int_Loc;
Capital_Letter Ch_Loc;
Int_Loc = 2;
while (Int_Loc <= 2) /* loop body executed once */
if (Func_1 (Str_1_Par_Ref[Int_Loc],
Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
/* then, executed */
{
Ch_Loc = 'A';
Int_Loc += 1;
} /* if, while */
if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
/* then, not executed */
Int_Loc = 7;
if (Ch_Loc == 'R')
/* then, not executed */
return (true);
else /* executed */
{
/* The strings first differ at '1' versus '2', so strcmp is negative. */
if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
/* then, not executed */
{
Int_Loc += 7;
Int_Glob = Int_Loc;
return (true);
}
else /* executed */
return (false);
} /* if Ch_Loc */
} /* Func_2 */
/*
 * Func_3: return true when Enum_Par_Val is Ident_3, false otherwise.
 */
Boolean Func_3 (Enumeration Enum_Par_Val)
/***************************/
/* executed once per pass of the timed loop */
/* Enum_Par_Val == Ident_3 */
{
Enumeration Enum_Loc;
Enum_Loc = Enum_Par_Val;
if (Enum_Loc == Ident_3)
/* then, executed */
return (true);
else /* not executed */
return (false);
} /* Func_3 */

View File

@ -0,0 +1,407 @@
/*
* Copyright (c) 2011, Linaro Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the Linaro nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** A simple harness that times how long a string function takes to
* run.
*/
/* PENDING: Add EPL */
#include <string.h>
#include <time.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#define NUM_ELEMS(_x) (sizeof(_x) / sizeof((_x)[0]))
#ifndef VERSION
#define VERSION "(unknown version)"
#endif
/** Make sure a function is called by using the return value.
 * Expands to a declaration of a volatile local named x initialised from
 * the expression, followed by a (void) use of it.  Because it declares
 * x, it can be used at most once per scope.
 */
#define SPOIL(_x) volatile long x = (long)(_x); (void)x
/** Type of functions that can be tested.
 * main() invokes each stub as stub(dest, src, count); a stub is free to
 * ignore any of the three arguments.
 */
typedef void (*stub_t)(void *dest, void *src, size_t n);
/** Meta data about one test: a name paired with the stub to run. */
struct test
{
/** Test name; a NULL name terminates the tests[] table */
const char *name;
/** Function to test */
stub_t stub;
};
/** Flush the cache by reading a chunk of memory.
 * Performs one volatile read every 64 bytes across a 48 KiB window that
 * starts 768 KiB into the buffer, so `against` must be at least 816 KiB.
 */
static void empty(volatile char *against)
{
  /* The sizes assume a 16 k cache with 64 byte lines; sweeping three
     times that size sequentially is intended to evict everything */
  const int window_start = (1024 - 256)*1024;
  const int window_end = window_start + 16*1024*3;
  int pos = window_start;

  while (pos < window_end)
    {
      (void)against[pos];
      pos += 64;
    }
}
/** Stub that does nothing.  Used for calibrating: it has the same
 * signature and SPOIL() use as the real stubs but ignores all arguments.
 */
static void xbounce(void *dest, void *src, size_t n)
{
SPOIL(0);
}
/** Stub that calls memcpy, copying n bytes from src to dest */
static void xmemcpy(void *dest, void *src, size_t n)
{
SPOIL(memcpy(dest, src, n));
}
/** Stub that calls memset, zeroing n bytes of dest; src is unused.
 * Note that this overwrites the terminator layout main() set up in dest
 * only with zeros, so dest stays NUL-terminated.
 */
static void xmemset(void *dest, void *src, size_t n)
{
SPOIL(memset(dest, 0, n));
}
/** Stub that calls memcmp on the first n bytes of dest and src */
static void xmemcmp(void *dest, void *src, size_t n)
{
SPOIL(memcmp(dest, src, n));
}
/** Stub that calls strcpy from src to dest; n is unused, the length
 * comes from the NUL terminator in src.
 */
static void xstrcpy(void *dest, void *src, size_t n)
{
SPOIL(strcpy(dest, src));
}
/** Stub that calls strlen on dest; src and n are unused */
static void xstrlen(void *dest, void *src, size_t n)
{
SPOIL(strlen(dest));
}
/** Stub that calls strcmp on dest and src; n is unused */
static void xstrcmp(void *dest, void *src, size_t n)
{
SPOIL(strcmp(dest, src));
}
/** Stub that calls strchr on src, searching for the byte 32; dest is
 * unused.  The store below is repeated on every call.
 * NOTE(review): with n == 0 the index n-1 wraps around and the store
 * lands outside the n-byte region - confirm callers never pass 0.
 */
static void xstrchr(void *dest, void *src, size_t n)
{
/* Put the character at the end of the string and before the null */
((char *)src)[n-1] = 32;
SPOIL(strchr(src, 32));
}
/** Stub that calls memchr on the first n bytes of src, searching for the
 * byte 32; dest is unused.  The store below is repeated on every call.
 * NOTE(review): with n == 0 the index n-1 wraps around and the store
 * lands outside the n-byte region - confirm callers never pass 0.
 */
static void xmemchr(void *dest, void *src, size_t n)
{
/* Put the character at the end of the block */
((char *)src)[n-1] = 32;
SPOIL(memchr(src, 32, n));
}
/** All functions that can be tested.
 * The table is terminated by an entry whose name is NULL.  The first
 * entry is what find_test() returns when no test name is given.
 */
static const struct test tests[] =
{
{ "bounce", xbounce },
{ "memchr", xmemchr },
{ "memcpy", xmemcpy },
{ "memset", xmemset },
{ "memcmp", xmemcmp },
{ "strchr", xstrchr },
{ "strcmp", xstrcmp },
{ "strcpy", xstrcpy },
{ "strlen", xstrlen },
{ NULL }
};
/** Show basic usage */
static void usage(const char* name)
{
printf("%s %s: run a string related benchmark.\n"
"usage: %s [-c block-size] [-l loop-count] [-a alignment|src_alignment:dst_alignment] [-f] [-t test-name] [-r run-id]\n"
, name, VERSION, name);
printf("Tests:");
for (const struct test *ptest = tests; ptest->name != NULL; ptest++)
{
printf(" %s", ptest->name);
}
printf("\n");
exit(-1);
}
/** Look a test up by name.
 * A NULL name selects the first entry of tests[]; a name that matches no
 * entry yields NULL.
 */
static const struct test *find_test(const char *name)
{
  const struct test *entry = tests;

  if (name == NULL)
    {
      return entry;
    }

  while (entry->name != NULL)
    {
      if (strcmp(entry->name, name) == 0)
        {
          return entry;
        }
      entry++;
    }

  return NULL;
}
/* Smallest usable size of the src and dest buffers, in bytes.
   Parenthesised so the macro behaves as one value in any expression. */
#define MIN_BUFFER_SIZE (1024*1024)
/* Boundary that realign() rounds up to before adding the offset */
#define MAX_ALIGNMENT 256

/** Take a pointer and ensure that the lower bits == alignment.
 * p is rounded up to the next MAX_ALIGNMENT boundary and then advanced
 * by `alignment` bytes, so the result lies at most
 * (MAX_ALIGNMENT - 1) + alignment bytes past p.  The caller must have
 * allocated that much slack.
 */
static char *realign(char *p, int alignment)
{
  uintptr_t pp = (uintptr_t)p;

  pp = (pp + (MAX_ALIGNMENT - 1)) & ~(uintptr_t)(MAX_ALIGNMENT - 1);
  pp += alignment;

  return (char *)pp;
}
/** Parse an integer command line argument.
 * arg is converted with strtol() using base 0, so decimal, octal (0...)
 * and hexadecimal (0x...) spellings are accepted.  exe_name is only used
 * for the usage message.
 * Calls usage(), which exits, when strtol() reports an error, when arg
 * contains no digits at all, or when the value does not fit in an int.
 */
static int parse_int_arg(const char *arg, const char *exe_name)
{
  long int ret;
  char *endptr;

  errno = 0;
  ret = strtol(arg, &endptr, 0);

  /* endptr == arg means nothing was converted; without this check a
     mistyped argument would silently be treated as 0 */
  if (errno || endptr == arg)
    {
      usage(exe_name);
    }

  /* Reject values that would change when narrowed to int */
  if ((long int)(int)ret != ret)
    {
      usage(exe_name);
    }

  return (int)ret;
}
/** Parse a single alignment value that starts at text.
 * On return *endptr points just past the parsed number.  Values outside
 * [1, 256] and strtol() errors end the program through usage().
 * Returns the alignment as an offset in [0, 255]: 256 is a full
 * MAX_ALIGNMENT period and therefore maps to offset 0.
 */
static int parse_one_alignment(const char *text, char **endptr,
                               const char *exe_name)
{
  long int value;

  errno = 0;
  value = strtol(text, endptr, 0);

  if (errno)
    {
      usage(exe_name);
    }

  if (value > 256 || value < 1)
    {
      printf("Alignment should be in the range [1, 256].\n");
      usage(exe_name);
    }

  return (value == 256) ? 0 : (int)value;
}

/** Parse the -a argument, either "alignment" or
 * "src_alignment:dst_alignment".
 * Both results are stored after normalisation, so 256 is reported as 0
 * for the source as well as for the destination.  When no ':' part is
 * given the destination uses the same alignment as the source.
 */
static void parse_alignment_arg(const char *arg, const char *exe_name,
                                int *src_alignment, int *dst_alignment)
{
  char *endptr;

  *src_alignment = parse_one_alignment(arg, &endptr, exe_name);

  if (*endptr == ':')
    {
      *dst_alignment = parse_one_alignment(endptr + 1, &endptr, exe_name);
    }
  else
    {
      *dst_alignment = *src_alignment;
    }
}
/** Setup and run a test.
 * Parses the command line, prepares two realigned buffers filled with
 * reproducible non-zero data, calls the selected stub `loops` times
 * (optionally sweeping the cache after each call) and prints one result
 * line.  Returns 0; invalid arguments end the program through usage().
 */
int main(int argc, char **argv)
{
  /* Usable size of the src and dest buffers */
  size_t buffer_size = MIN_BUFFER_SIZE;
  /* Number of bytes per call */
  int count = 31;
  /* Number of times to run */
  int loops = 10000000;
  /* True to flush the cache each time */
  int flush = 0;
  /* Name of the test */
  const char *name = NULL;
  /* Alignment of buffers */
  int src_alignment = 8;
  int dst_alignment = 8;
  /* Name of the run */
  const char *run_id = "0";

  int opt;

  while ((opt = getopt(argc, argv, "c:l:ft:r:hva:")) > 0)
    {
      switch (opt)
        {
        case 'c':
          count = parse_int_arg(optarg, argv[0]);
          break;
        case 'l':
          loops = parse_int_arg(optarg, argv[0]);
          break;
        case 'a':
          parse_alignment_arg(optarg, argv[0], &src_alignment, &dst_alignment);
          break;
        case 'f':
          flush = 1;
          break;
        case 't':
          name = strdup(optarg);
          break;
        case 'r':
          run_id = strdup(optarg);
          break;
        case 'h':
          usage(argv[0]);
          break;
        default:
          usage(argv[0]);
          break;
        }
    }

  /* A negative block size would index in front of the buffers below */
  if (count < 0)
    {
      usage(argv[0]);
    }

  /* Find the test by name */
  const struct test *ptest = find_test(name);

  if (ptest == NULL)
    {
      usage(argv[0]);
    }

  if ((size_t)count + MAX_ALIGNMENT * 2 > MIN_BUFFER_SIZE)
    {
      buffer_size = (size_t)count + MAX_ALIGNMENT * 2;
    }

  /* realign() moves each pointer forward by up to 2 * MAX_ALIGNMENT - 1
     bytes, so allocate that much slack on top of the buffer_size bytes
     that are filled and used below.  Without it the fill loop would run
     past the end of the allocation. */
  size_t alloc_size = buffer_size + MAX_ALIGNMENT * 2;

  /* Buffers to read and write from */
  char *src = malloc(alloc_size);
  char *dest = malloc(alloc_size);

  assert(src != NULL && dest != NULL);

  src = realign(src, src_alignment);
  dest = realign(dest, dst_alignment);

  /* Fill the buffer with non-zero, reproducible random data */
  srandom(1539);

  for (size_t i = 0; i < buffer_size; i++)
    {
      src[i] = (char)random() | 1;
      dest[i] = src[i];
    }

  /* Make sure the buffers are null terminated for any string tests */
  src[count] = 0;
  dest[count] = 0;

  struct timespec start, end;
  int err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
  assert(err == 0);

  /* Preload */
  stub_t stub = ptest->stub;

  /* Run two variants to reduce the cost of testing for the flush */
  if (flush == 0)
    {
      for (int i = 0; i < loops; i++)
        {
          (*stub)(dest, src, count);
        }
    }
  else
    {
      for (int i = 0; i < loops; i++)
        {
          (*stub)(dest, src, count);
          empty(dest);
        }
    }

  err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
  assert(err == 0);

  /* Drop any leading path and pull the variant name out of the executable */
  char *variant = strrchr(argv[0], '/');

  if (variant == NULL)
    {
      variant = argv[0];
    }

  /* The executable is expected to be named try-<variant> */
  variant = strstr(variant, "try-");
  assert(variant != NULL);

  double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) * 1e-9;

  /* Estimate the bounce time.  Measured on a Panda. */
  double bounced = 0.448730 * loops / 50000000;

  /* Dump both machine and human readable versions.  count and loops are
     ints, so they are printed with %d. */
  printf("%s:%s:%d:%d:%d:%d:%s:%.6f: took %.6f s for %d calls to %s of %d bytes.  ~%.3f MB/s corrected.\n",
         variant + 4, ptest->name,
         count, loops, src_alignment, dst_alignment, run_id,
         elapsed,
         elapsed, loops, ptest->name, count,
         (double)loops*count/(elapsed - bounced)/(1024*1024));

  return 0;
}

View File

@ -0,0 +1,88 @@
# Copyright (c) 2011-2012, Linaro Limited
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Linaro nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
AC_INIT(cortex-strings, 1.1-2012.06~dev)
AM_INIT_AUTOMAKE(foreign subdir-objects color-tests dist-bzip2)
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES(Makefile)
AC_CANONICAL_HOST
AM_PROG_AS
AC_PROG_CC
AC_PROG_LIBTOOL

# Select the routine set from the canonical host.  Only 32-bit ARM has a
# default CPU variant.
default_submachine=

case $host in
aarch64*-*-*)
  arch=aarch64
  ;;
arm*-*-*)
  arch=aarch32
  default_submachine=cortex-a9
  ;;
# Three fields, like the patterns above, so that both cpu-vendor-os and
# cpu-vendor-kernel-os host names match.
x86_64-*-*)
  arch=generic
  ;;
*)
  AC_MSG_ERROR([unknown architecture $host])
  ;;
esac

AM_CONDITIONAL([HOST_AARCH32], [test x$arch = xaarch32])
AM_CONDITIONAL([HOST_AARCH64], [test x$arch = xaarch64])
AM_CONDITIONAL([HOST_GENERIC], [test x$arch = xgeneric])

# --with-cpu=CPU overrides the per-architecture default; --without-cpu
# leaves submachine empty.
AC_ARG_WITH([cpu],
	    AS_HELP_STRING([--with-cpu=CPU],
			   [select code for CPU variant @<:@default=cortex-a9@:>@]),
[dnl
  case "$withval" in
  yes|'') AC_MSG_ERROR([--with-cpu requires an argument]) ;;
  no) ;;
  *) submachine="$withval" ;;
  esac
],
[submachine=$default_submachine])

AC_SUBST(submachine)
AM_CONDITIONAL([WITH_SUBMACHINE], [test x$submachine != x])

AC_ARG_WITH([neon],
	    AS_HELP_STRING([--with-neon],
			   [include NEON specific routines @<:@default=yes@:>@]),
	    [with_neon=$withval],
	    [with_neon=yes])
AC_SUBST(with_neon)
AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes)

AC_ARG_WITH([vfp],
	    AS_HELP_STRING([--with-vfp],
			   [include VFP specific routines @<:@default=yes@:>@]),
	    [with_vfp=$withval],
	    [with_vfp=yes])
AC_SUBST(with_vfp)
AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes)

AC_OUTPUT

View File

@ -0,0 +1,79 @@
#!/bin/bash
#
# Add the modified BSD license to a file
#
# Usage: add-license.sh FILE...
#
# Each FILE that does not already mention "Copyright" gets the license
# text prepended, as a C comment for *.c files and as '#' comments for
# *.sh, *.am and *.ac files.  Other files are reported and left alone.
#

# Scratch directory for the generated headers; removed on exit
f=`mktemp -d` || exit 1
trap 'rm -rf "$f"' EXIT

year=`date +%Y`

cat > "$f/original" <<EOF
Copyright (c) $year, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
EOF

# Translate it to C style
echo "/*" > "$f/c"
sed -r 's/(.*)/ * \1/' "$f/original" | sed -r 's/ +$//' >> "$f/c"
echo " */" >> "$f/c"
echo >> "$f/c"

# ...and shell style
sed -r 's/(.*)/# \1/' "$f/original" | sed -r 's/ +$//' >> "$f/shell"
echo '#' >> "$f/shell"
echo >> "$f/shell"

# "$@" and the quoted expansions keep paths containing spaces intact
for name in "$@"; do
    if grep -q Copyright "$name"; then
	echo "$name already has some type of copyright"
	continue
    fi

    case "$name" in
	# These files don't have an explicit license
	*autogen.sh*)
	    continue;;
	*reference/newlib/*)
	    continue;;
	*reference/newlib-xscale/*)
	    continue;;
	*/dhry/*)
	    continue;;

	*.c)
	    src="$f/c"
	    ;;
	*.sh|*.am|*.ac)
	    src="$f/shell"
	    ;;
	*)
	    echo "Unrecognied extension on $name"
	    continue
    esac

    cat "$src" "$name" > "$f/next"
    # Rewrite the file in place rather than moving the temporary file over
    # it, so the target keeps its permissions (e.g. the executable bit)
    cat "$f/next" > "$name"
    echo "Updated $name"
done

View File

@ -0,0 +1,175 @@
#!/usr/bin/env python
"""Simple harness that benchmarks different variants of the routines,
caches the results, and emits all of the records at the end.
Results are generated for different values of:
* Source
* Routine
* Length
* Alignment
"""
import argparse
import subprocess
import math
import sys
# Prefix to the executables
# run() appends the variant name to this to form the program it launches.
build = '../build/try-'
# Space-separated routine names shared by the variants that list ALL below.
ALL = 'memchr memcmp memcpy memset strchr strcmp strcpy strlen'
# Maps each variant name to the space-separated routines it provides;
# run_many() skips any (variant, routine) pair that is not listed here.
HAS = {
'this': 'bounce memchr memcpy memset strchr strcmp strcpy strlen',
'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen',
'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen',
'bionic-c': ALL,
'csl': 'memcpy memset',
'glibc': 'memcpy memset strchr strlen',
'glibc-c': ALL,
'newlib': 'memcpy strcmp strcpy strlen',
'newlib-c': ALL,
'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen',
'plain': 'memset memcpy strcmp strcpy',
}
# Alignment arguments handed to the executables via "-a".  The dual-buffer
# form is two numbers joined by ':'.
BOUNCE_ALIGNMENTS = ['1']
SINGLE_BUFFER_ALIGNMENTS = ['1', '2', '4', '8', '16', '32']
DUAL_BUFFER_ALIGNMENTS = ['1:32', '2:32', '4:32', '8:32', '16:32', '32:32']
# Maps each routine to the list of alignments run_many() iterates over.
ALIGNMENTS = {
'bounce': BOUNCE_ALIGNMENTS,
'memchr': SINGLE_BUFFER_ALIGNMENTS,
'memset': SINGLE_BUFFER_ALIGNMENTS,
'strchr': SINGLE_BUFFER_ALIGNMENTS,
'strlen': SINGLE_BUFFER_ALIGNMENTS,
'memcmp': DUAL_BUFFER_ALIGNMENTS,
'memcpy': DUAL_BUFFER_ALIGNMENTS,
'strcmp': DUAL_BUFFER_ALIGNMENTS,
'strcpy': DUAL_BUFFER_ALIGNMENTS,
}
# Defaults and permitted choices for the command-line options in run_top().
VARIANTS = sorted(HAS.keys())
FUNCTIONS = sorted(ALIGNMENTS.keys())
# Number of repeated runs (run_id 0..NUM_RUNS-1) per measurement point.
NUM_RUNS = 5
def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False):
    """Perform a single run, exercising the cache as appropriate.

    The run is identified by the ':'-joined tuple of its parameters.  If
    that key is already in `cache` the stored result line is reused,
    otherwise the benchmark executable is launched and its (stripped)
    output becomes the result line.

    Args:
        cache: dict mapping run keys to result lines; updated in place.
        variant, function, bytes, loops, alignment, run_id: run
            parameters, formatted into both the key and the command line.
        quiet: when true, do not echo the result line to stdout.

    Returns:
        Field 7 (0-based, ':'-separated) of the result line as a float.

    Raises:
        AssertionError: the executable could not be started.
    """
    key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment, run_id))

    if key in cache:
        got = cache[key]
    else:
        # xbuild is looked up by name through locals() below.
        xbuild = build
        cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals()

        try:
            # universal_newlines makes the output text on Python 3 as well.
            got = subprocess.check_output(cmd.split(), universal_newlines=True).strip()
        except OSError as ex:
            # Raised explicitly so the check survives "python -O".
            raise AssertionError('Error %s while running %s' % (ex, cmd))

    parts = got.split(':')
    took = float(parts[7])

    cache[key] = got

    if not quiet:
        print(got)
        sys.stdout.flush()

    return took
def run_many(cache, variants, bytes, all_functions):
    """Benchmark every requested routine/alignment/variant over all sizes.

    For each combination a quiet tracer run at a representative size is
    used to scale the loop count, then every size is measured NUM_RUNS
    times through run().

    Args:
        cache: result cache passed through to run().
        variants: variant names to benchmark (keys of HAS).
        bytes: iterable of block sizes; nothing is run when it is empty.
        all_functions: routine names to benchmark; when empty, every
            routine named in HAS is used, in the order 'this' lists them
            first.
    """
    # We want the data to come out in a useful order.  So fix an
    # alignment and function, and do all sizes for a variant first
    bytes = sorted(bytes)
    if not bytes:
        # No sizes requested: there is no tracer size to pick either.
        return
    mid = bytes[int(len(bytes)/1.5)]

    if not all_functions:
        # Use the ordering in 'this' as the default
        all_functions = HAS['this'].split()

        # Find all other functions
        for functions in HAS.values():
            for function in functions.split():
                if function not in all_functions:
                    all_functions.append(function)

    for function in all_functions:
        for alignment in ALIGNMENTS[function]:
            for variant in variants:
                if function not in HAS[variant].split():
                    continue

                # Run a tracer through and see how long it takes and
                # adjust the number of loops based on that.  Not great
                # for memchr() and similar which are O(n), but it will
                # do
                f = 50000000
                want = 5.0

                loops = int(f / math.sqrt(max(1, mid)))
                took = run(cache, variant, function, mid, loops, alignment, 0,
                           quiet=True)
                # Keep it reasonable for silly routines like bounce.  A
                # tracer that reports zero elapsed time gets the maximum
                # scale-up instead of dividing by zero.
                if took == 0:
                    factor = 20
                else:
                    factor = min(20, max(0.05, want/took))
                f = f * factor

                # Round f to a few significant figures
                scale = 10**int(math.log10(f) - 1)
                f = scale*int(f/scale)

                for b in bytes:
                    # Figure out the number of loops to give a roughly consistent run
                    loops = int(f / math.sqrt(max(1, b)))
                    for run_id in range(0, NUM_RUNS):
                        run(cache, variant, function, b, loops, alignment,
                            run_id)
def run_top(cache):
    """Parse the command line, build the list of sizes and run them all.

    Sizes are the integer parts of successive powers of 2 and of 1.4, each
    series running up to (about) the --limit value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--variants", nargs="+",
                        help="library variant to run (run all if not specified)",
                        default=VARIANTS, choices=VARIANTS)
    parser.add_argument("-f", "--functions", nargs="+",
                        help="function to run (run all if not specified)",
                        default=FUNCTIONS, choices=FUNCTIONS)
    parser.add_argument("-l", "--limit", type=int,
                        help="upper limit to test to (in bytes)",
                        default=512*1024)
    args = parser.parse_args()

    # First all powers of 2, then the intermediate powers of 1.4.
    bytes = []
    for step in (2.0, 1.4):
        # Figure out how many steps get us up to the top
        steps = int(round(math.log(args.limit) / math.log(step)))
        for exponent in range(0, steps+1):
            bytes.append(int(step**exponent))

    run_many(cache, args.variants, bytes, args.functions)
def main():
    """Load the result cache, run the benchmarks, and save the cache back.

    The cache file holds one result line per run; the first seven
    ':'-separated fields of a line form its key.  A missing or unreadable
    cache file is not an error.  The cache is written back even if the
    benchmark run is interrupted.
    """
    cachename = 'cache.txt'

    cache = {}

    try:
        with open(cachename) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Ignore blank lines instead of caching an empty key.
                    continue
                parts = line.split(':')
                cache[':'.join(parts[:7])] = line
    except (IOError, OSError):
        # Best effort: start with whatever was read so far.
        pass

    try:
        run_top(cache)
    finally:
        with open(cachename, 'w') as f:
            for line in sorted(cache.values()):
                f.write('%s\n' % line)

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,27 @@
"""Simple script that enables target specific blocks based on the first argument.
Matches comment blocks like this:
/* For Foo: abc
def
*/
and de-comments them giving:
abc
def
"""
import re
import sys
def main():
    """De-comment the "/* For <key>: ... */" blocks in the given files.

    sys.argv[1] is the target key and the remaining arguments are files,
    which are rewritten in place.  The key is matched literally.
    """
    key = sys.argv[1]
    # Escape the key so characters such as '.' or '+' in a target name are
    # not interpreted as regular-expression syntax.
    expr = re.compile(r'/\* For %s:\s([^*]+)\*/' % re.escape(key), re.M)

    for arg in sys.argv[2:]:
        with open(arg) as f:
            body = f.read()
        with open(arg, 'w') as f:
            f.write(expr.sub(r'\1', body))

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,78 @@
"""Shared routines for the plotters."""
import fileinput
import collections
# One result line; parse_row() fills the fields positionally from the
# ':'-separated columns of the line.
Record = collections.namedtuple('Record', 'variant function bytes loops src_alignment dst_alignment run_id elapsed rest')
def make_colours():
    """Return a fresh iterator over the plot colour names, in fixed order."""
    palette = ['m', 'b', 'g', 'r', 'c', 'y', 'k',
               'pink', 'orange', 'brown', 'grey']
    return iter(palette)
def parse_value(v):
    """Turn text into a primitive.

    Text containing a '.' is tried as a float, anything else as an int;
    text that does not convert is returned unchanged.
    """
    convert = float if '.' in v else int
    try:
        return convert(v)
    except ValueError:
        return v
def create_column_tuple(record, names):
    """Return the values of the attributes `names` of `record` as a tuple."""
    return tuple(getattr(record, name) for name in names)
def unique(records, name, prefer=''):
    """Return the unique values of a column in the records.

    `name` is either one attribute name or a tuple of attribute names (the
    values are then tuples).  The result is sorted; for string values the
    position of each value inside `prefer` takes part in the sort key, so
    a value equal to `prefer` sorts ahead of values not found in it.
    """
    if type(name) == tuple:
        distinct = set(create_column_tuple(rec, name) for rec in records)
    else:
        distinct = set(getattr(rec, name) for rec in records)
    values = list(distinct)

    if not values:
        return values

    if type(values[0]) == str:
        def rank(value):
            return '%-06d|%s' % (-prefer.find(value), value)
        return sorted(values, key=rank)

    return sorted(values)
def alignments_equal(alignments):
    """Return True when every pair has equal first and second elements."""
    return all(pair[0] == pair[1] for pair in alignments)
def parse_row(line):
    """Build a Record from one ':'-separated line, converting each field."""
    fields = [parse_value(text) for text in line.split(':')]
    return Record(*fields)
def parse():
    """Parse a record file into named tuples, correcting for loop
    overhead along the way.

    Records of the 'bounce' function give the overhead for a (bytes,
    loops) pair.  They are dropped from the result, and their elapsed
    time is subtracted from every other record with the same pair.
    """
    records = [parse_row(line) for line in fileinput.input()]

    # Overhead per (bytes, loops); a later bounce record replaces an
    # earlier one for the same pair.
    overheads = dict(((rec.bytes, rec.loops), rec.elapsed)
                     for rec in records if rec.function == 'bounce')

    corrected = []
    for rec in records:
        if rec.function == 'bounce':
            continue

        overhead = overheads.get((rec.bytes, rec.loops), None)
        if overhead:
            # namedtuples are immutable, so build an adjusted copy.
            rec = rec._replace(elapsed=rec.elapsed - overhead)
        corrected.append(rec)

    return corrected

View File

@ -0,0 +1,67 @@
#!/usr/bin/env python
"""Plot the performance of different variants of one routine versus alignment.
"""
import libplot
import pylab
def plot(records, bytes, function):
    """Plot the rate of each variant of `function` at every alignment.

    Only records for the given block size and function are used.  One bar
    group is drawn per alignment with one bar per variant; the bar height
    is the mean rate in MB/s, or 0 when a variant has no data.  The chart
    is saved as 'alignment-<function>-<bytes>.png'.
    """
    records = [x for x in records if x.bytes==bytes and x.function==function]

    variants = libplot.unique(records, 'variant', prefer='this')
    alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))

    X = pylab.arange(len(alignments))
    width = 1.0/(len(variants)+1)

    colours = libplot.make_colours()

    pylab.figure(1).set_size_inches((16, 12))
    pylab.clf()

    for i, variant in enumerate(variants):
        heights = []

        for alignment in alignments:
            matches = [x for x in records if x.variant==variant and x.src_alignment==alignment[0] and x.dst_alignment==alignment[1]]

            if matches:
                vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
                        match in matches]
                mean = sum(vals)/len(vals)
                heights.append(mean)
            else:
                heights.append(0)

        # next() works on both Python 2.6+ and Python 3.
        pylab.bar(X+i*width, heights, width, color=next(colours), label=variant)

    axes = pylab.axes()
    if libplot.alignments_equal(alignments):
        alignment_labels = ["%s" % x[0] for x in alignments]
    else:
        alignment_labels = ["%s:%s" % (x[0], x[1]) for x in alignments]

    # Fix the tick positions first, then attach one label per tick.
    axes.set_xticks(X + 0.5)
    axes.set_xticklabels(alignment_labels)

    # `function` and `bytes` are picked up by name through locals().
    pylab.title('Performance of different variants of %(function)s for %(bytes)d byte blocks' % locals())
    pylab.xlabel('Alignment')
    pylab.ylabel('Rate (MB/s)')
    pylab.legend(loc='lower right', ncol=3)
    pylab.grid()
    pylab.savefig('alignment-%(function)s-%(bytes)d.png' % locals(), dpi=72)
def main():
    """Write one alignment chart per (function, block size) found in the input."""
    records = libplot.parse()

    sizes = libplot.unique(records, 'bytes')
    for function in libplot.unique(records, 'function'):
        for bytes in sizes:
            plot(records, bytes, function)

    pylab.show()

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,120 @@
#!/usr/bin/env python
"""Plot the performance for different block sizes of one function across
variants.
"""
import libplot
import pylab
import pdb
import math
def pretty_kb(v):
    """Format a byte count for an axis label.

    Values below 1024 are printed as plain integers, exact multiples of
    1024 as '<n> k', and anything else as kilobytes with one decimal.
    """
    if v < 1024:
        return '%d' % v
    if v % 1024 == 0:
        return '%d k' % (v//1024)
    # Float division: on Python 2, int/int would drop the fraction that
    # the '%.1f' format is meant to show.
    return '%.1f k' % (v/1024.0)
def plot(records, function, alignment=None, scale=1):
    """Plot rate against block size for every variant of one function.

    Args:
        records: parsed records for all functions and variants.
        function: name of the function to plot.
        alignment: optional (src_alignment, dst_alignment) pair to select.
        scale: multiplier applied to the 6.4 x 4.8 inch figure size.

    Returns:
        True when a figure was drawn.  False when the selected records do
        not share exactly one alignment or contain no plottable sizes; the
        caller only saves the figure on True.
    """
    variants = libplot.unique(records, 'variant', prefer='this')
    records = [x for x in records if x.function==function]

    if alignment is not None:
        records = [x for x in records if x.src_alignment==alignment[0] and
                   x.dst_alignment==alignment[1]]

    alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
    if len(alignments) != 1:
        return False
    if libplot.alignments_equal(alignments):
        aalignment = alignments[0][0]
    else:
        aalignment = "%s:%s" % (alignments[0][0], alignments[0][1])

    colours = libplot.make_colours()
    all_x = []

    pylab.figure(1).set_size_inches((6.4*scale, 4.8*scale))
    pylab.clf()

    if 'str' in function:
        # The harness fills out to 16k.  Anything past that is an
        # early match
        top = 16384
    else:
        top = 2**31

    for variant in variants:
        matches = [x for x in records if x.variant==variant and x.bytes <= top]
        matches.sort(key=lambda x: x.bytes)

        X = sorted(list(set([x.bytes for x in matches])))
        Y = []
        # Lower/upper error per point, kept the same length as Y so that a
        # mix of single and repeated runs still plots.
        Yerr = [[], []]
        has_spread = False
        for xbytes in X:
            vals = [x.bytes*x.loops/x.elapsed/(1024*1024) for x in matches if x.bytes == xbytes]
            if len(vals) > 1:
                mean = sum(vals)/len(vals)
                has_spread = True
            else:
                mean = vals[0]
            Y.append(mean)
            # Clamp at zero: rounding in the mean may put it a hair
            # outside [min, max].
            Yerr[0].append(max(0.0, mean - min(vals)))
            Yerr[1].append(max(0.0, max(vals) - mean))

        all_x.extend(X)
        # A colour is consumed for every variant, with or without data, so
        # a variant keeps its colour across figures.
        colour = next(colours)

        if X:
            pylab.plot(X, Y, c=colour)
            if has_spread:
                pylab.errorbar(X, Y, yerr=Yerr, c=colour, label=variant, fmt='o')
            else:
                pylab.scatter(X, Y, c=colour, label=variant, edgecolors='none')

    if not all_x:
        # Nothing was plotted, so there is no x range to set up.
        return False

    pylab.legend(loc='upper left', ncol=3, prop={'size': 'small'})
    pylab.grid()
    # `function` and `aalignment` are picked up by name through locals().
    pylab.title('%(function)s of %(aalignment)s byte aligned blocks' % locals())
    pylab.xlabel('Size (B)')
    pylab.ylabel('Rate (MB/s)')

    # Figure out how high the range goes
    top = max(all_x)

    power = int(round(math.log(max(all_x)) / math.log(2)))

    pylab.semilogx()

    pylab.axes().set_xticks([2**x for x in range(0, power+1)])
    pylab.axes().set_xticklabels([pretty_kb(2**x) for x in range(0, power+1)])
    pylab.xlim(0, top)
    pylab.ylim(0, pylab.ylim()[1])
    return True
def main():
    """Save a size plot for every function/alignment pair at two scales."""
    records = libplot.parse()

    functions = libplot.unique(records, 'function')
    alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))

    for function in functions:
        for alignment in alignments:
            for scale in [1, 2.5]:
                if not plot(records, function, alignment, scale):
                    continue
                filename = 'sizes-%s-%02d-%02d-%.1f.png' % (function, alignment[0], alignment[1], scale)
                pylab.savefig(filename, dpi=72)

    pylab.show()

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,61 @@
#!/usr/bin/env python
"""Plot the performance of different variants of the string routines
for one size.
"""
import libplot
import pylab
def plot(records, bytes):
    """Plot the rate of every variant of every function for one block size.

    Only records with src_alignment 8 contribute.  One bar group is drawn
    per function with one bar per variant; the bar height is the mean rate
    in MB/s, or 0 when a variant has no data.  The chart is saved as
    'top-<bytes>.png'.
    """
    records = [x for x in records if x.bytes==bytes]

    variants = libplot.unique(records, 'variant', prefer='this')
    functions = libplot.unique(records, 'function')

    X = pylab.arange(len(functions))
    width = 1.0/(len(variants)+1)

    colours = libplot.make_colours()

    pylab.figure(1).set_size_inches((16, 12))
    pylab.clf()

    for i, variant in enumerate(variants):
        heights = []

        for function in functions:
            matches = [x for x in records if x.variant==variant and x.function==function and x.src_alignment==8]

            if matches:
                vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
                        match in matches]
                mean = sum(vals)/len(vals)
                heights.append(mean)
            else:
                heights.append(0)

        # next() works on both Python 2.6+ and Python 3.
        pylab.bar(X+i*width, heights, width, color=next(colours), label=variant)

    axes = pylab.axes()
    # Fix the tick positions first, then attach one label per tick.
    axes.set_xticks(X + 0.5)
    axes.set_xticklabels(functions)

    pylab.title('Performance of different variants for %d byte blocks' % bytes)
    pylab.ylabel('Rate (MB/s)')
    pylab.legend(loc='upper left', ncol=3)
    pylab.grid()
    pylab.savefig('top-%06d.png' % bytes, dpi=72)
def main():
    """Write one summary chart per block size found in the input."""
    records = libplot.parse()

    sizes = libplot.unique(records, 'bytes')
    for bytes in sizes:
        plot(records, bytes)

    pylab.show()

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,123 @@
"""Plot the results for each test. Spits out a set of images into the
current directory.
"""
import libplot
import fileinput
import collections
import pprint
import pylab
# One result row.  parse() fills the first nine fields from the input
# line and appends the last three (time, bytes, rate) itself.
Record = collections.namedtuple('Record', 'variant test size loops src_alignment dst_alignment run_id rawtime comment time bytes rate')
def unique(rows, name):
    """Return the sorted list of distinct values of attribute `name`
    across `rows`.
    """
    seen = set()
    for row in rows:
        seen.add(getattr(row, name))
    return sorted(seen)
def to_float(v):
    """Convert a string into a better type.

    Text with a '.' becomes a float, other numeric text an int; anything
    that fails to convert is handed back untouched.

    >>> to_float('foo')
    'foo'
    >>> to_float('1.23')
    1.23
    >>> to_float('45')
    45
    """
    try:
        return float(v) if '.' in v else int(v)
    except:
        return v
def parse():
    """Read the result rows and return Records with time, bytes and rate
    filled in.

    A straight line is fitted through the (loops, rawtime) points of the
    'bounce' rows, and its value at each row's loop count is subtracted
    from that row's raw time.
    """
    # Split the input up and turn numbers into the base type
    rows = [[to_float(cell) for cell in line.strip().split(':')]
            for line in fileinput.input()]

    # Placeholder records so the columns can be addressed by name
    padding = [0, 0, 0]
    drafts = [Record(*(row + padding)) for row in rows]

    # Scan once to calculate the overhead
    bounces = pylab.array([(d.loops, d.rawtime) for d in drafts if d.test == 'bounce'])
    fit = pylab.polyfit(bounces[:,0], bounces[:,1], 1)

    records = []

    for row, draft in zip(rows, drafts):
        bytes = draft.size * draft.loops
        # Calculate the bounce time
        delta = pylab.polyval(fit, [draft.loops])
        time = draft.rawtime - delta
        rate = bytes / time

        records.append(Record(*(row + [time, bytes, rate])))

    return records
def plot(records, field, scale, ylabel):
variants = unique(records, 'variant')
tests = unique(records, 'test')
colours = libplot.make_colours()
# A little hack. We want the 'all' record to be drawn last so
# that it's obvious on the graph. Assume that no tests come
# before it alphabetically
variants.reverse()
for test in tests:
for variant in variants:
v = [x for x in records if x.test==test and x.variant==variant]
v.sort(key=lambda x: x.size)
V = pylab.array([(x.size, getattr(x, field)) for x in v])
# Ensure our results appear
order = 1 if variant == 'this' else 0
try:
# A little hack. We want the 'all' to be obvious on
# the graph
if variant == 'all':
pylab.scatter(V[:,0], V[:,1]/scale, label=variant)
pylab.plot(V[:,0], V[:,1]/scale)
else:
pylab.plot(V[:,0], V[:,1]/scale, label=variant,
zorder=order, c = colours.next())
except Exception, ex:
# michaelh1 likes to run this script while the test is
# still running which can lead to bad data
print ex, 'on %s of %s' % (variant, test)
pylab.legend(loc='lower right', ncol=2, prop={'size': 'small'})
pylab.xlabel('Block size (B)')
pylab.ylabel(ylabel)
pylab.title('%s %s' % (test, field))
pylab.grid()
pylab.savefig('%s-%s.png' % (test, field), dpi=100)
pylab.semilogx(basex=2)
pylab.savefig('%s-%s-semilog.png' % (test, field), dpi=100)
pylab.clf()
def test():
    """Run the doctests embedded in the docstrings."""
    from doctest import testmod
    testmod()
def main():
    """Parse the input and write the rate and total-time plots."""
    records = parse()

    for field, scale, ylabel in (('rate', 1024**2, 'Rate (MB/s)'),
                                 ('time', 1, 'Total time (s)')):
        plot(records, field, scale, ylabel)

if __name__ == '__main__':
    main()

View File

@ -0,0 +1,9 @@
#!/bin/bash
#
# Trims the whitespace from around any given images
#
# Trim each image in place.  The result is written to a "next-" sibling in
# the same directory and only moved over the original when convert
# succeeds, so a failed conversion leaves the input untouched.
for i in "$@"; do
    tmp="$(dirname -- "$i")/next-$(basename -- "$i")"
    convert "$i" -bordercolor white -border 1x1 -trim +repage -alpha off +dither -colors 32 "PNG8:$tmp" &&
        mv -- "$tmp" "$i"
done

View File

@ -0,0 +1,172 @@
/*
* memchr - find a character in a memory zone
*
* Copyright (c) 2014, ARM Limited
* All rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the company nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
/*
 * Scratch general-purpose registers: src is the 32-byte-aligned read
 * pointer, synd the syndrome, soff the offset of srcin inside its
 * 32-byte block and cntrem the byte count modulo 32.
 */
#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10
/*
 * Vector registers: the search byte replicated to every lane, the two
 * 16-byte halves of the current block, their compare results, the lane
 * mask built from the magic constant, and the reduced result.
 */
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
/*
* Core algorithm:
*
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
* requested character and bit 1 is not used (faster than using a 32bit
* syndrome). Since the bits in the syndrome reflect exactly the order in which
* things occur in the original string, counting the trailing zeros
* identifies exactly which byte has matched.
*/
/*
 * def_fn f [p2align]: start a function.  Switches to .text, aligns to
 * 2^p2align bytes (default 0), and defines f as a global symbol of type
 * function.  The function body follows the macro invocation.
 */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/*
 * memchr: search the cntin (x2) bytes starting at srcin (x0) for the byte
 * in the low 8 bits of chrin (w1).
 *
 * Returns in x0 the address of the first matching byte, or 0 when cntin
 * is zero or no byte matches.
 *
 * Memory is read in whole 32-byte aligned blocks, so bytes of the first
 * and last block that lie outside the requested range are loaded as well;
 * their matches are masked out of the syndrome.
 *
 * Uses x3-x6, x9, x10 and v0-v6 as scratch and does not touch the stack.
 */
def_fn memchr
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, .Lzero_length
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
/* Work with aligned 32-byte chunks */
bic src, srcin, #31
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
b.eq .Lloop
/*
* Input string is not 32-byte aligned. We calculate the syndrome
* value for the aligned 32 bytes block containing the first bytes
* and mask the irrelevant part.
*/
ld1 {vdata1.16b, vdata2.16b}, [src], #32
sub tmp, soff, #32
/*
 * cntin -= (32 - soff), the number of requested bytes in this first
 * block.  The flags set here are the ones tested by b.ls below; nothing
 * in between modifies them.
 */
adds cntin, cntin, tmp
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.2d[0]
/* Clear the soff*2 lower bits */
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
b.ls .Lmasklast
/* Have we found something already? */
cbnz synd, .Ltail
.Lloop:
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
b.ls .Lend
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.2d[0]
/* We're not out of data, loop if we haven't found the character */
cbz synd, .Lloop
.Lend:
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.2d[0]
/* Only do the clear for the last possible block */
/* The flags are still those of the subs at the top of the loop. */
b.hi .Ltail
.Lmasklast:
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
neg tmp, tmp, lsl #1
lsl synd, synd, tmp
lsr synd, synd, tmp
.Ltail:
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
sub src, src, #32
/* Check that we have found a character */
cmp synd, #0
/* And count the leading zeros */
clz synd, synd
/* Compute the potential result */
/* Two syndrome bits per byte, hence the shift right by one. */
add result, src, synd, lsr #1
/* Select result or NULL */
csel result, xzr, result, eq
ret
.Lzero_length:
mov result, #0
ret
.size memchr, . - memchr

View File

@ -0,0 +1,162 @@
/* memcmp - compare memory
Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
*/
/* def_fn f [p2align]: start a function.  Switches to .text, aligns to
   2^p2align bytes (default 0), and defines f as a global symbol of type
   function.  The function body follows the macro invocation. */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0
/* Internal variables. */
/* data1w/data2w are the 32-bit views of data1/data2, used for the
   byte-at-a-time loads.  */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define endloop x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define pos x11
#define limit_wd x12
#define mask x13
/* memcmp: compare the first limit (x2) bytes at src1 (x0) and src2 (x1).

   Returns in x0 zero when limit is zero or all compared bytes are equal;
   otherwise the first differing byte of src1 minus the corresponding
   byte of src2, both zero-extended.

   When both pointers have the same offset within a doubleword the data
   is compared 8 bytes at a time using aligned loads, which may read bytes
   before the start and past the end of the buffers inside those aligned
   doublewords; such bytes are masked out of the comparison.  Otherwise
   the buffers are compared one byte at a time.

   Uses x3, x4, x6-x9 and x11-x13 as scratch and does not touch the
   stack.  */
def_fn memcmp p2align=6
cbz limit, .Lret0
eor tmp1, src1, src2
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
/* limit_wd = number of doublewords to compare, rounded up.  */
add limit_wd, limit, #7
lsr limit_wd, limit_wd, #3
/* Start of performance-critical section -- one 64B cache line. */
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /* Non-zero if differences found. */
/* endloop = diff while doublewords remain, all ones on the last one.  */
csinv endloop, diff, xzr, ne /* Last Dword or differences. */
cbz endloop, .Lloop_aligned
/* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found a diff. */
cbnz limit_wd, .Lnot_limit
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
b.eq .Lnot_limit
lsl limit, limit, #3 /* Bytes -> bits. */
mov mask, #~0
#ifdef __AARCH64EB__
lsr mask, mask, limit
#else
lsl mask, mask, limit
#endif
/* Zero the bytes beyond the limit in both words and mark them in diff
   so the scan below stops at the end of the significant data.  */
bic data1, data1, mask
bic data2, data2, mask
orr diff, diff, mask
.Lnot_limit:
#ifndef __AARCH64EB__
rev diff, diff
rev data1, data1
rev data2, data2
#endif
/* The MS-non-zero bit of DIFF marks either the first bit
that is different, or the end of the significant data.
Shifting left now will bring the critical information into the
top bits. */
clz pos, diff
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
.Lmutual_align:
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point. */
bic src1, src1, #7
bic src2, src2, #7
add limit, limit, tmp1 /* Adjust the limit for the extra. */
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
ldr data1, [src1], #8
neg tmp1, tmp1 /* Bits to alignment -64. */
ldr data2, [src2], #8
mov tmp2, #~0
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#endif
add limit_wd, limit, #7
/* Force the bytes before the start point to compare equal.  */
orr data1, data1, tmp2
orr data2, data2, tmp2
lsr limit_wd, limit_wd, #3
b .Lstart_realigned
.Lret0:
mov result, #0
ret
.p2align 6
.Lmisaligned8:
/* Byte-at-a-time loop.  limit is pre-decremented so that the subs in
   the loop borrows (carry clear) on the final byte.  */
sub limit, limit, #1
1:
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
/* Compare the bytes while bytes remain; otherwise force "not equal"
   so the loop exits.  */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq 1b
sub result, data1, data2
ret
.size memcmp, . - memcmp

View File

@ -0,0 +1,225 @@
/* Copyright (c) 2012, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/*
* Copyright (c) 2015 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*
*/
/* Arguments, and the pointers derived from them (dst is the aligned
   destination cursor, srcend/dstend point one past the last byte).  */
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
/* Data registers for the copy.  The _lw/_hw names are the 32-bit views
   used for the 4-byte and 1-byte accesses.  */
#define A_l x6
#define A_lw w6
#define A_h x7
#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
/* E and F reuse the registers of src, count, srcend and dst, so those
   values are lost once E/F have been loaded.  */
#define E_l src
#define E_h count
#define F_l srcend
#define F_h dst
/* tmp1 shares x9 with B_h.  */
#define tmp1 x9
/* L(name) expands to the local label .Lname.  */
#define L(l) .L ## l
/* def_fn f [p2align]: start a function.  Switches to .text, aligns to
   2^p2align bytes (default 0), and defines f as a global symbol of type
   function.  The function body follows the macro invocation.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Copies are split into 3 main cases: small copies of up to 16 bytes,
medium copies of 17..96 bytes which are fully unrolled. Large copies
of more than 96 bytes align the destination and use an unrolled loop
processing 64 bytes per iteration.
Small and medium copies read all data before writing, allowing any
kind of overlap, and memmove tailcalls memcpy for these cases as
well as non-overlapping copies.
*/
/* memcpy: copy count (x2) bytes from src (x1) to dstin (x0).

   x0 is never written, so it still holds the destination pointer on
   return.  A count of zero performs no loads or stores apart from the
   prefetch of [src].

   Uses x1-x13 as scratch and does not touch the stack.  The loads and
   stores are issued at whatever alignment the arguments have.  */
def_fn memcpy p2align=6
prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
cmp count, 16
b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
/* Medium copies: 17..96 bytes. */
/* tmp1 = count - 1, so bit 6 is set for 65..96 and bit 5 for 33..64.  */
sub tmp1, count, 1
ldp A_l, A_h, [src]
tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
tbz tmp1, 5, 1f
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
1:
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
cmp count, 8
b.lo 1f
/* 8..16 bytes: one doubleword from each end; the two may overlap.  */
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 4
1:
/* count < 8 here; bit 2 set means 4..7 bytes.  */
tbz count, 2, 1f
ldr A_lw, [src]
ldr A_hw, [srcend, -4]
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb A_hw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb A_hw, [dstend, -1]
2: ret
.p2align 4
/* Copy 65..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [src, 32]
ldp D_l, D_h, [src, 48]
/* E and F overwrite src, count, srcend and dst; srcend is used as the
   base of the next two loads before it is replaced, and none of the
   four is needed afterwards.  */
ldp E_l, E_h, [srcend, -32]
ldp F_l, F_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin, 32]
stp D_l, D_h, [dstin, 48]
stp E_l, E_h, [dstend, -32]
stp F_l, F_h, [dstend, -16]
ret
/* Align DST to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and prefetches one iteration ahead. */
.p2align 4
L(copy_long):
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
/* Move src back by the same amount dst was rounded down, so equal
   offsets from src and dst refer to corresponding bytes.  */
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls 2f
1:
/* Store the 64 bytes loaded previously while loading the next 64.  */
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi 1b
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
2:
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.size memcpy, . - memcpy

View File

@ -0,0 +1,150 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/*
* Copyright (c) 2015 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses
*/
/* def_fn f [, p2align]: begin the definition of the global function \f.
   Selects .text, aligns the entry point to 2^p2align bytes (default 0),
   exports \f, types it as a function and emits the entry label.  The
   macro does not emit .size; the user closes the function with it.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Parameters and result.  dstin, src and count are the three incoming
   arguments (x0-x2); every other name below is a scratch register.  */
#define dstin x0
#define src x1
#define count x2
/* Set to src + count and dstin + count on entry to the long path and
   then moved downwards as the copy proceeds.  */
#define srcend x3
#define dstend x4
#define tmp1 x5
/* A..D are the four 16-byte register pairs moved per loop iteration.  */
#define A_l x6
#define A_h x7
#define B_l x8
#define B_h x9
#define C_l x10
#define C_h x11
#define D_l x12
#define D_h x13
/* E shares its registers with count and tmp1, so the E pair may only
   be loaded once both of those values are dead.  */
#define E_l count
#define E_h tmp1
/* memmove: x0 = dstin, x1 = src, x2 = count.  x0 is never written in
this routine.

All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
Larger copies for which (unsigned) dstin - src >= count are also handled
by memcpy. The only remaining case is a large copy with
0 <= dstin - src < count, which is performed from the end of the buffers
towards the start. The end of the destination is aligned, and an
unrolled loop processes 64 bytes per iteration.
*/
def_fn memmove, 6
sub tmp1, dstin, src
cmp count, 96
/* If count > 96 this compares (dstin - src) with count, otherwise it
forces C=1.  b.hs is therefore taken when count <= 96 or when
(unsigned) dstin - src >= count.  */
ccmp tmp1, count, 2, hi
b.hs memcpy
/* dstin == src: nothing to move.  */
cbz tmp1, 3f
add dstend, dstin, count
add srcend, src, count
/* Align dstend to 16 byte alignment so that we don't cross cache line
boundaries on both loads and stores. There are at least 96 bytes
to copy, so copy 16 bytes unaligned and then align. The loop
copies 64 bytes per iteration and loads one iteration ahead of the
stores. */
and tmp1, dstend, 15
ldp D_l, D_h, [srcend, -16]
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
/* Store the unaligned last 16 bytes loaded above.  */
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
/* 64 bytes are already loaded in A..D; skip the loop unless more than
another 64 bytes remain beyond them.  */
subs count, count, 128
b.ls 2f
/* NOTE(review): presumably padding to place the loop head; confirm.  */
nop
1:
/* Each store writes the pair loaded on the previous iteration, then the
register pair is immediately refilled from 64 bytes lower.  */
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi 1b
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the start even if
there is just 1 byte left.  All four head loads are issued before any
head store.  E_l/E_h reuse the count and tmp1 registers, which are no
longer needed from here on. */
2:
ldp E_l, E_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp E_l, E_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
3: ret
.size memmove, . - memmove

View File

@ -0,0 +1,235 @@
/* Copyright (c) 2012, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/*
* Copyright (c) 2015 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses
*
*/
/* Arguments: dstin (x0) is the destination, val/valw (x1/w1) the fill
   value and count (x2) the length in bytes.  */
#define dstin x0
#define val x1
#define valw w1
#define count x2
/* Scratch registers.  dst is the running store pointer; dstend is set
   to dstin + count at function entry.  */
#define dst x3
#define dstend x4
#define tmp1 x5
#define tmp1w w5
#define tmp2 x6
#define tmp2w w6
/* DC ZVA block length in bytes, computed only on the zva_other path.  */
#define zva_len x7
#define zva_lenw w7
/* L(name) pastes .L in front of name, producing an assembler-local
   label.  */
#define L(l) .L ## l
/* def_fn f [p2align=N]: open the global function \f in .text, aligned
   to 2^N bytes (N defaults to 0), typed as a function.  The caller
   supplies the body and the closing .size directive.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* memset: x0 = dstin, w1 = fill value (only the low byte is used),
x2 = count.  x0 is never written, so the destination pointer is what
the caller gets back.  The fill byte is replicated into all 16 lanes of
v0 and the work is dispatched on count: 0..15, 16..96, and > 96 bytes.
Short and medium sizes use overlapping stores from both ends instead of
a byte loop.  */
def_fn memset p2align=6
dup v0.16B, valw
add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
cmp count, 16
b.hs L(set_medium)
/* val = low 64 bits of v0, i.e. the fill byte repeated 8 times.  */
mov val, v0.D[0]
/* Set 0..15 bytes. */
/* Bit 3 of count set: 8..15 bytes, two (possibly overlapping) 8-byte
stores.  */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend, -8]
ret
nop
/* Bit 2 of count set: 4..7 bytes, two 4-byte stores.  */
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
ret
/* 0..3 bytes: nothing for 0, one byte for 1, and additionally the last
two bytes when bit 1 is set (count is 2 or 3).  */
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend, -2]
3: ret
/* Set 16..96 bytes (reached via b.hs, so count == 16 lands here too).
The first 16 bytes are always stored; bit 6 of count selects the
64..96 case, otherwise the last 16 bytes are stored and, when bit 5
is set (32..63), a second 16 bytes at each end. */
L(set_medium):
str q0, [dstin]
tbnz count, 6, L(set96)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
str q0, [dstend, -32]
1: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end.  The first 16 bytes were already stored in
L(set_medium). */
L(set96):
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend, -32]
ret
.p2align 3
nop
/* More than 96 bytes.  Store the first 16 bytes unaligned and round
dst down to a 16-byte boundary.  L(try_zva) is taken only when
count >= 256 and the fill byte is zero: if count < 256 the ccmp
forces NZCV to 0, so b.eq falls through to L(no_zva).  */
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 256
ccmp valw, 0, 0, cs
b.eq L(try_zva)
/* Plain store loop: 64 bytes per iteration from the aligned dst, then
an unconditional 64-byte store ending at dstend.  */
L(no_zva):
sub count, dstend, dst /* Count is 16 too large. */
add dst, dst, 16
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
1: stp q0, q0, [dst], 64
stp q0, q0, [dst, -32]
/* Also entered from L(zva_other) with count = bytes still to write.  */
L(tail64):
subs count, count, 64
b.hi 1b
2: stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.p2align 3
/* Zero fill of at least 256 bytes: consult DCZID_EL0.  If bit 4 is set
the DC ZVA path is not used (architecturally that bit reports DC ZVA
as prohibited).  The low four bits encode the block size; the code
below treats 4 as 64 bytes and 5 as 128 bytes.  */
L(try_zva):
mrs tmp1, dczid_el0
tbnz tmp1w, 4, L(no_zva)
and tmp1w, tmp1w, 15
cmp tmp1w, 4 /* ZVA size is 64 bytes. */
b.ne L(zva_128)
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA. This is faster on some cores.
*/
L(zva_64):
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+64+64 /* Adjust count and bias for loop. */
add dst, dst, 128
nop
/* dst is 64-byte aligned here (bic with 63, then +128).  */
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
b.hi 1b
stp q0, q0, [dst, 0]
stp q0, q0, [dst, 32]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
.p2align 3
L(zva_128):
cmp tmp1w, 5 /* ZVA size is 128 bytes. */
b.ne L(zva_other)
str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
sub count, count, 128+128 /* Adjust count and bias for loop. */
add dst, dst, 128
/* dst is 128-byte aligned here (bic with 127, then +128).  */
1: dc zva, dst
add dst, dst, 128
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
stp q0, q0, [dstend, -96]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
/* Any other block size: zva_len = 4 << (low four bits of DCZID_EL0).
Fall back to L(no_zva) when count is below zva_len + 64.  Otherwise
fill with stp up to the first zva_len-aligned address, DC ZVA whole
blocks, and let L(tail64) write the remainder.  */
L(zva_other):
mov tmp2w, 4
lsl zva_lenw, tmp2w, tmp1w
add tmp1, zva_len, 64 /* Max alignment bytes written. */
cmp count, tmp1
blo L(no_zva)
sub tmp2, zva_len, 1
add tmp1, dst, zva_len
add dst, dst, 16
subs count, tmp1, dst /* Actual alignment bytes to write. */
bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
beq 2f
1: stp q0, q0, [dst], 64
stp q0, q0, [dst, -32]
subs count, count, 64
b.hi 1b
2: mov dst, tmp1
sub count, dstend, tmp1 /* Remaining bytes to write. */
subs count, count, zva_len
b.lo 4f
3: dc zva, dst
add dst, dst, zva_len
subs count, count, zva_len
b.hs 3b
/* Undo the last subtraction so count is the number of bytes left.  */
4: add count, count, zva_len
b L(tail64)
.size memset, . - memset

View File

@ -0,0 +1,159 @@
/*
strchr - find a character in a string
Copyright (c) 2014, ARM Limited
All rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the company nor the names of its contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results.  srcin and result share x0: the string pointer
   comes in and the match pointer (or zero) goes out in that register.  */
#define srcin x0
#define chrin w1
#define result x0
/* src is the 32-byte-aligned read cursor; tmp1, wtmp2 and tmp3 are
   general scratch.  */
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
/* vrepchr holds chrin replicated into every byte lane.  */
#define vrepchr v0
/* The two 16-byte halves of the current 32-byte hunk.  */
#define vdata1 v1
#define vdata2 v2
/* Per-byte compare results against NUL and against the character.  */
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
/* vrepmask_c is 0x40100401 in each 32-bit lane; vrepmask_0 is that
   value doubled (0x80200802).  */
#define vrepmask_0 v7
#define vrepmask_c v16
/* Combined termination vectors that are reduced to the syndrome.  */
#define vend1 v17
#define vend2 v18
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character; bit 1 is set
iff the relevant byte matched the NUL end of string (we trigger
off bit0 for the special case of looking for NUL). Since the bits
in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
/* Locals and temporaries. */
/* def_fn f [p2align=N]: emit the prologue directives for the global
   function \f: .text section, 2^N-byte alignment (default N = 0),
   global binding, function type, and the entry label itself.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* strchr: x0 = string, w1 = character (only the low byte is compared).
Returns in x0 the address of the first byte equal to the character, or
zero if a NUL is reached first.  The string is read in 32-byte aligned
hunks.  */
def_fn strchr
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
b.eq .Lloop
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
/* tmp1 = -2 * (srcin & 31).  Register shifts use the amount modulo 64,
so the lsr below leaves ones in exactly the low 2 * (srcin & 31) bits:
the syndrome bits of the bytes that precede srcin.  */
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
/* Only the low 64 bits of this result are read, and they are formed
from the first source operand alone.  */
addp vend1.16b, vend1.16b, vend2.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.2d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, .Ltail
.Lloop:
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* Use a fast check for the termination condition. */
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
orr vend1.16b, vend1.16b, vend2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.2d[0]
cbz tmp1, .Lloop
/* Termination condition found. Now need to establish exactly why
we terminated. */
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
mov tmp1, vend1.2d[0]
/* tmp1 holds the 64-bit syndrome: two bits per byte of the hunk.  */
.Ltail:
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* And counting the leading zeros. */
/* Tmp1 is even if the target character was found first. Otherwise
we've found the end of string and we weren't looking for NUL. */
tst tmp1, #1
/* tmp1 >> 1 is the byte offset of the terminating byte in the hunk.  */
add result, src, tmp1, lsr #1
/* Odd bit index: NUL came first, so return zero.  */
csel result, result, xzr, eq
ret
.size strchr, . - strchr

View File

@ -0,0 +1,144 @@
/*
strchrnul - find a character or nul in a string
Copyright (c) 2014, ARM Limited
All rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the company nor the names of its contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results.  srcin and result share x0: the string pointer
   comes in and the pointer to the terminating byte goes out there.  */
#define srcin x0
#define chrin w1
#define result x0
/* src is the 32-byte-aligned read cursor; tmp1, wtmp2 and tmp3 are
   general scratch.  */
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
/* vrepchr holds chrin replicated into every byte lane.  */
#define vrepchr v0
/* The two 16-byte halves of the current 32-byte hunk.  */
#define vdata1 v1
#define vdata2 v2
/* Per-byte compare results against NUL and against the character.  */
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
/* vrepmask is 0x40100401 in each 32-bit lane.  */
#define vrepmask v7
/* Vector that is reduced to the 64-bit syndrome.  */
#define vend1 v16
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character or nul. Since the
bits in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination. */
/* Locals and temporaries. */
/* def_fn f [p2align=N]: start global function \f.  Emits, in order,
   the .text switch, a 2^N-byte alignment (N defaults to 0), the
   .global and .type declarations, and the label \f.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* strchrnul: x0 = string, w1 = character (only the low byte is
compared).  Returns in x0 the address of the first byte that equals the
character or is NUL, whichever comes first.  The string is read in
32-byte aligned hunks.  */
def_fn strchrnul
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
b.eq .Lloop
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* A byte terminates the search if it is NUL or equals the character,
so the two compare results are merged before masking.  */
orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
/* tmp1 = -2 * (srcin & 31).  Register shifts use the amount modulo 64,
so the lsr below leaves ones in exactly the low 2 * (srcin & 31) bits:
the syndrome bits of the bytes that precede srcin.  */
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend1.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.2d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, .Ltail
.Lloop:
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* Use a fast check for the termination condition. */
orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.2d[0]
cbz tmp1, .Lloop
/* Termination condition found. Now need to establish exactly which
byte caused it. */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
mov tmp1, vend1.2d[0]
/* tmp1 holds the 64-bit syndrome: two bits per byte of the hunk.  */
.Ltail:
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* ... and counting the leading zeros. */
/* tmp1 is twice the offset into the fragment. */
add result, src, tmp1, lsr #1
ret
.size strchrnul, . - strchrnul

View File

@ -0,0 +1,166 @@
/* Copyright (c) 2012, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
*/
/* def_fn f [p2align=N]: declare and open the global function \f in
   .text with 2^N-byte alignment (default N = 0).  No .size directive
   is emitted by the macro.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Byte patterns repeated across a 64-bit word, used by the word-wide
   NUL detection (X - REP8_01) & ~(X | REP8_7f).  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result.  result shares x0 with src1.  */
#define src1 x0
#define src2 x1
#define result x0
/* Internal variables. */
/* data1/data2: current 8 bytes (or single byte) from each string.  */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
/* has_nul: non-zero if data1 contains a NUL byte.
   diff: data1 ^ data2.  syndrome: diff | has_nul.  */
#define has_nul x4
#define diff x5
#define syndrome x6
#define tmp1 x7
#define tmp2 x8
#define tmp3 x9
/* zeroones holds REP8_01 for the whole function.  */
#define zeroones x10
/* pos: bit position (from the top) of the first set syndrome bit.  */
#define pos x11
/* strcmp: x0 = src1, x1 = src2.  Returns in x0 a value that is zero
when the strings are equal, negative when the first differing byte
(compared as unsigned) is smaller in src1, and positive otherwise.

When both pointers have the same offset within an 8-byte word the
strings are compared a word at a time; otherwise a byte loop is used.  */
/* Start of performance-critical section -- one 64B cache line. */
def_fn strcmp p2align=6
eor tmp1, src1, src2
mov zeroones, #REP8_01
/* Different low three address bits: the strings can never be read
with aligned 8-byte loads at the same time.  */
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
orr syndrome, diff, has_nul
cbz syndrome, .Lloop_aligned
/* End of performance-critical section -- one 64B cache line. */
#ifndef __AARCH64EB__
/* Little-endian: byte-reverse so that the first string byte becomes the
most significant one.  */
rev syndrome, syndrome
rev data1, data1
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#else
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
/* Unsigned word compare: result is 0, 1, or -1 when data1 < data2.  */
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#endif
.Lmutual_align:
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point. */
bic src1, src1, #7
bic src2, src2, #7
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
ldr data1, [src1], #8
neg tmp1, tmp1 /* Bits to alignment -64. */
ldr data2, [src2], #8
mov tmp2, #~0
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#endif
/* Force the bytes before the start point to 0xff in both words so they
compare equal and cannot look like a NUL.  */
orr data1, data1, tmp2
orr data2, data2, tmp2
b .Lstart_realigned
.Lmisaligned8:
/* We can do better than this. */
/* Byte-at-a-time loop: continue while the byte from src1 is non-zero
(data1w >= 1 sets C) and equals the byte from src2.  */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq .Lmisaligned8
sub result, data1, data2
ret
/* Record the symbol size, as every other routine in this library does;
without it the ELF symbol for strcmp has size zero.  */
.size strcmp, . - strcmp

View File

@ -0,0 +1,336 @@
/*
strcpy/stpcpy - copy a string returning pointer to start/end.
Copyright (c) 2013, 2014, 2015 ARM Ltd.
All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the company nor the names of its contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
To test the page crossing code path more thoroughly, compile with
-DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
entry path. This option is not intended for production use. */
/* Arguments and results.  x0 is the destination and x1 the source.  */
#define dstin x0
#define srcin x1
/* Locals and temporaries. */
/* src/dst are the running source and destination pointers.  */
#define src x2
#define dst x3
/* data1/data2 are the two 8-byte words fetched per ldp.  */
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
/* Non-zero when the corresponding data word contains a NUL byte.  */
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
/* zeroones holds REP8_01 for the NUL detection.  */
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
/* NOTE(review): x16/x17 are the intra-procedure-call scratch registers
   and may be clobbered by linker veneers on a branch to another
   function; confirm no such branch occurs while these are live.  */
#define len x16
#define to_align x17
/* The same source builds either entry point: STRCPY expands to stpcpy
   when BUILD_STPCPY is defined, and to strcpy otherwise.  */
#ifdef BUILD_STPCPY
#define STRCPY stpcpy
#else
#define STRCPY strcpy
#endif
/* def_fn f [p2align=N]: open the global function \f (here the name
   STRCPY expands to) in .text, aligned to 2^N bytes, N defaulting
   to 0.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* AArch64 systems have a minimum page size of 4k. We can do a quick
page size check for crossing this boundary on entry and if we
do not, then we can short-circuit much of the entry code. We
expect early page-crossing strings to be rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
predictable, even with random strings.
We don't bother checking for larger page sizes, the cost of setting
up the correct page size is just not worth the extra gain from
a small reduction in the cases taking the slow path. Note that
we only care about whether the first fetch, which may be
misaligned, crosses a page boundary - after that we move to aligned
fetches for the remainder of the string. */
#ifdef STRCPY_TEST_PAGE_CROSS
/* Make everything that isn't Qword aligned look like a page cross. */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif
/* Page size in bytes assumed by the entry check: 4096 normally, 16 in
the STRCPY_TEST_PAGE_CROSS build.  */
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
/* STRCPY: copy the NUL-terminated string at srcin to dstin.
   The register aliases used below (srcin, dstin, src, dst, data1, data2,
   data1a, data2a, tmp1..tmp4, has_nul1/2, to_align, pos, zeroones and the
   32-bit views data1w/data2w) are #defined earlier in this file.

   Structure:
     entry            page-cross test, then one unaligned 16-byte load;
     .Lfp_le8/_gt8    NUL found in the first 16 bytes; finished with at
     .Lfp_lt4/_lt2    most two (possibly overlapping) stores;
     .Lbulk_entry     no NUL in the first 16 bytes; switch to 16-byte
     .Lmain_loop      aligned source loads, 16 bytes per iteration;
     .Lpage_cross     the first 16-byte load could straddle a page.

   This is a leaf routine: it makes no calls and does not touch the stack.
   When BUILD_STPCPY is defined every exit path also sets dstin to the
   address of the NUL terminator that was written; otherwise dstin is
   never modified.  */
def_fn STRCPY p2align=6
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
multiple byte copies and further means that by the time we
reach the bulk copy loop we know we can always use DWord
accesses. We expect strcpy to rarely be called repeatedly
with the same source string, so branch prediction is likely to
always be difficult - we mitigate against this by preferring
conditional select operations over branches whenever this is
feasible. */
/* tmp2 = offset of srcin within its (minimum-size) page;
   to_align = srcin & 15; tmp1 = -to_align (consumed by .Lpage_cross).  */
and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
mov zeroones, #REP8_01
and to_align, srcin, #15
cmp tmp2, #(MIN_PAGE_SIZE - 16)
neg tmp1, to_align
/* The first fetch will straddle a (possible) page boundary iff
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
aligned string will never fail the page align check, so will
always take the fast path. */
b.gt .Lpage_cross
.Lpage_cross_ok:
/* Safe to read 16 bytes directly from srcin.  */
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* Because we expect the end to be found within 16 characters
(profiling shows this is the most common case), it's worth
swapping the bytes now to save having to recalculate the
termination syndrome later. We preserve data1 and data2
so that we can re-use the values later on. */
rev tmp2, data1
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
bics has_nul1, tmp1, tmp2
b.ne .Lfp_le8
rev tmp4, data2
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
#else
/* has_nul1 = (data1 - 0x01..) & ~(data1 | 0x7f..): non-zero iff data1
   contains a zero byte.  */
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bics has_nul1, tmp1, tmp2
b.ne .Lfp_le8
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
bics has_nul2, tmp3, tmp4
/* No NUL in either Dword: the string is longer than 16 bytes.  */
b.eq .Lbulk_entry
/* The string is short (<=16 bytes). We don't know exactly how
short though, yet. Work out the exact length so that we can
quickly select the optimal copy strategy. */
.Lfp_gt8:
/* NUL is in data2.  pos = bit offset of the NUL byte within data2;
   dst = dstin + (byte index of the NUL within data2).  data2 is shifted
   so that the NUL lands in the last byte stored, and is written to the
   8 bytes at dst + 1, which end on the terminator; that store may
   overlap the data1 store to the first 8 bytes.  */
rev has_nul2, has_nul2
clz pos, has_nul2
mov tmp2, #56
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
sub pos, tmp2, pos
#ifdef __AARCH64EB__
lsr data2, data2, pos
#else
lsl data2, data2, pos
#endif
str data2, [dst, #1]
str data1, [dstin]
#ifdef BUILD_STPCPY
/* dst + 8 is the address of the NUL just written.  */
add dstin, dst, #8
#endif
ret
.Lfp_le8:
/* NUL is in data1.  pos = bit offset of the NUL byte; dst = address in
   the destination where the NUL belongs.  pos < 24 means the NUL is one
   of the first three bytes.  */
rev has_nul1, has_nul1
clz pos, has_nul1
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
subs tmp2, pos, #24 /* Pos in bits. */
b.lt .Lfp_lt4
#ifdef __AARCH64EB__
mov tmp2, #56
sub pos, tmp2, pos
lsr data2, data1, pos
lsr data1, data1, #32
#else
lsr data2, data1, tmp2
#endif
/* 4->7 bytes to copy. */
/* Two 4-byte stores: the last four bytes (ending on the NUL) and the
   first four bytes; they may overlap.  */
str data2w, [dst, #-3]
str data1w, [dstin]
#ifdef BUILD_STPCPY
mov dstin, dst
#endif
ret
.Lfp_lt4:
/* pos == 0: the very first byte is the NUL (empty string).  */
cbz pos, .Lfp_lt2
/* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
lsr data1, data1, #48
#endif
strh data1w, [dstin]
/* Fall-through, one byte (max) to go. */
.Lfp_lt2:
/* Null-terminated string. Last character must be zero! */
strb wzr, [dst]
#ifdef BUILD_STPCPY
mov dstin, dst
#endif
ret
.p2align 6
/* Aligning here ensures that the entry code and main loop all lies
within one 64-byte cache line. */
.Lbulk_entry:
/* The first 16 bytes are stored unaligned.  to_align becomes
   (srcin & 15) - 16, so src = srcin + 16 - (srcin & 15), i.e. the next
   16-byte boundary above srcin; dst is advanced by the same amount.
   Bytes between there and srcin + 16 get copied a second time.  */
sub to_align, to_align, #16
stp data1, data2, [dstin]
sub src, srcin, to_align
sub dst, dstin, to_align
b .Lentry_no_page_cross
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
.Lmain_loop:
/* Store the pair checked on the previous iteration (known NUL-free).  */
stp data1, data2, [dst], #16
.Lentry_no_page_cross:
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
/* EQ afterwards only if has_nul2 == 0 and has_nul1 == 0.  */
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq .Lmain_loop
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
trailing NUL, then (re)copy the 16 bytes leading up to that. */
/* NE below means the NUL is in data1, otherwise it is in data2.  */
cmp has_nul1, #0
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, ne
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, ne
#endif
rev has_nul1, has_nul1
clz pos, has_nul1
/* pos = bit offset one byte past the NUL, counted from the start of the
   pair just loaded: +8 if the NUL is in data1, +72 (8 + 64) if in data2.
   The flags from the cmp above are still live for the csel.  */
add tmp1, pos, #72
add pos, pos, #8
csel pos, pos, tmp1, ne
add src, src, pos, lsr #3
add dst, dst, pos, lsr #3
/* src was already post-incremented by 16 by the ldp, hence -32 on the
   source side versus -16 on the destination side: both address the 16
   bytes that end on the NUL.  */
ldp data1, data2, [src, #-32]
stp data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
/* dst is one past the NUL.  */
sub dstin, dst, #1
#endif
ret
.Lpage_cross:
bic src, srcin, #15
/* Start by loading two words at [srcin & ~15], then forcing the
bytes that precede srcin to 0xff. This means they never look
like termination bytes. */
ldp data1, data2, [src]
/* tmp1 still holds -to_align from the entry sequence.  */
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
/* tmp2 = all-ones unless to_align is a multiple of 8, in which case the
   Dword that contains srcin needs no partial mask.  */
tst to_align, #7
csetm tmp2, ne
#ifdef __AARCH64EB__
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#endif
orr data1, data1, tmp2
orr data2a, data2, tmp2
/* to_align < 8: srcin lies in data1, so keep the masked data1 and the
   raw data2.  Otherwise data1 is entirely before the string (forced to
   all-ones) and the masked copy of data2 is used.  */
cmp to_align, #8
csinv data1, data1, xzr, lt
csel data2, data2, data2a, lt
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
/* No NUL between srcin and the end of this aligned 16-byte block, so the
   unaligned 16-byte load at .Lpage_cross_ok is used after all.  */
b.eq .Lpage_cross_ok
/* We now need to make data1 and data2 look like they've been
loaded directly from srcin. Do a rotate on the 128-bit value. */
lsl tmp1, to_align, #3 /* Bytes->bits. */
neg tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
lsl data1a, data1, tmp1
lsr tmp4, data2, tmp2
lsl data2, data2, tmp1
orr tmp4, tmp4, data1a
cmp to_align, #8
csel data1, tmp4, data2, lt
rev tmp2, data1
rev tmp4, data2
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
#else
lsr data1a, data1, tmp1
lsl tmp4, data2, tmp2
lsr data2, data2, tmp1
orr tmp4, tmp4, data1a
cmp to_align, #8
csel data1, tmp4, data2, lt
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
/* Recompute the syndromes on the realigned data and rejoin the
   short-string exits.  */
bic has_nul1, tmp1, tmp2
cbnz has_nul1, .Lfp_le8
bic has_nul2, tmp3, tmp4
b .Lfp_gt8
.size STRCPY, . - STRCPY

View File

@ -0,0 +1,233 @@
/* Copyright (c) 2013-2015, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
/* Arguments and results. */
/* srcin and len share x0: the input pointer is dead once the length is
   written.  */
#define srcin x0
#define len x0
/* Locals and temporaries. */
/* NOTE: has_nul1/tmp1 both name x4 and has_nul2/tmp2 both name x5, so a
   write to one alias destroys the other.  */
#define src x1
#define data1 x2
#define data2 x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1 x4
#define tmp2 x5
#define tmp3 x6
#define tmp4 x7
#define zeroones x8
/* L(name) expands to the assembler-local label .Lname.  */
#define L(l) .L ## l
/* def_fn f p2align=N: switch to .text, align to 2^N bytes, and emit a
   global label f typed as a function.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. A faster check
(X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
false hits for characters 129..255. */
/* Byte-replicated constants used by the NUL-detection expressions.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Page size assumed by the entry check in strlen.  With TEST_PAGE_CROSS
   the value 15 makes the entry comparison (offset > MIN_PAGE_SIZE - 16)
   true for every input, so all calls take the page-cross path.  */
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 15
#else
# define MIN_PAGE_SIZE 4096
#endif
/* Since strings are short on average, we check the first 16 bytes
of the string for a NUL character. In order to do an unaligned ldp
safely we have to do a page cross check first. If there is a NUL
byte we calculate the length from the 2 8-byte words using
conditional select to reduce branch mispredictions (it is unlikely
strlen will be repeatedly called on strings with the same length).
If the string is longer than 16 bytes, we align src so don't need
further page cross checks, and process 32 bytes per iteration
using the fast NUL check. If we encounter non-ASCII characters,
fallback to a second loop using the full NUL check.
If the page cross check fails, we read 16 bytes from an aligned
address, remove any characters before the string, and continue
in the main loop using aligned loads. Since strings crossing a
page in the first 16 bytes are rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
AArch64 systems have a minimum page size of 4k. We don't bother
checking for larger page sizes - the cost of setting up the correct
page size is just not worth the extra gain from a small reduction in
the cases taking the slow path. Note that we only care about
whether the first fetch, which may be misaligned, crosses a page
boundary. */
/* strlen
   In:    x0 (srcin) = pointer to a NUL-terminated string.
   Out:   x0 (len)   = number of bytes before the first NUL.
   Uses:  x1-x8 and the condition flags; no stack accesses, no calls.
   The entry point is aligned to 64 bytes (p2align=6).  */
def_fn strlen p2align=6
/* tmp1 = offset of srcin within its page; an offset above
   MIN_PAGE_SIZE - 16 means a 16-byte load from srcin would cross into
   the next page.  */
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
b.gt L(page_cross)
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_nul1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
/* Accurate NUL check on both Dwords.  */
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
/* If has_nul1 == 0, compare has_nul2 with 0; otherwise force NZCV = 0000.
   EQ therefore means neither Dword contains a NUL.  */
ccmp has_nul2, 0, 0, eq
beq L(main_loop_entry)
/* Enter with C = has_nul1 == 0. */
/* cc (carry clear): the NUL is in data1, so the base length is 0;
   otherwise it is in data2 and the base length is 8.  */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
clz tmp1, has_nul1
csel len, xzr, len, cc
/* clz / 8 = index of the NUL byte within the selected Dword.  */
add len, len, tmp1, lsr 3
ret
/* The inner loop processes 32 bytes per iteration and uses the fast
NUL check. If we encounter non-ASCII characters, use a second
loop with the accurate NUL check. */
.p2align 4
L(main_loop_entry):
/* src = (srcin & ~15) - 16, so the pre-indexed load below first reads
   from (srcin & ~15) + 16.  */
bic src, srcin, 15
sub src, src, 16
L(main_loop):
ldp data1, data2, [src, 32]!
.Lpage_cross_entry:
/* Fast check: bit 7 of any byte of (X - 0x01..) set.  May report false
   hits; those are resolved at label 1.  */
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
tst tmp2, zeroones, lsl 7
bne 1f
/* Second 16 bytes of this 32-byte iteration; src is not advanced unless
   a hit is found here.  */
ldp data1, data2, [src, 16]
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
tst tmp2, zeroones, lsl 7
beq L(main_loop)
add src, src, 16
1:
/* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
/* EQ: the fast check was a false hit, so continue with the accurate
   loop.  */
beq L(nonascii_loop)
/* Enter with C = has_nul1 == 0. */
L(tail):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, cc
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, cc
#endif
/* len = (src - srcin) [+ 8 if the NUL is in data2] + index of the NUL
   byte within its Dword.  */
sub len, src, srcin
rev has_nul1, has_nul1
add tmp2, len, 8
clz tmp1, has_nul1
csel len, len, tmp2, cc
add len, len, tmp1, lsr 3
ret
L(nonascii_loop):
/* Accurate check on every 16 bytes, unrolled twice.  */
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
bne L(tail)
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(nonascii_loop)
b L(tail)
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
srcin to 0x7f, so we ignore any NUL bytes before the string.
Then continue in the aligned loop. */
L(page_cross):
bic src, srcin, 15
ldp data1, data2, [src]
/* Shift count in bits; the register-shift instructions below use only
   its low 6 bits.  */
lsl tmp1, srcin, 3
mov tmp4, -1
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#endif
/* ~tmp1 is now 0x7f in each byte position before the string start and
   0x00 elsewhere; ORing it in makes the preceding bytes non-zero.  */
orr tmp1, tmp1, REP8_80
orn data1, data1, tmp1
orn tmp2, data2, tmp1
/* Bit 3 of srcin clear: the string starts in data1, so use the masked
   data1 and the raw data2.  Otherwise data1 lies wholly before the
   string (replace it with all-ones) and use the masked data2.  */
tst srcin, 8
csel data1, data1, tmp4, eq
csel data2, data2, tmp2, eq
b L(page_cross_entry)
.size strlen, . - strlen

View File

@ -0,0 +1,222 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
*/
/* def_fn f p2align=N: switch to .text, align to 2^N bytes, and emit a
   global label f typed as a function.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Byte-replicated constants for the parallel NUL-detection expression.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Parameters and result. */
/* result shares x0 with src1.  */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0
/* Internal variables. */
/* data1w/data2w are the 32-bit views of data1/data2.  */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define limit_wd x13
#define mask x14
#define endloop x15
/* Align to 64 bytes, then emit 7 NOPs (28 bytes) ahead of the strncmp
   entry point.  def_fn is invoked without a p2align argument, so it adds
   no further alignment.  */
.text
.p2align 6
.rep 7
nop /* Pad so that the loop below fits a cache line. */
.endr
/* strncmp
   In:    x0 (src1), x1 (src2) = strings to compare; x2 (limit) = maximum
          number of bytes to compare.
   Out:   x0 (result) = 0 if equal within limit, otherwise negative or
          positive according to the first differing byte compared as
          unsigned.
   Uses:  x3-x15 and the condition flags; no stack accesses, no calls.

   Three strategies, chosen from the low 3 address bits:
     both 8-byte aligned        -> .Lloop_aligned (one Dword per iteration);
     same misalignment          -> .Lmutual_align, then the aligned loop;
     different misalignment     -> .Lmisaligned8 (byte loop).  */
def_fn strncmp
cbz limit, .Lret0
eor tmp1, src1, src2
mov zeroones, #REP8_01
/* Non-zero low bits in src1 ^ src2: the pointers can never be 8-byte
   aligned at the same time.  */
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* Start of performance-critical section -- one 64B cache line. */
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
/* limit_wd goes negative (MI) on the Dword that contains the limit.  */
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
/* EQ only if has_nul == 0 and endloop == 0.  */
ccmp endloop, #0, #0, eq
b.eq .Lloop_aligned
/* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, .Lnot_limit
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
b.eq .Lnot_limit
lsl limit, limit, #3 /* Bytes -> bits. */
/* mask covers the byte positions at and beyond the limit in this Dword.  */
mov mask, #~0
#ifdef __AARCH64EB__
lsr mask, mask, limit
#else
lsl mask, mask, limit
#endif
/* Clear the bytes beyond the limit in both inputs so they compare
   equal.  */
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
.Lnot_limit:
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#else
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
/* result = 0 if equal, 1 if data1 > data2 (unsigned), -1 if lower.  */
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The MS-non-zero bit of the syndrome marks either the first bit
that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#endif
.Lmutual_align:
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
We also need to adjust the limit calculations, but without
overflowing if the limit is near ULONG_MAX. */
/* On entry tmp1 = src1 & 7 (set by the ands at the top).  */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */
#endif
/* limit_wd = ((limit - 1) >> 3) + ((((limit - 1) & 7) + tmp1) >> 3):
   the Dword count is computed in two parts so that limit + tmp1 is never
   formed at full width.  */
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
add limit, limit, tmp1
add tmp3, tmp3, tmp1
/* Force the bytes before the start point to 0xff in both inputs: they
   then compare equal and cannot look like a NUL.  */
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
b .Lstart_realigned
.Lret0:
mov result, #0
ret
.p2align 6
.Lmisaligned8:
/* Byte-at-a-time loop.  limit is pre-decremented so that the subs in
   the loop clears the carry flag on the byte that reaches the limit.  */
sub limit, limit, #1
1:
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
/* Chain of conditions; the loop continues (EQ) only if the limit has
   not been reached, data1 is not NUL, and data1 == data2.  */
ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq 1b
sub result, data1, data2
ret
.size strncmp, . - strncmp

View File

@ -0,0 +1,181 @@
/* strnlen - calculate the length of a string with limit.
Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
* ARMv8-a, AArch64
*/
/* Arguments and results. */
/* len shares x0 with srcin: the input pointer is dead once the length is
   written.  */
#define srcin x0
#define len x0
#define limit x1
/* Locals and temporaries. */
#define src x2
#define data1 x3
#define data2 x4
#define data2a x5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define pos x13
#define limit_wd x14
/* def_fn f p2align=N: switch to .text, align to 2^N bytes, and emit a
   global label f typed as a function.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Byte-replicated constants for the parallel NUL-detection expression.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* strnlen
   In:    x0 (srcin) = string pointer; x1 (limit) = maximum length.
   Out:   x0 (len)   = index of the first NUL, or limit if no NUL is found
          within the first limit bytes.
   Uses:  x2-x14 and the condition flags; no stack accesses, no calls.
   The source is read in 16-byte-aligned blocks starting at srcin & ~15.
   The symbol size covers everything from .Lstart, i.e. the padding and
   the shared .Lhit_limit exit placed before the entry point.  */
.text
.p2align 6
.Lstart:
/* Pre-pad to ensure critical loop begins an icache line. */
.rep 7
nop
.endr
/* Put this code here to avoid wasting more space with pre-padding. */
.Lhit_limit:
/* No NUL within limit bytes: return limit.  */
mov len, limit
ret
def_fn strnlen
cbz limit, .Lhit_limit
mov zeroones, #REP8_01
bic src, srcin, #15
/* tmp1 = srcin & 15; non-zero means the first block needs masking.  */
ands tmp1, srcin, #15
b.ne .Lmisaligned
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
/* Start of critical section -- keep to one 64Byte cache line. */
.Lloop:
ldp data1, data2, [src], #16
.Lrealigned:
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
/* limit_wd goes negative (MI) on the block that contains the limit.  */
subs limit_wd, limit_wd, #1
orr tmp1, has_nul1, has_nul2
/* EQ only if the limit has not been reached and no NUL was seen.  */
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
b.eq .Lloop
/* End of critical section -- keep to one 64Byte cache line. */
orr tmp1, has_nul1, has_nul2
cbz tmp1, .Lhit_limit /* No null in final Qword. */
/* We know there's a null in the final Qword. The easiest thing
to do now is work out the length of the string and return
MIN (len, limit). */
/* src already points 16 bytes past the block just examined; the two
   "sub len, len, #8" below step back to the Dword holding the NUL.  */
sub len, src, srcin
cbz has_nul1, .Lnul_in_data2
#ifdef __AARCH64EB__
mov data2, data1
#endif
sub len, len, #8
mov has_nul2, has_nul1
.Lnul_in_data2:
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
rev data2, data2
sub tmp1, data2, zeroones
orr tmp2, data2, #REP8_7f
bic has_nul2, tmp1, tmp2
#endif
sub len, len, #8
rev has_nul2, has_nul2
clz pos, has_nul2
add len, len, pos, lsr #3 /* Bits to bytes. */
/* Unsigned comparison: the NUL may lie beyond limit within the final
   block.  */
cmp len, limit
csel len, len, limit, ls /* Return the lower value. */
ret
.Lmisaligned:
/* Deal with a partial first word.
We're doing two things in parallel here;
1) Calculate the number of words (but avoiding overflow if
limit is near ULONG_MAX) - to do this we need to work out
limit + tmp1 - 1 as a 65-bit value before shifting it;
2) Load and mask the initial data words - we force the bytes
before the ones we are interested in to 0xff - this ensures
early bytes will not hit any zero detection. */
sub limit_wd, limit, #1
neg tmp4, tmp1
/* Flags from this cmp are consumed by the csinv/csel at the end of the
   block; nothing in between writes the flags.  */
cmp tmp1, #8
and tmp3, limit_wd, #15
lsr limit_wd, limit_wd, #4
mov tmp2, #~0
ldp data1, data2, [src], #16
lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
add tmp3, tmp3, tmp1
#ifdef __AARCH64EB__
/* Big-endian. Early bytes are at MSB. */
lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
#endif
/* limit_wd = ((limit - 1) >> 4) + ((((limit - 1) & 15) + tmp1) >> 4).  */
add limit_wd, limit_wd, tmp3, lsr #4
orr data1, data1, tmp2
orr data2a, data2, tmp2
/* tmp1 <= 8: keep the masked data1 and the raw data2.  tmp1 > 8: data1
   lies wholly before the string (force to all-ones) and the masked copy
   of data2 is used.  */
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
b .Lrealigned
.size strnlen, . - .Lstart /* Include pre-padding in size. */

View File

@ -0,0 +1,155 @@
/* Copyright (c) 2010-2011, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Written by Dave Gilbert <david.gilbert@linaro.org>
This memchr routine is optimised on a Cortex-A9 and should work on
all ARMv7 processors. It has a fast path for short sizes, and has
an optimised path for large data sets; the worst case is finding the
match early in a large data set.
*/
@ 2011-02-07 david.gilbert@linaro.org
@ Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@ Import endianness fix from local git ea786f1b
@ 2011-12-07 david.gilbert@linaro.org
@ Removed unneeded cbz from align loop
.syntax unified
.arch armv7-a
@ this lets us check a flag in a 00/ff byte easily in either endianness
@ CHARTSTMASK(c) yields a single-bit mask that falls inside the c-th byte
@ in memory order (c = 0 is the lowest-addressed byte) of a loaded word.
@ The expansion is not parenthesised, so only use it where operator
@ precedence cannot matter (here: directly as a tst immediate).
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
@ memchr (Thumb-2)
@ Registers: r3 is scratch; r4-r7 are pushed before the double-word loop
@ and popped on every path that leaves it.  r1 and r2 are modified.
@ Local labels: 5 = byte loop up to 8-byte alignment, 15 = 8 bytes per
@ iteration using uadd8/sel, 21 = trailing/short byte loop, 40 = not
@ found, 50 = found by a byte loop, 60 = found by the double-word loop.
.text
.thumb
@ ---------------------------------------------------------------------------
.thumb_func
.align 2
.p2align 4,,15
.global memchr
.type memchr,%function
memchr:
@ r0 = start of memory to scan
@ r1 = character to look for
@ r2 = length
@ returns r0 = pointer to character or NULL if not found
and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
cmp r2,#16 @ If it's short don't bother with anything clever
@ NOTE(review): blt is a signed test, so a length with bit 31 set also
@ takes the byte loop at 20/21.  The result is the same, only slower;
@ an unsigned test (blo) may have been intended - confirm before changing.
blt 20f
tst r0, #7 @ If it's already aligned skip the next bit
beq 10f
@ Work up to an aligned point
5:
ldrb r3, [r0],#1
subs r2, r2, #1
cmp r3, r1
beq 50f @ If it matches exit found
tst r0, #7
bne 5b @ If not aligned yet then do next byte
10:
@ At this point, we are aligned, we know we have at least 8 bytes to work with
push {r4,r5,r6,r7}
orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
orr r1, r1, r1, lsl #16
bic r4, r2, #7 @ Number of double words to work with
mvns r7, #0 @ all F's
movs r3, #0
15:
ldmia r0!,{r5,r6}
subs r4, r4, #8
@ Nothing between the subs above and the bne below writes NZCV
@ (uadd8 only updates the GE bits), so the loop test uses the subs result.
eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
eor r6,r6, r1
uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
sel r6, r5, r7 @ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
@ r6 is non-zero if either word contained a match (it takes the bytes of
@ r5 wherever the second word had no match).
cbnz r6, 60f
bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
pop {r4,r5,r6,r7}
and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
20:
cbz r2, 40f @ 0 length or hit the end already then not found
21: @ Post aligned section, or just a short call
ldrb r3,[r0],#1
subs r2,r2,#1
eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
cbz r3, 50f
bne 21b @ on r2 flags
40:
movs r0,#0 @ not found
bx lr
50:
@ r0 was post-incremented past the matching byte, so step back by one.
subs r0,r0,#1 @ found
bx lr
60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
@ r0 points to the start of the double word after the one that was tested
@ r5 has the 00/ff pattern for the first word, r6 has the chained value
cmp r5, #0
itte eq
moveq r5, r6 @ the end is in the 2nd word
subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
subne r0,r0,#7 @ or 2nd byte of 1st word
@ r0 currently points to the 2nd byte of the word containing the hit
@ (the final subs at 61 steps back one byte to give the result)
tst r5, # CHARTSTMASK(0) @ 1st character
bne 61f
adds r0,r0,#1
tst r5, # CHARTSTMASK(1) @ 2nd character
ittt eq
addeq r0,r0,#1
tsteq r5, # (3<<15) @ 2nd & 3rd character
@ If not the 3rd must be the last one
addeq r0,r0,#1
61:
pop {r4,r5,r6,r7}
subs r0,r0,#1
bx lr

View File

@ -0,0 +1,617 @@
/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
This memcpy routine is optimised for Cortex-A15 cores and takes advantage
of VFP or NEON when built with the appropriate flags.
Assumptions:
ARMv6 (ARMv7-a if using Neon)
ARM state
Unaligned accesses
*/
.syntax unified
/* This implementation requires ARM state. */
.arm
/* Build variant selection from compiler predefines:
     __ARM_NEON__ defined  -> ARMv7-A + NEON, USE_VFP and USE_NEON set,
                              FRAME_SIZE 4;
     else not __SOFTFP__   -> ARMv6 + VFPv2, USE_VFP set, FRAME_SIZE 32;
     else                  -> ARMv6 integer only, FRAME_SIZE 32.  */
#ifdef __ARM_NEON__
.fpu neon
.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON
#elif !defined (__SOFTFP__)
.arch armv6
.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP
#else
.arch armv6
# define FRAME_SIZE 32
#endif
/* Old versions of GAS incorrectly implement the NEON align semantics. */
/* ALIGN(addr, align) spells an address operand with an alignment hint in
   whichever form the assembler accepts.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
/* Used when computing offsets for "add pc, pc, ..." dispatch.  */
#define PC_OFFSET 8 /* PC pipeline compensation. */
#define INSN_SIZE 4
/* Call parameters. */
#define dstin r0
#define src r1
#define count r2
/* Locals. */
#define tmp1 r3
#define dst ip
#define tmp2 r10
#ifndef USE_NEON
/* For bulk copies using GP registers. */
/* NOTE: A_l is the same register as count (r2) and A_h the same as tmp1
   (r3).  */
#define A_l r2 /* Call-clobbered. */
#define A_h r3 /* Call-clobbered. */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
#define D_l r8
#define D_h r9
#endif
/* Number of lines ahead to pre-fetch data. If you change this the code
below will need adjustment to compensate. */
#define prefetch_lines 5
#ifdef USE_VFP
/* cpy_line_vfp vreg, base
   Copy one 64-byte line at offset BASE using d0-d2 plus the line's
   dedicated read-ahead register VREG.

   Every register is stored to DST and then immediately reloaded from SRC.
   In memcpy's long-copy loop SRC runs 32 bytes ahead of DST, so a value
   reloaded from [src, #X] is the data later stored to [dst, #X + 32].
   The one exception is the second reload of VREG, which reads
   prefetch_lines lines ahead; that early touch stands in for a PLD and
   the value is stored when this line slot comes round again.

   The store/reload order is deliberate; do not reschedule. */
.macro cpy_line_vfp vreg, base
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
vstr \vreg, [dst, #\base + 32]
/* Read-ahead: first doubleword of the line prefetch_lines further on. */
vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
/* cpy_tail_vfp vreg, base
   Drain variant of cpy_line_vfp: identical store/reload sequence for the
   64-byte line at offset BASE, except that VREG is NOT reloaded after its
   second store.  memcpy uses it once the read-ahead loop has finished, so
   no load is issued prefetch_lines lines beyond the current position. */
.macro cpy_tail_vfp vreg, base
vstr \vreg, [dst, #\base]
vldr \vreg, [src, #\base]
vstr d0, [dst, #\base + 8]
vldr d0, [src, #\base + 8]
vstr d1, [dst, #\base + 16]
vldr d1, [src, #\base + 16]
vstr d2, [dst, #\base + 24]
vldr d2, [src, #\base + 24]
/* Last use of VREG for this line: store only, no read-ahead reload. */
vstr \vreg, [dst, #\base + 32]
vstr d0, [dst, #\base + 40]
vldr d0, [src, #\base + 40]
vstr d1, [dst, #\base + 48]
vldr d1, [src, #\base + 48]
vstr d2, [dst, #\base + 56]
vldr d2, [src, #\base + 56]
.endm
#endif
/* def_fn f [p2align=N]
   Open the global function F in .text: export the symbol, mark it as a
   function, align the location counter to 2^N bytes and place the label. */
.macro def_fn f p2align=0
	.globl \f
	.type \f, %function
	.text
	.p2align \p2align
\f:
.endm
/* void *memcpy (void *dstin, const void *src, size_t count)

   In:    r0 = dstin, r1 = src, r2 = count (unsigned byte count).
   Out:   r0 = dstin.  r0 is never written; ip (dst) is the working pointer.
   Uses:  r3 (tmp1), ip (dst), flags, and r10 (tmp2), which is spilled to a
          FRAME_SIZE-byte stack frame before use and reloaded on every exit.
          Without NEON the bulk loops also use r4-r9, spilled to the same
          frame.  With VFP/NEON, d0-d7 are used.

   Structure:
     count < 64                     -> .Ltail63unaligned (computed jump)
     src/dst share 8-byte alignment -> align, then .Lcpy_body_medium or,
                                       for large lengths, .Lcpy_body_long
     otherwise                      -> .Lcpy_notaligned

   COUNT is a size_t, so every length test uses the unsigned conditions
   HS/LO.  With signed conditions a length of 2GB or more looks negative
   and is routed to the short-copy tail, which copies only a fragment of
   the request (the defect class reported as CVE-2020-6096).  The MI tests
   in the alignment preambles are unrelated: they examine a bit derived
   from the destination address, not the length. */
def_fn memcpy p2align=6
	mov dst, dstin /* Preserve dstin, we need to return it. */
	cmp count, #64
	bhs .Lcpy_not_short /* Unsigned: count is a size_t. */
	/* Deal with small copies quickly by dropping straight into the
	   exit block. */
.Ltail63unaligned:
#ifdef USE_NEON
	/* Computed jump.  Each 8-byte chunk is copied by one vld1/vst1 pair,
	   which is also 8 bytes of code, so skipping 56 - (count & 0x38)
	   bytes of code leaves exactly the pairs that are still needed.
	   Reading PC yields the address of the ADD plus PC_OFFSET. */
	and tmp1, count, #0x38
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add pc, pc, tmp1
	vld1.8 {d0}, [src]! /* 14 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 12 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 10 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 8 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 6 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 4 words to go. */
	vst1.8 {d0}, [dst]!
	vld1.8 {d0}, [src]! /* 2 words to go. */
	vst1.8 {d0}, [dst]!
	tst count, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data. May not be aligned. */
	/* Cannot use VFP for unaligned data. */
	/* SRC and DST are advanced past the words first and the copies use
	   negative offsets.  Each word needs an LDR/STR pair (8 bytes of
	   code per 4 bytes of data), hence the halved constants and the
	   LSL #1 on the jump offset. */
	and tmp1, count, #0x3c
	add dst, dst, tmp1
	add src, src, tmp1
	rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset. */
	add pc, pc, tmp1, lsl #1
	ldr tmp1, [src, #-60] /* 15 words to go. */
	str tmp1, [dst, #-60]
	ldr tmp1, [src, #-56] /* 14 words to go. */
	str tmp1, [dst, #-56]
	ldr tmp1, [src, #-52]
	str tmp1, [dst, #-52]
	ldr tmp1, [src, #-48] /* 12 words to go. */
	str tmp1, [dst, #-48]
	ldr tmp1, [src, #-44]
	str tmp1, [dst, #-44]
	ldr tmp1, [src, #-40] /* 10 words to go. */
	str tmp1, [dst, #-40]
	ldr tmp1, [src, #-36]
	str tmp1, [dst, #-36]
	ldr tmp1, [src, #-32] /* 8 words to go. */
	str tmp1, [dst, #-32]
	ldr tmp1, [src, #-28]
	str tmp1, [dst, #-28]
	ldr tmp1, [src, #-24] /* 6 words to go. */
	str tmp1, [dst, #-24]
	ldr tmp1, [src, #-20]
	str tmp1, [dst, #-20]
	ldr tmp1, [src, #-16] /* 4 words to go. */
	str tmp1, [dst, #-16]
	ldr tmp1, [src, #-12]
	str tmp1, [dst, #-12]
	ldr tmp1, [src, #-8] /* 2 words to go. */
	str tmp1, [dst, #-8]
	ldr tmp1, [src, #-4]
	str tmp1, [dst, #-4]
#endif
	/* Final 0-3 bytes: the shift moves bit 1 of count into C (halfword
	   pending) and leaves Z clear when bit 0 is set (byte pending). */
	lsls count, count, #31
	ldrhcs tmp1, [src], #2
	ldrbne src, [src] /* Src is dead, use as a scratch. */
	strhcs tmp1, [dst], #2
	strbne src, [dst]
	bx lr
.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet. */
	str tmp2, [sp, #-FRAME_SIZE]! /* Open the frame; tmp2 lives at [sp]. */
	and tmp2, src, #7
	and tmp1, dst, #7
	cmp tmp1, tmp2
	bne .Lcpy_notaligned
#ifdef USE_VFP
	/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores. This is outside the critical loop. */
	vmov.f32 s0, s0
#endif
	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment. */
	lsls tmp2, dst, #29
	beq 1f
	/* tmp2 = (bytes needed to reach alignment) << 29.  N then selects the
	   word copy; after the second shift C selects the halfword and a
	   non-zero result selects the byte. */
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src], #1
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst], #1
1:
	subs tmp2, count, #64 /* Use tmp2 for count. */
	blo .Ltail63aligned /* Unsigned: fewer than 64 bytes remain. */
	cmp tmp2, #512
	bhs .Lcpy_body_long /* Unsigned length compare. */
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
	/* 64 bytes per pass.  The flags set by SUBS survive to the branch at
	   the bottom because VLDR, VSTR and plain ADD leave them alone. */
1:
	vldr d0, [src, #0]
	subs tmp2, tmp2, #64
	vldr d1, [src, #8]
	vstr d0, [dst, #0]
	vldr d0, [src, #16]
	vstr d1, [dst, #8]
	vldr d1, [src, #24]
	vstr d0, [dst, #16]
	vldr d0, [src, #32]
	vstr d1, [dst, #24]
	vldr d1, [src, #40]
	vstr d0, [dst, #32]
	vldr d0, [src, #48]
	vstr d1, [dst, #40]
	vldr d1, [src, #56]
	vstr d0, [dst, #48]
	add src, src, #64
	vstr d1, [dst, #56]
	add dst, dst, #64
	bhs 1b /* No borrow: another full line remains. */
	tst tmp2, #0x3f
	beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
	/* Computed jump over the doubleword copies that are not needed; see
	   .Ltail63unaligned for the arithmetic.  Only the low six bits of
	   tmp2 are meaningful here. */
	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add pc, pc, tmp1
	vldr d0, [src, #-56] /* 14 words to go. */
	vstr d0, [dst, #-56]
	vldr d0, [src, #-48] /* 12 words to go. */
	vstr d0, [dst, #-48]
	vldr d0, [src, #-40] /* 10 words to go. */
	vstr d0, [dst, #-40]
	vldr d0, [src, #-32] /* 8 words to go. */
	vstr d0, [dst, #-32]
	vldr d0, [src, #-24] /* 6 words to go. */
	vstr d0, [dst, #-24]
	vldr d0, [src, #-16] /* 4 words to go. */
	vstr d0, [dst, #-16]
	vldr d0, [src, #-8] /* 2 words to go. */
	vstr d0, [dst, #-8]
#else
	/* Pre-bias SRC and DST by -8 so the last pair of each pass can use
	   pre-indexed writeback to advance them by 64. */
	sub src, src, #8
	sub dst, dst, #8
1:
	ldrd A_l, A_h, [src, #8]
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #16]
	strd A_l, A_h, [dst, #16]
	ldrd A_l, A_h, [src, #24]
	strd A_l, A_h, [dst, #24]
	ldrd A_l, A_h, [src, #32]
	strd A_l, A_h, [dst, #32]
	ldrd A_l, A_h, [src, #40]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #48]
	strd A_l, A_h, [dst, #48]
	ldrd A_l, A_h, [src, #56]
	strd A_l, A_h, [dst, #56]
	ldrd A_l, A_h, [src, #64]!
	strd A_l, A_h, [dst, #64]!
	subs tmp2, tmp2, #64
	bhs 1b /* No borrow: another full line remains. */
	tst tmp2, #0x3f
	bne 1f
	ldr tmp2,[sp], #FRAME_SIZE
	bx lr
1:
	/* Undo the -8 bias before entering the tail. */
	add src, src, #8
	add dst, dst, #8
.Ltail63aligned: /* Count in tmp2. */
	/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency. */
	/* TMP2 has wrapped below zero, but we don't care about that. The
	   bottom six bits still tell us how many bytes are left to copy. */
	and tmp1, tmp2, #0x38
	add dst, dst, tmp1
	add src, src, tmp1
	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add pc, pc, tmp1
	ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
	strd A_l, A_h, [dst, #-56]
	ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
	strd A_l, A_h, [dst, #-48]
	ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
	strd A_l, A_h, [dst, #-40]
	ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
	strd A_l, A_h, [dst, #-32]
	ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
	strd A_l, A_h, [dst, #-24]
	ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
	strd A_l, A_h, [dst, #-16]
	ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
	strd A_l, A_h, [dst, #-8]
#endif
	/* Remaining 0-7 bytes: word, then halfword, then byte. */
	tst tmp2, #4
	ldrne tmp1, [src], #4
	strne tmp1, [dst], #4
	lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
	ldrhcs tmp1, [src], #2
	ldrbne tmp2, [src]
	strhcs tmp1, [dst], #2
	strbne tmp2, [dst]
.Ldone:
	ldr tmp2, [sp], #FRAME_SIZE /* Restore tmp2 and release the frame. */
	bx lr
.Lcpy_body_long: /* Count in tmp2. */
	/* Long copy. We know that there's at least (prefetch_lines * 64)
	   bytes to go. */
#ifdef USE_VFP
	/* Don't use PLD. Instead, read some data in advance of the current
	   copy position into a register. This should act like a PLD
	   operation but we won't have to repeat the transfer. */
	vldr d3, [src, #0]
	vldr d4, [src, #64]
	vldr d5, [src, #128]
	vldr d6, [src, #192]
	vldr d7, [src, #256]
	vldr d0, [src, #8]
	vldr d1, [src, #16]
	vldr d2, [src, #24]
	/* From here SRC runs 32 bytes ahead of DST; the cpy_*_vfp macros
	   rely on that bias. */
	add src, src, #32
	subs tmp2, tmp2, #prefetch_lines * 64 * 2
	blo 2f /* Unsigned: not enough left for a read-ahead pass. */
1:
	cpy_line_vfp d3, 0
	cpy_line_vfp d4, 64
	cpy_line_vfp d5, 128
	add dst, dst, #3 * 64
	add src, src, #3 * 64
	cpy_line_vfp d6, 0
	cpy_line_vfp d7, 64
	add dst, dst, #2 * 64
	add src, src, #2 * 64
	subs tmp2, tmp2, #prefetch_lines * 64
	bhs 1b /* No borrow: another prefetch_lines lines remain. */
2:
	/* Drain: store the lines already held in registers without reading
	   any further ahead. */
	cpy_tail_vfp d3, 0
	cpy_tail_vfp d4, 64
	cpy_tail_vfp d5, 128
	add src, src, #3 * 64
	add dst, dst, #3 * 64
	cpy_tail_vfp d6, 0
	vstr d7, [dst, #64]
	vldr d7, [src, #64]
	vstr d0, [dst, #64 + 8]
	vldr d0, [src, #64 + 8]
	vstr d1, [dst, #64 + 16]
	vldr d1, [src, #64 + 16]
	vstr d2, [dst, #64 + 24]
	vldr d2, [src, #64 + 24]
	vstr d7, [dst, #64 + 32]
	add src, src, #96 /* 128 minus the 32-byte bias: SRC matches DST again. */
	vstr d0, [dst, #64 + 40]
	vstr d1, [dst, #64 + 48]
	vstr d2, [dst, #64 + 56]
	add dst, dst, #128
	add tmp2, tmp2, #prefetch_lines * 64
	b .Lcpy_body_medium
#else
	/* Long copy. Use an SMS style loop to maximize the I/O
	   bandwidth of the core. We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations. */
	/* Pre-bias src and dst. */
	sub src, src, #8
	sub dst, dst, #8
	pld [src, #8]
	pld [src, #72]
	subs tmp2, tmp2, #64
	pld [src, #136]
	/* Prime the pipeline with the first 32 bytes while spilling the
	   callee-saved pairs into the frame.  None of the loads, stores or
	   PLDs touch the flags, so the carry from the SUBS above is still
	   valid at the BCS below (C set = no borrow, an unsigned test). */
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	pld [src, #200]
	ldrd D_l, D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #232]
	strd A_l, A_h, [dst, #40]
	ldrd A_l, A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldrd D_l, D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldrd A_l, A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldrd B_l, B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldrd C_l, C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldrd D_l, D_h, [src, #32]
	bcs 2b
	/* Save the remaining bytes and restore the callee-saved regs. */
	strd A_l, A_h, [dst, #40]
	add src, src, #40
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	add dst, dst, #72
	tst tmp2, #0x3f
	bne .Ltail63aligned
	ldr tmp2, [sp], #FRAME_SIZE
	bx lr
#endif
.Lcpy_notaligned:
	pld [src]
	pld [src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment. */
	/* Bring DST to 64-bit alignment. */
	lsls tmp2, dst, #29
	pld [src, #(2 * 64)]
	beq 1f
	/* Same flag trick as in the aligned preamble: N selects the word,
	   then C the halfword and a non-zero result the byte. */
	rsbs tmp2, tmp2, #0
	sub count, count, tmp2, lsr #29
	ldrmi tmp1, [src], #4
	strmi tmp1, [dst], #4
	lsls tmp2, tmp2, #2
	ldrbne tmp1, [src], #1
	ldrhcs tmp2, [src], #2
	strbne tmp1, [dst], #1
	strhcs tmp2, [dst], #2
1:
	pld [src, #(3 * 64)]
	subs count, count, #64
	/* Unsigned: fewer than 64 bytes remain.  Release the frame and let
	   the short-copy code finish; it only looks at the low six bits of
	   count, which the subtraction leaves intact. */
	ldrlo tmp2, [sp], #FRAME_SIZE
	blo .Ltail63unaligned
	pld [src, #(4 * 64)]
#ifdef USE_NEON
	vld1.8 {d0-d3}, [src]!
	vld1.8 {d4-d7}, [src]!
	subs count, count, #64
	blo 2f /* Unsigned: only the 64 bytes already loaded. */
1:
	pld [src, #(4 * 64)]
	vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
	vld1.8 {d0-d3}, [src]!
	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
	vld1.8 {d4-d7}, [src]!
	subs count, count, #64
	bhs 1b /* No borrow: another 64 bytes remain. */
2:
	vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
	ands count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth. */
	/* SRC may only be byte aligned, so it is read with single-word LDRs;
	   DST is 64-bit aligned and is written with STRD.  The carry from
	   the SUBS below is consumed by the BCS at the bottom of the loop. */
	sub src, src, #4
	sub dst, dst, #8
	subs tmp2, count, #64 /* Use tmp2 for count. */
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [sp, #8]
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [sp, #16]
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [sp, #24]
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]!
	b 1f
	.p2align 6
2:
	pld [src, #(5 * 64) - (32 - 4)]
	strd A_l, A_h, [dst, #40]
	ldr A_l, [src, #36]
	ldr A_h, [src, #40]
	strd B_l, B_h, [dst, #48]
	ldr B_l, [src, #44]
	ldr B_h, [src, #48]
	strd C_l, C_h, [dst, #56]
	ldr C_l, [src, #52]
	ldr C_h, [src, #56]
	strd D_l, D_h, [dst, #64]!
	ldr D_l, [src, #60]
	ldr D_h, [src, #64]!
	subs tmp2, tmp2, #64
1:
	strd A_l, A_h, [dst, #8]
	ldr A_l, [src, #4]
	ldr A_h, [src, #8]
	strd B_l, B_h, [dst, #16]
	ldr B_l, [src, #12]
	ldr B_h, [src, #16]
	strd C_l, C_h, [dst, #24]
	ldr C_l, [src, #20]
	ldr C_h, [src, #24]
	strd D_l, D_h, [dst, #32]
	ldr D_l, [src, #28]
	ldr D_h, [src, #32]
	bcs 2b
	/* Save the remaining bytes and restore the callee-saved regs. */
	strd A_l, A_h, [dst, #40]
	add src, src, #36
	strd B_l, B_h, [dst, #48]
	ldrd B_l, B_h, [sp, #8]
	strd C_l, C_h, [dst, #56]
	ldrd C_l, C_h, [sp, #16]
	strd D_l, D_h, [dst, #64]
	ldrd D_l, D_h, [sp, #24]
	add dst, dst, #72
	ands count, tmp2, #0x3f
#endif
	/* LDR leaves the flags alone, so the Z flag from the ANDS above still
	   decides whether any tail bytes remain. */
	ldr tmp2, [sp], #FRAME_SIZE
	bne .Ltail63unaligned
	bx lr
	.size memcpy, . - memcpy

View File

@ -0,0 +1,122 @@
/* Copyright (c) 2010-2011, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Written by Dave Gilbert <david.gilbert@linaro.org>
This memset routine is optimised on a Cortex-A9 and should work on
all ARMv7 processors.
*/
.syntax unified
.arch armv7-a
@ 2011-08-30 david.gilbert@linaro.org
@ Extracted from local git 2f11b436
@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
	.text
	.thumb
@ ---------------------------------------------------------------------------
@ void *memset(void *s, int c, size_t n)
@
@   In:    r0 = address, r1 = character, r2 = count
@   Out:   r0 = original address (r0 is never written)
@   Uses:  r1, r2, r3 and the flags.  r4-r7 are pushed before use and
@          popped again before the final byte loop.
@
@   Strategy: byte stores up to an 8-byte boundary, then 16 bytes per
@   STMIA, then one optional 8-byte STMIA, then byte stores for the rest.
	.thumb_func
	.align 2
	.p2align 4,,15
	.global memset
	.type memset,%function
memset:
	mov r3, r0 @ r3 = write cursor; leave r0 alone
	cbz r2, .Lmemset_exit @ Exit if 0 length
	tst r0, #7
	beq .Lmemset_aligned @ Already aligned
	@ Misaligned start: store single bytes until the cursor reaches an
	@ 8-byte boundary or the count runs out.
.Lmemset_align_loop:
	strb r1, [r3], #1
	subs r2, r2, #1
	tst r3, #7 @ Z set once aligned; CBZ below leaves the flags alone
	cbz r2, .Lmemset_exit @ Exit if we hit the end
	bne .Lmemset_align_loop @ go round again if still misaligned
.Lmemset_aligned:
	@ The cursor is 8-byte aligned from here on.
	push {r4,r5,r6,r7}
	bics r4, r2, #15 @ r4 = count rounded down to a multiple of 16
	beq .Lmemset_tail @ fewer than 16 bytes: finish bytewise
	@ POSIX says that ch is cast to an unsigned char.  A uxtb is one
	@ byte and takes two cycles, where an AND is four bytes but one
	@ cycle.
	and r1, #0xFF
	orr r1, r1, r1, lsl#8 @ Same character into all bytes
	orr r1, r1, r1, lsl#16
	mov r5, r1
	mov r6, r1
	mov r7, r1
.Lmemset_fill16_loop:
	subs r4, r4, #16 @ STMIA does not alter the flags tested by BNE
	stmia r3!, {r1,r5,r6,r7}
	bne .Lmemset_fill16_loop
	and r2, r2, #15
	@ At this point the cursor is still aligned and at most 15 bytes are
	@ left to write.  Take one 8-byte chunk in a single store if possible
	@ so that fewer bytes go through the byte loop.
	tst r2, #8
	itt ne
	subne r2, r2, #8
	stmiane r3!, {r1,r5}
.Lmemset_tail:
	pop {r4,r5,r6,r7}
	cbz r2, .Lmemset_exit
	@ Write the last few bytes one at a time.  STRB stores only the low
	@ byte of r1, so this works whether or not r1 was replicated above.
.Lmemset_byte_loop:
	subs r2, r2, #1
	strb r1, [r3], #1
	bne .Lmemset_byte_loop
.Lmemset_exit:
	bx lr @ goodbye
	@ Record the symbol size, as memcpy and strcmp in this library do.
	.size memset, . - memset

View File

@ -0,0 +1,80 @@
/* Copyright (c) 2010-2011, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Written by Dave Gilbert <david.gilbert@linaro.org>
A very simple strchr routine, from benchmarks on A9 it's a bit faster than
the current version in eglibc (2.12.1-0ubuntu14 package)
I don't think doing a word at a time version is worth it since a lot
of strchr cases are very short anyway.
*/
@ 2011-02-07 david.gilbert@linaro.org
@ Extracted from local git a5b438d861
.syntax unified
.arch armv7-a
	.text
	.thumb
@ ---------------------------------------------------------------------------
@ char *strchr(const char *s, int c)
@
@   In:    r0 = start of string, r1 = character to match
@   Out:   r0 = pointer to the first match, or NULL if there is none.
@          Searching for NUL returns a pointer to the terminator.
@   Uses:  r1 (reduced to its low byte), r2 and the flags
	.thumb_func
	.align 2
	.p2align 4,,15
	.global strchr
	.type strchr,%function
strchr:
	and r1, r1, #255 @ compare against the low byte only
.Lstrchr_scan:
	ldrb r2, [r0], #1 @ r2 = next byte; r0 now points one past it
	cmp r2, r1
	cbz r2, .Lstrchr_at_nul @ CBZ leaves the CMP flags alone
	bne .Lstrchr_scan
	@ Matched: r0 has already moved past the match, so step back.
.Lstrchr_found:
	subs r0, r0, #1
	bx lr
.Lstrchr_at_nul:
	@ Ran off the end of the string.
	cmp r1, #0 @ Corner case: searching for NUL must return a pointer to it
	beq .Lstrchr_found @ A bit messy; if common, branch to a special loop at the start
	mov r0, #0
	bx lr
	@ Record the symbol size, as memcpy and strcmp in this library do.
	.size strchr, . - strchr

View File

@ -0,0 +1,500 @@
/*
* Copyright (c) 2012-2014 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Implementation of strcmp for ARMv7 when DSP instructions are
available. Use ldrd to support wider loads, provided the data
is sufficiently aligned. Use saturating arithmetic to optimize
the compares. */
/* Build Options:
STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
byte in the string. If comparing completely random strings
the pre-check will save time, since there is a very high
probability of a mismatch in the first character: we save
significant overhead if this is the common case. However,
if strings are likely to be identical (eg because we're
verifying a hit in a hash table), then this check is largely
redundant. */
/* Default for the STRCMP_NO_PRECHECK build option described above.  The
   guard lets the build override it (e.g. -DSTRCMP_NO_PRECHECK=1); an
   unconditional definition here would replace the requested value. */
#ifndef STRCMP_NO_PRECHECK
#define STRCMP_NO_PRECHECK 0
#endif
/* This version uses Thumb-2 code. */
.thumb
.syntax unified
/* Endian-neutral helpers.  "LO" is towards the byte at the lowest address
   of a loaded word, "HI" towards the highest:
     S2LO / S2HI     shift towards the low / high address end
     S2LOEQ          conditional (EQ) form of S2LO
     MSB / LSB       mask of the byte at the highest / lowest address
     BYTEn_OFFSET    bit position of the byte at address offset n */
#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
/* def_fn f [p2align=N]
   Open the global function F in .text: export the symbol, mark it as a
   function, align the location counter to 2^N bytes and place the label. */
.macro def_fn f p2align=0
	.globl \f
	.type \f, %function
	.text
	.p2align \p2align
\f:
.endm
/* Register aliases used by strcmp.  Several names share a register; the
   overlaps are intentional and noted below. */
/* Parameters and result. */
#define src1 r0
#define src2 r1
#define result r0 /* Overlaps src1. */
/* Internal variables.  r4/r5 are saved by strcmp's prologue and reloaded on
   its exit paths. */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12
/* Additional internal variables for 64-bit aligned data.  r6/r7 are saved
   at [sp, #8] by the prologue; the syndromes reuse tmp1/tmp2. */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2
/* Additional internal variables for 32-bit aligned data.  data1/data2
   alias data1a/data1b, and syndrome aliases tmp2. */
#define data1 r2
#define data2 r3
#define syndrome tmp2
/* Macro to compute and return the result value for word-aligned
cases.

   strcmp_epilogue_aligned synd d1 d2 restore_r6
     synd        syndrome register for the word pair (non-zero on entry)
     d1, d2      the data words loaded from src1 and src2
     restore_r6  1 to reload r6/r7 from [sp, #8] (the 8-byte loops use
                 them), 0 when they were never modified

   Extracts the byte of D1 and of D2 at the position flagged by SYND,
   reloads r4/r5, releases the 16-byte frame and returns their difference
   in RESULT.  The macro ends in BX LR, so control never falls through. */
.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
/* If data1 contains a zero byte, then syndrome will contain a 1 in
bit 7 of that byte. Otherwise, the highest set bit in the
syndrome will highlight the first different bit. It is therefore
sufficient to extract the eight bits starting with the syndrome
bit. */
clz tmp1, \synd
lsl r1, \d2, tmp1
.if \restore_r6
ldrd r6, r7, [sp, #8]
.endif
.cfi_restore 6
.cfi_restore 7
lsl \d1, \d1, tmp1
.cfi_remember_state
lsr result, \d1, #24
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
sub result, result, r1, lsr #24
bx lr
#else
/* To use the big-endian trick we'd have to reverse all three words.
That's slower than this approach. */
/* After REV, CLZ counts from the byte at the lowest address; clearing the
   low three bits rounds that down to the bit offset of the first flagged
   byte, which both data words are then shifted down by. */
rev \synd, \synd
clz tmp1, \synd
bic tmp1, tmp1, #7
lsr r1, \d2, tmp1
.cfi_remember_state
.if \restore_r6
ldrd r6, r7, [sp, #8]
.endif
.cfi_restore 6
.cfi_restore 7
lsr \d1, \d1, tmp1
and result, \d1, #255
and r1, r1, #255
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
sub result, result, r1
bx lr
#endif
.endm
/* int strcmp (const char *src1, const char *src2)

   In:   r0 = src1, r1 = src2.
   Out:  r0 = result: 0 when the strings are equal, otherwise the
         difference between the bytes at the first position where they
         differ.
   Uses: r2, r3, r12 and the flags; r4-r7 are saved in a 16-byte stack
         frame by the prologue.

   Paths, chosen from the pointer alignments:
     both 8-byte aligned                 .Lloop_aligned8   (16 bytes/pass)
     same offset within 8 bytes          align down and mask, then as above
     same offset within 4 bytes          .Lloop_aligned4   (8 bytes/pass)
     no common word alignment            .Lmisaligned4 then .Loverlap1/2/3

   Syndrome idiom, used throughout: UADD8 of a data word with 0xffffffff
   sets the GE flag of every non-zero byte, and SEL then builds a word
   holding data1 ^ data2 in those bytes and 0xff in bytes where data1 is
   zero.  A non-zero syndrome therefore means "difference or end of
   string". */
.text
.p2align 5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
/* Exit for the pre-check below.  It sits in front of the entry point and
   runs before any register has been saved, so it returns directly. */
.Lfastpath_exit:
sub r0, r2, r3
bx lr
nop
#endif
def_fn strcmp
#if STRCMP_NO_PRECHECK == 0
/* Pre-check: leave at once if the first byte of src1 is NUL (the CMP
   against 1 clears C and Z) or if the two first bytes differ. */
ldrb r2, [src1]
ldrb r3, [src2]
cmp r2, #1
it cs
cmpcs r2, r3
bne .Lfastpath_exit
#endif
.cfi_startproc
/* Prologue: 16-byte frame with r4/r5 at [sp] and r6/r7 at [sp, #8]. */
strd r4, r5, [sp, #-16]!
.cfi_def_cfa_offset 16
.cfi_offset 4, -16
.cfi_offset 5, -12
orr tmp1, src1, src2
strd r6, r7, [sp, #8]
.cfi_offset 6, -8
.cfi_offset 7, -4
mvn const_m1, #0
/* r2 == 0 iff the low three bits of both pointers are zero. */
lsl r2, tmp1, #29
cbz r2, .Lloop_aligned8
.Lnot_aligned:
eor tmp1, src1, src2
tst tmp1, #7
bne .Lmisaligned8
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
and tmp1, src1, #7
bic src1, src1, #7
and tmp2, tmp1, #3
bic src2, src2, #7
lsl tmp2, tmp2, #3 /* Bytes -> bits. */
ldrd data1a, data1b, [src1], #16
tst tmp1, #4
ldrd data2a, data2b, [src2], #16
/* In thumb code we can't use MVN with a register shift, but
we do have ORN. */
S2HI tmp1, const_m1, tmp2
orn data1a, data1a, tmp1
orn data2a, data2a, tmp1
beq .Lstart_realigned8
/* The strings start in the second word of the pair: apply the mask to
   that word and force the whole first word to all-ones. */
orn data1b, data1b, tmp1
mov data1a, const_m1
orn data2b, data2b, tmp1
mov data2a, const_m1
b .Lstart_realigned8
/* Unroll the inner loop by a factor of 2, giving 16 bytes per
pass. */
.p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
.p2align 2 /* Always word aligned. */
.Lloop_aligned8:
ldrd data1a, data1b, [src1], #16
ldrd data2a, data2b, [src2], #16
.Lstart_realigned8:
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
cbnz syndrome_a, .Ldiff_in_a
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
cbnz syndrome_b, .Ldiff_in_b
/* Second half of the 16-byte pass; the pointers have already been
   advanced by 16, hence the -8 offsets. */
ldrd data1a, data1b, [src1, #-8]
ldrd data2a, data2b, [src2, #-8]
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
/* Can't use CBZ for backwards branch. */
orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
beq .Lloop_aligned8
.Ldiff_found:
cbnz syndrome_a, .Ldiff_in_a
.Ldiff_in_b:
strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
.Ldiff_in_a:
.cfi_restore_state
strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
.cfi_restore_state
.Lmisaligned8:
/* tmp1 still holds src1 ^ src2 from .Lnot_aligned. */
tst tmp1, #3
bne .Lmisaligned4
ands tmp1, src1, #3
bne .Lmutual_align4
/* Unrolled by a factor of 2, to reduce the number of post-increment
operations. */
.Lloop_aligned4:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned4:
uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cbnz syndrome, .Laligned4_done
ldr data1, [src1, #-4]
ldr data2, [src2, #-4]
uadd8 syndrome, data1, const_m1
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cmp syndrome, #0
beq .Lloop_aligned4
.Laligned4_done:
/* r6/r7 are not modified on the 4-byte paths, so they are not reloaded. */
strcmp_epilogue_aligned syndrome, data1, data2, 0
.Lmutual_align4:
.cfi_restore_state
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
lsl tmp1, tmp1, #3 /* Bytes -> bits. */
bic src1, src1, #3
ldr data1, [src1], #8
bic src2, src2, #3
ldr data2, [src2], #8
/* In thumb code we can't use MVN with a register shift, but
we do have ORN. */
S2HI tmp1, const_m1, tmp1
orn data1, data1, tmp1
orn data2, data2, tmp1
b .Lstart_realigned4
.Lmisaligned4:
/* No common word alignment.  Compare bytes until src1 reaches a word
   boundary.  src1 is rounded down and its word loaded once; src2 is
   moved back by the same number of bytes so equal offsets line up. */
ands tmp1, src1, #3
beq .Lsrc1_aligned
sub src2, src2, tmp1
bic src1, src1, #3
/* Z set => the offset was 2; C set with Z clear => the offset was 3;
   otherwise it was 1. */
lsls tmp1, tmp1, #31
ldr data1, [src1], #4
beq .Laligned_m2
bcs .Laligned_m1
#if STRCMP_NO_PRECHECK == 1
ldrb data2, [src2, #1]
uxtb tmp1, data1, ror #BYTE1_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m1:
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
add src2, src2, #4
cbnz data2, .Lsrc1_aligned
#else /* STRCMP_NO_PRECHECK */
/* If we've done the pre-check, then we don't need to check the
first byte again here. */
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbz data2, .Lmisaligned_exit
.Laligned_m2:
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
bne .Lmisaligned_exit
cbnz data2, .Laligned_m1
#endif
.Lmisaligned_exit:
/* tmp1 holds the byte difference (zero if both strings ended).  Only r4
   is reloaded here: r5-r7 have not been written on any path that reaches
   this label. */
.cfi_remember_state
mov result, tmp1
ldr r4, [sp], #16
.cfi_restore 4
bx lr
#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
add src2, src2, #4
#endif
.Lsrc1_aligned:
.cfi_restore_state
/* src1 is word aligned, but src2 has no common alignment
with it. */
ldr data1, [src1], #4
lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
bic src2, src2, #3
ldr data2, [src2], #4
bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
/* The three .LoverlapN loops share one shape; N is the number of bytes of
   each data1 word that are matched against the current data2 word, the
   rest being matched against the next data2 word.  Local labels:
     4:  mismatch within the first N bytes
     5:  data1 contains a NUL and its first N bytes matched
     6:  mismatch within the remaining 4 - N bytes */
.Loverlap3:
bic tmp1, data1, #MSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #8
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #24
bne 6f
ldr data1, [src1], #4
b .Loverlap3
4:
S2LO data2, data2, #8
b .Lstrcmp_tail
5:
bics syndrome, syndrome, #MSB
bne .Lstrcmp_done_equal
/* We can only get here if the MSB of data1 contains 0, so
fast-path the exit. */
ldrb result, [src2]
.cfi_remember_state
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
/* R6/7 Not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
neg result, result
bx lr
6:
.cfi_restore_state
S2LO data1, data1, #24
and data2, data2, #LSB
b .Lstrcmp_tail
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
.Loverlap2:
and tmp1, data1, const_m1, S2LO #16
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #16
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #16
bne 6f
ldr data1, [src1], #4
b .Loverlap2
4:
S2LO data2, data2, #16
b .Lstrcmp_tail
5:
ands syndrome, syndrome, const_m1, S2LO #16
bne .Lstrcmp_done_equal
ldrh data2, [src2]
S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
lsl data2, data2, #16
#endif
b .Lstrcmp_tail
6:
S2LO data1, data1, #16
and data2, data2, const_m1, S2LO #16
b .Lstrcmp_tail
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
.Loverlap1:
and tmp1, data1, #LSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #24
sel syndrome, syndrome, const_m1
bne 4f
cbnz syndrome, 5f
ldr data2, [src2], #4
eor tmp1, tmp1, data1
cmp tmp1, data2, S2HI #8
bne 6f
ldr data1, [src1], #4
b .Loverlap1
4:
S2LO data2, data2, #24
b .Lstrcmp_tail
5:
tst syndrome, #LSB
bne .Lstrcmp_done_equal
ldr data2, [src2]
6:
S2LO data1, data1, #8
bic data2, data2, #MSB
b .Lstrcmp_tail
.Lstrcmp_done_equal:
mov result, #0
.cfi_remember_state
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
/* R6/7 not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
bx lr
.Lstrcmp_tail:
/* Common exit for the overlap paths.  data1 and data2 hold the remaining
   bytes to compare, lined up with each other. */
.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
rev data1, data1
rev data2, data2
/* Now everything looks big-endian... */
#endif
uadd8 tmp1, data1, const_m1
eor tmp1, data1, data2
sel syndrome, tmp1, const_m1
/* Shift both words so that the bits from the first flagged bit onwards
   occupy the top byte, then subtract those top bytes. */
clz tmp1, syndrome
lsl data1, data1, tmp1
lsl data2, data2, tmp1
lsr result, data1, #24
ldrd r4, r5, [sp], #16
.cfi_restore 4
.cfi_restore 5
/* R6/7 not used in this sequence. */
.cfi_restore 6
.cfi_restore 7
sub result, result, data2, lsr #24
bx lr
.cfi_endproc
/* The size is measured from .Lstrcmp_start_addr, so it also covers the
   fast-path exit stub placed in front of the entry point. */
.size strcmp, . - .Lstrcmp_start_addr

View File

@ -0,0 +1,173 @@
/*
* Copyright (c) 2008 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* For GLIBC:
#include <string.h>
#include <memcopy.h>
#undef strcmp
*/
/* Operand text for the word-at-a-time NUL test in strcpy's inline asm.
   magic1 supplies 0x01010101 and magic2 supplies 0x80808080.
   Thumb-2 builds use immediate operands and ignore REG.  Other builds
   stringify REG, which strcpy preloads with 0x01010101, and form
   0x80808080 as "REG, lsl #7". */
#ifdef __thumb2__
#define magic1(REG) "#0x01010101"
#define magic2(REG) "#0x80808080"
#else
#define magic1(REG) #REG
#define magic2(REG) #REG ", lsl #7"
#endif
/* Copy the NUL-terminated string at src to dst, terminator included.

   The function is naked, so the asm below is its entire body.  It reads
   the destination from r0 and the source from r1.  r0 is never written,
   so it still holds dst when the code returns with BX LR.

   One of three bodies is selected at preprocessing time:
     - a word-at-a-time copy, unless the build asks for small code
       (__OPTIMIZE_SIZE__ or PREFER_SIZE_OVER_SPEED) or targets Thumb-1;
     - otherwise a byte loop using post-indexed loads and stores;
     - for Thumb-1, the same byte loop with explicit pointer increments.  */
char* __attribute__((naked))
strcpy (char* dst, const char* src)
{
asm (
#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
(defined (__thumb__) && !defined (__thumb2__)))
/* Word-at-a-time version.  ip is the running destination pointer.  */
"pld [r1, #0]\n\t"
"eor r2, r0, r1\n\t"
"mov ip, r0\n\t"
/* If src and dst differ in their low two address bits they can never be
   word-aligned at the same time: use the byte loop at 4.  */
"tst r2, #3\n\t"
"bne 4f\n\t"
/* Same offset within a word, but not on a word boundary yet: copy the
   leading bytes at 3 first.  */
"tst r1, #3\n\t"
"bne 3f\n"
/* Both pointers are word-aligned from here on.  */
"5:\n\t"
#ifndef __thumb2__
/* Save r5 and build 0x01010101 in it for magic1/magic2.  */
"str r5, [sp, #-4]!\n\t"
"mov r5, #0x01\n\t"
"orr r5, r5, r5, lsl #8\n\t"
"orr r5, r5, r5, lsl #16\n\t"
#endif
"str r4, [sp, #-4]!\n\t"
/* Load the first word.  If src is already 8-byte aligned (bit 2 clear)
   enter the two-word loop directly; ldr leaves the flags from tst alone.  */
"tst r1, #4\n\t"
"ldr r3, [r1], #4\n\t"
"beq 2f\n\t"
/* Otherwise handle one word on its own.  (r3 - 0x01010101) & ~r3 &
   0x80808080 is non-zero exactly when r3 contains a zero byte.  */
"sub r2, r3, "magic1(r5)"\n\t"
"bics r2, r2, r3\n\t"
"tst r2, "magic2(r5)"\n\t"
"itt eq\n\t"
"streq r3, [ip], #4\n\t"
"ldreq r3, [r1], #4\n"
"bne 1f\n\t"
/* Inner loop.  We now know that r1 is 64-bit aligned, so we can safely
   fetch up to two words.  This allows us to avoid load stalls.  On entry
   r3 holds a word that has been loaded but not yet tested or stored.  */
".p2align 2\n"
"2:\n\t"
"pld [r1, #8]\n\t"
"ldr r4, [r1], #4\n\t"
/* Test r3 for a zero byte.  The sub that starts the test of r4 sits
   between tst and bne; it does not set flags.  */
"sub r2, r3, "magic1(r5)"\n\t"
"bics r2, r2, r3\n\t"
"tst r2, "magic2(r5)"\n\t"
"sub r2, r4, "magic1(r5)"\n\t"
"bne 1f\n\t"
"str r3, [ip], #4\n\t"
/* Finish the test of r4.  No zero byte: fetch the next word, store r4
   and go round again.  Otherwise r4 becomes the word to finish with.  */
"bics r2, r2, r4\n\t"
"tst r2, "magic2(r5)"\n\t"
"itt eq\n\t"
"ldreq r3, [r1], #4\n\t"
"streq r4, [ip], #4\n\t"
"beq 2b\n\t"
"mov r3, r4\n"
/* r3 holds a word containing the terminator.  Store it one byte at a
   time, in memory order, stopping once the zero byte has been written.  */
"1:\n\t"
#ifdef __ARMEB__
/* Big-endian: rotate the next byte down into bits 7:0 before the store.  */
"rors r3, r3, #24\n\t"
#endif
"strb r3, [ip], #1\n\t"
"tst r3, #0xff\n\t"
#ifdef __ARMEL__
/* Little-endian: bring the following byte down.  ror (no s suffix)
   leaves the flags from tst intact for the bne.  */
"ror r3, r3, #8\n\t"
#endif
"bne 1b\n\t"
/* Restore the saved registers and return.  */
"ldr r4, [sp], #4\n\t"
#ifndef __thumb2__
"ldr r5, [sp], #4\n\t"
#endif
"BX LR\n"
/* Strings have the same offset from word alignment, but it's
   not zero.  */
"3:\n\t"
/* Odd address: copy one byte, returning at once if it was the NUL.  */
"tst r1, #1\n\t"
"beq 1f\n\t"
"ldrb r2, [r1], #1\n\t"
"strb r2, [ip], #1\n\t"
"cmp r2, #0\n\t"
"it eq\n"
"BXEQ LR\n"
"1:\n\t"
/* src is now halfword-aligned.  If bit 1 is clear it is word-aligned
   too; otherwise copy one halfword, checking both of its bytes.  */
"tst r1, #2\n\t"
"beq 5b\n\t"
"ldrh r2, [r1], #2\n\t"
#ifdef __ARMEB__
/* The first byte in memory is bits 15:8.  Non-zero: store the halfword
   and test the second byte.  Zero: shift it down and store only the NUL.  */
"tst r2, #0xff00\n\t"
"iteet ne\n\t"
"strneh r2, [ip], #2\n\t"
"lsreq r2, r2, #8\n\t"
"streqb r2, [ip]\n\t"
"tstne r2, #0xff\n\t"
#else
/* The first byte in memory is bits 7:0.  Non-zero: store the halfword
   and test the second byte.  Zero: store only the NUL.  */
"tst r2, #0xff\n\t"
"itet ne\n\t"
"strneh r2, [ip], #2\n\t"
"streqb r2, [ip]\n\t"
"tstne r2, #0xff00\n\t"
#endif
/* NE here means both bytes were non-zero: continue with whole words.  */
"bne 5b\n\t"
"BX LR\n"
/* src and dst do not have a common word-alignment.  Fall back to
   byte copying.  */
"4:\n\t"
"ldrb r2, [r1], #1\n\t"
"strb r2, [ip], #1\n\t"
"cmp r2, #0\n\t"
"bne 4b\n\t"
"BX LR"
#elif !defined (__thumb__) || defined (__thumb2__)
/* Compact version: a byte loop with post-indexed addressing.  r3 is the
   running destination pointer.  */
"mov r3, r0\n\t"
"1:\n\t"
"ldrb r2, [r1], #1\n\t"
"strb r2, [r3], #1\n\t"
"cmp r2, #0\n\t"
"bne 1b\n\t"
"BX LR"
#else
/* Thumb-1 version: the same byte loop with the pointer increments
   written as separate add instructions.  */
"mov r3, r0\n\t"
"1:\n\t"
"ldrb r2, [r1]\n\t"
"add r1, r1, #1\n\t"
"strb r2, [r3]\n\t"
"add r3, r3, #1\n\t"
"cmp r2, #0\n\t"
"bne 1b\n\t"
"BX LR"
#endif
);
}
/* For GLIBC: libc_hidden_builtin_def (strcpy) */

View File

@ -0,0 +1,150 @@
/* Copyright (c) 2010-2011,2013 Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Linaro Limited nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
Assumes:
ARMv6T2, AArch32
*/
/* def_fn f [p2align=N]: start a global function named f in .text,
   aligned to 2^N bytes (N defaults to 0), mark the symbol as a function
   and emit its label.  */
.macro def_fn f p2align=0
.text
.p2align \p2align
.global \f
.type \f, %function
\f:
.endm
/* Endian-neutral shift mnemonics.  S2HI moves data towards the bytes
   that live at higher addresses (lsl on little-endian, lsr on
   big-endian); S2LO moves it the other way.  */
#ifdef __ARMEB__
#define S2LO lsl
#define S2HI lsr
#else
#define S2LO lsr
#define S2HI lsl
#endif
/* This code requires Thumb. */
.thumb
.syntax unified
/* Parameters and result.  Both live in r0: the incoming pointer is
   consumed before the running length is first written.  */
#define srcin r0
#define result r0
/* Internal variables. */
#define src r1 /* 8-byte aligned read pointer */
#define data1a r2 /* First word of the current double-word */
#define data1b r3 /* Second word of the current double-word */
#define const_m1 r12 /* 0xffffffff */
#define const_0 r4 /* 0 */
#define tmp1 r4 /* Overlaps const_0 */
#define tmp2 r5
/* strlen
   In:  srcin (r0) = address of a NUL-terminated string.
   Out: result (r0) = number of bytes before the first NUL.
   r4 and r5 are saved on the stack and restored before returning;
   r1-r3, r12 and the flags are clobbered.

   The string is scanned with 8-byte aligned ldrd loads, four
   double-words per loop iteration.  Loads start at srcin rounded down
   to an 8-byte boundary and always fetch the whole double-word, so
   bytes just before the string and just after its terminator are read
   (but never beyond the aligned double-word holding the terminator).  */
def_fn strlen p2align=6
pld [srcin, #0]
strd r4, r5, [sp, #-8]!
bic src, srcin, #7
mvn const_m1, #0
ands tmp1, srcin, #7 /* tmp1 = offset of srcin inside its double-word. */
pld [src, #32]
bne.w .Lmisaligned8
mov const_0, #0
mov result, #-8 /* Cancelled by the first add below. */
.Lloop_aligned:
/* Bytes 0-7. */
ldrd data1a, data1b, [src]
pld [src, #64]
add result, result, #8
.Lstart_realigned:
/* uadd8 adds 0xff to every byte; GE[i] receives the carry out of byte
   i, which is set exactly when that byte was non-zero.  The first sel
   turns that into a mask: 0xff where the byte was zero, 0x00 elsewhere.
   The second sel builds the same mask for data1b but copies data1a's
   mask byte into every position that was non-zero, so data1b != 0 means
   "a NUL somewhere in these eight bytes".  */
uadd8 data1a, data1a, const_m1 /* GE[i] set iff byte i != 0. */
sel data1a, const_0, const_m1 /* 0xff in each byte that was zero. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cbnz data1b, .Lnull_found
/* Bytes 8-15. */
ldrd data1a, data1b, [src, #8]
uadd8 data1a, data1a, const_m1 /* GE[i] set iff byte i != 0. */
add result, result, #8
sel data1a, const_0, const_m1 /* 0xff in each byte that was zero. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cbnz data1b, .Lnull_found
/* Bytes 16-23. */
ldrd data1a, data1b, [src, #16]
uadd8 data1a, data1a, const_m1 /* GE[i] set iff byte i != 0. */
add result, result, #8
sel data1a, const_0, const_m1 /* 0xff in each byte that was zero. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cbnz data1b, .Lnull_found
/* Bytes 24-31. */
ldrd data1a, data1b, [src, #24]
add src, src, #32
uadd8 data1a, data1a, const_m1 /* GE[i] set iff byte i != 0. */
add result, result, #8
sel data1a, const_0, const_m1 /* 0xff in each byte that was zero. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cmp data1b, #0
beq .Lloop_aligned
.Lnull_found:
/* data1a/data1b describe the double-word that holds the terminator and
   result is the offset of that double-word from srcin (negative for the
   first double-word of a misaligned string).  */
cmp data1a, #0
itt eq
addeq result, result, #4 /* No NUL in the first word: skip it... */
moveq data1a, data1b /* ...and decode the second word's mask. */
#ifndef __ARMEB__
rev data1a, data1a /* Put the lowest-addressed byte on top. */
#endif
clz data1a, data1a /* 8 * index of the first 0xff mask byte. */
ldrd r4, r5, [sp], #8
add result, result, data1a, lsr #3 /* Bits -> Bytes. */
bx lr
.Lmisaligned8:
/* srcin is not 8-byte aligned.  Load the enclosing aligned double-word
   and force the bytes that precede srcin to 0xff so that they cannot be
   taken for the terminator, then join the main loop.  */
ldrd data1a, data1b, [src]
and tmp2, tmp1, #3
rsb result, tmp1, #0 /* Start at minus the number of bytes skipped. */
lsl tmp2, tmp2, #3 /* Bytes -> bits. */
tst tmp1, #4 /* NE: srcin lies in the second word. */
pld [src, #64]
S2HI tmp2, const_m1, tmp2 /* All-ones with the skipped bytes cleared. */
orn data1a, data1a, tmp2 /* Set the skipped bytes to 0xff. */
itt ne
ornne data1b, data1b, tmp2
movne data1a, const_m1 /* The whole first word precedes srcin. */
mov const_0, #0 /* tmp1 is dead; r4 becomes const_0. */
b .Lstart_realigned
.size strlen, . - strlen

View File

@ -0,0 +1,318 @@
/*
* Copyright (c) 2014 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* An executable stack is *not* required for these functions. */
.section .note.GNU-stack,"",%progbits
.previous
/* EABI build attribute 25 set to 1.  NOTE(review): tag 25 is believed to
   be Tag_ABI_align_preserved (code keeps 8-byte stack alignment) --
   confirm against the ARM ABI addenda.  */
.eabi_attribute 25, 1
/* ANSI concatenation macros.  CONCAT1 goes through CONCAT2 so that its
   arguments are macro-expanded before they are pasted together.  */
#define CONCAT1(a, b) CONCAT2(a, b)
#define CONCAT2(a, b) a ## b
/* Use the right prefix for global labels. */
#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
/* Emit the .type and .size directives for the global symbol x. */
#define TYPE(x) .type SYM(x),function
#define SIZE(x) .size SYM(x), . - SYM(x)
/* Label internal to this file: LSYM(Lfoo) expands to .Lfoo. */
#define LSYM(x) .x
/* cfi_start start_label, end_label: hand-written .debug_frame entry
   covering the code from start_label to end_label.  It emits a CIE
   followed by the head of an FDE; the FDE is closed by cfi_end, which
   defines LSYM(Lend_fde).  The labels are fixed names, so the pair can
   be expanded only once per file.  */
.macro cfi_start start_label, end_label
.pushsection .debug_frame
LSYM(Lstart_frame):
.4byte LSYM(Lend_cie) - LSYM(Lstart_cie) /* CIE length */
LSYM(Lstart_cie):
.4byte 0xffffffff /* CIE identifier */
.byte 0x1 /* CIE version */
.ascii "\0" /* Augmentation: empty string */
.uleb128 0x1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 0xe /* Return address column: r14 (lr) */
.byte 0xc /* DW_CFA_def_cfa ... */
.uleb128 0xd /* ... register r13 (sp) ... */
.uleb128 0x0 /* ... offset 0 */
.align 2
LSYM(Lend_cie):
.4byte LSYM(Lend_fde)-LSYM(Lstart_fde) /* FDE length */
LSYM(Lstart_fde):
.4byte LSYM(Lstart_frame) /* CIE this FDE belongs to */
.4byte \start_label /* First address covered */
.4byte \end_label-\start_label /* Number of bytes covered */
.popsection
.endm
/* cfi_end end_label: close the FDE opened by cfi_start by defining
   LSYM(Lend_fde) in .debug_frame, then define end_label at the current
   position in the code.  */
.macro cfi_end end_label
.pushsection .debug_frame
.align 2
LSYM(Lend_fde):
.popsection
\end_label:
.endm
/* THUMB_LDIV0: division-by-zero tail.  Calls __aeabi_idiv0 with r0 = 0
   and then returns to the original caller by popping the saved lr into
   pc; the r0 saved on entry comes back in r1.  The name and signed
   arguments are accepted but not used in the body.  */
.macro THUMB_LDIV0 name signed
push {r0, lr}
movs r0, #0
bl SYM(__aeabi_idiv0)
pop {r1, pc}
.endm
/* FUNC_END name: emit the .size directive for the symbol __name. */
.macro FUNC_END name
SIZE (__\name)
.endm
/* DIV_FUNC_END name signed: finish a division routine.  Emits the frame
   description for __name up to LSYM(Lend_div0), the shared
   division-by-zero tail at LSYM(Ldiv0), and the .size for __name.  */
.macro DIV_FUNC_END name signed
cfi_start __\name, LSYM(Lend_div0)
LSYM(Ldiv0):
THUMB_LDIV0 \name \signed
cfi_end LSYM(Lend_div0)
FUNC_END \name
.endm
/* THUMB_FUNC_START name: declare name (without the __ prefix that
   FUNC_START adds) as a global Thumb function and emit its label.  */
.macro THUMB_FUNC_START name
.globl SYM (\name)
TYPE (\name)
.thumb_func
SYM (\name):
.endm
/* FUNC_START name: begin the global Thumb function __name in .text,
   switching the assembler to Thumb mode and unified syntax first.  */
.macro FUNC_START name
.text
.globl SYM (__\name)
TYPE (__\name)
.align 0
.force_thumb
.thumb_func
.syntax unified
SYM (__\name):
.endm
/* FUNC_ALIAS new old: export __new as a second, Thumb-marked name for
   the function __old.  */
.macro FUNC_ALIAS new old
.globl SYM (__\new)
.thumb_set SYM (__\new), SYM (__\old)
.endm
/* Register aliases.  dividend and divisor are the incoming arguments;
   the division bodies below return the quotient in dividend and the
   remainder in divisor.  overdone and result name the same register.  */
work .req r4
dividend .req r0
divisor .req r1
overdone .req r2
result .req r2
curbit .req r3
/* ------------------------------------------------------------------------ */
/* Bodies of the division and modulo routines. */
/* ------------------------------------------------------------------------ */
/* BranchToDiv n, label: branch to label if (dividend >> n) < divisor,
   compared as unsigned values.  Clobbers curbit and the flags.  */
.macro BranchToDiv n, label
lsrs curbit, dividend, \n
cmp curbit, divisor
bcc \label
.endm
/* DoDiv n: one step of shift-and-subtract division.  If
   (dividend >> n) >= divisor, subtract divisor << n from dividend.
   Either way the carry at label 1 is the new quotient bit (clear when
   the step was skipped by bcc, set when the subtraction ran without a
   borrow) and adcs shifts it into the bottom of result.  The carry left
   behind by adcs is the bit shifted out of the top of result.
   Clobbers curbit and the flags.  */
.macro DoDiv n
lsrs curbit, dividend, \n
cmp curbit, divisor
bcc 1f
lsls curbit, divisor, \n
subs dividend, dividend, curbit
1: adcs result, result
.endm
/* THUMB1_Div_Positive: unsigned shift-and-subtract division.  __divsi3
   expands it for the case where neither operand has its sign bit set.
   In:  dividend (r0), divisor (r1).
   Out: quotient in dividend (r0), remainder in divisor (r1); control
        returns to the caller with bx lr.
   Clobbers result (r2), curbit (r3) and the flags.
   A zero divisor is routed to LSYM(Ldiv0).

   Every step invokes DoDiv with the exact spelling of its definition:
   not every assembler folds the case of macro names the way GNU as
   does.  */
.macro THUMB1_Div_Positive
        movs    result, #0
        /* Find the first group whose shifted dividend is still below the
           divisor and enter the unrolled steps there.  */
        BranchToDiv #1, LSYM(Lthumb1_div1)
        BranchToDiv #4, LSYM(Lthumb1_div4)
        BranchToDiv #8, LSYM(Lthumb1_div8)
        BranchToDiv #12, LSYM(Lthumb1_div12)
        BranchToDiv #16, LSYM(Lthumb1_div16)
LSYM(Lthumb1_div_large_positive):
        /* dividend >> 16 >= divisor.  Scale the divisor up by 8 bits (16 if
           it still fits) and seed result with 0xff000000 (0xffff0000).  The
           seed bits fall out of the top of result as quotient bits are
           shifted in; the bcs after the last step of the 15..8 group sees
           them and runs another round with the divisor scaled back down.  */
        movs    result, #0xff
        lsls    divisor, divisor, #8
        rev     result, result
        lsrs    curbit, dividend, #16
        cmp     curbit, divisor
        bcc     1f
        asrs    result, #8
        lsls    divisor, divisor, #8
        /* Z set here means the divisor is zero.  */
        beq     LSYM(Ldivbyzero_waypoint)
1:      lsrs    curbit, dividend, #12
        cmp     curbit, divisor
        bcc     LSYM(Lthumb1_div12)
        b       LSYM(Lthumb1_div16)
LSYM(Lthumb1_div_loop):
        lsrs    divisor, divisor, #8
LSYM(Lthumb1_div16):
        DoDiv   #15
        DoDiv   #14
        DoDiv   #13
        DoDiv   #12
LSYM(Lthumb1_div12):
        DoDiv   #11
        DoDiv   #10
        DoDiv   #9
        DoDiv   #8
        /* Carry is the bit just shifted out of the top of result.  If it is
           one of the seed bits, go round again.  */
        bcs     LSYM(Lthumb1_div_loop)
LSYM(Lthumb1_div8):
        DoDiv   #7
        DoDiv   #6
        DoDiv   #5
LSYM(Lthumb1_div5):
        DoDiv   #4
LSYM(Lthumb1_div4):
        DoDiv   #3
LSYM(Lthumb1_div3):
        DoDiv   #2
LSYM(Lthumb1_div2):
        DoDiv   #1
LSYM(Lthumb1_div1):
        /* Last quotient bit.  divisor receives the remainder: the
           difference if the subtraction did not borrow, otherwise the
           dividend as it stood.  mov leaves the carry for adcs.  */
        subs    divisor, dividend, divisor
        bcs     1f
        mov     divisor, dividend
1:      adcs    result, result
        mov     dividend, result
        bx      lr
LSYM(Ldivbyzero_waypoint):
        /* Relay for the beq above.  */
        b       LSYM(Ldiv0)
.endm
/* THUMB1_Div_Negative: signed division for the case where at least one
   operand has its sign bit set (this is how __divsi3 selects it).  Both
   operands are replaced by their negation when negative, divided with
   the same DoDiv steps as the positive case, and the signs are applied
   to the results at the end.
   In:  dividend (r0), divisor (r1).
   Out: quotient in dividend (r0), remainder in divisor (r1); control
        returns to the caller with bx lr.
   Clobbers result (r2), curbit (r3), ip and the flags.  */
.macro THUMB1_Div_Negative
/* result = 1 if the divisor is negative, in which case negate it. */
lsrs result, divisor, #31
beq 1f
rsbs divisor, divisor, #0
/* curbit = the dividend's sign copied to all 32 bits; carry = that sign. */
1: asrs curbit, dividend, #32
bcc 2f
rsbs dividend, dividend, #0
/* Keep the signs in ip.  Bit 0 is set when the operand signs differ
   (negative quotient); bit 31 is set when the dividend was negative
   (negative remainder).  */
2: eors curbit, result
movs result, #0
mov ip, curbit
BranchToDiv #4, LSYM(Lthumb1_div_negative4)
BranchToDiv #8, LSYM(Lthumb1_div_negative8)
LSYM(Lthumb1_div_large):
/* dividend >> 8 >= divisor.  Scale the divisor up 6 bits at a time, up
   to four times, for as long as it stays at or below dividend >> 8.
   result is seeded with 0xfc000000 and the seed is sign-extended by 6
   bits per extra scaling step; the bcs after DoDiv #2 uses the seed bits
   falling out of the top of result to count the rounds.  */
movs result, #0xfc
lsls divisor, divisor, #6
rev result, result
lsrs curbit, dividend, #8
cmp curbit, divisor
bcc LSYM(Lthumb1_div_negative8)
lsls divisor, divisor, #6
asrs result, result, #6
cmp curbit, divisor
bcc LSYM(Lthumb1_div_negative8)
lsls divisor, divisor, #6
asrs result, result, #6
cmp curbit, divisor
bcc LSYM(Lthumb1_div_negative8)
lsls divisor, divisor, #6
/* Z set here means the divisor is zero. */
beq LSYM(Ldivbyzero_negative)
asrs result, result, #6
b LSYM(Lthumb1_div_negative8)
LSYM(Lthumb1_div_negative_loop):
lsrs divisor, divisor, #6
LSYM(Lthumb1_div_negative8):
DoDiv #7
DoDiv #6
DoDiv #5
DoDiv #4
LSYM(Lthumb1_div_negative4):
DoDiv #3
DoDiv #2
/* Carry is the bit just shifted out of the top of result.  If it is one
   of the seed bits, run another round with the divisor scaled back down
   by 6 bits.  */
bcs LSYM(Lthumb1_div_negative_loop)
DoDiv #1
/* Last quotient bit; divisor receives the remainder. */
subs divisor, dividend, divisor
bcs 1f
mov divisor, dividend
1: mov curbit, ip
adcs result, result
/* Carry = bit 0 of the saved signs (negate the quotient); N = dividend
   was negative (negate the remainder).  */
asrs curbit, curbit, #1
mov dividend, result
bcc 2f
rsbs dividend, dividend, #0
/* rsbs changed the flags; recompute N from curbit. */
cmp curbit, #0
2: bpl 3f
rsbs divisor, divisor, #0
3: bx lr
LSYM(Ldivbyzero_negative):
/* Division by zero.  The divisor is zero, so bit 0 of the saved signs is
   the dividend's sign: negate the dividend back if it was negated on
   entry.  Control then reaches LSYM(Ldiv0) either through the bcc or by
   falling out of the end of the macro, which in __divsi3 is followed
   directly by DIV_FUNC_END.  */
mov curbit, ip
asrs curbit, curbit, #1
bcc LSYM(Ldiv0)
rsbs dividend, dividend, #0
.endm
/* ------------------------------------------------------------------------ */
/* Start of the Real Functions */
/* ------------------------------------------------------------------------ */
/* __aeabi_idiv0: called by the division-by-zero tail (THUMB_LDIV0).
   This implementation does nothing and returns immediately.  */
FUNC_START aeabi_idiv0
bx lr
FUNC_END aeabi_idiv0
/* __divsi3, also exported as __aeabi_idiv: signed 32-bit division.
   In:  r0 = dividend, r1 = divisor.
   Out: r0 = quotient.  The division bodies also leave the remainder in
        r1.  r2, r3, ip and the flags are clobbered.  */
FUNC_START divsi3
FUNC_ALIAS aeabi_idiv divsi3
/* Entry used by __aeabi_idivmod after its own zero-divisor check. */
LSYM(divsi3_skip_div0_test):
/* Sign bit of (dividend | divisor) set: at least one operand is negative. */
mov curbit, dividend
orrs curbit, divisor
bmi LSYM(Lthumb1_div_negative)
LSYM(Lthumb1_div_positive):
THUMB1_Div_Positive
LSYM(Lthumb1_div_negative):
THUMB1_Div_Negative
/* Emits LSYM(Ldiv0), the division-by-zero tail, and the .size for
   __divsi3.  */
DIV_FUNC_END divsi3 signed
/* __aeabi_idivmod: signed division returning both results.
   In:  r0 = dividend, r1 = divisor.
   Out: r0 = quotient, r1 = remainder.
   A zero divisor goes to the division-by-zero tail at LSYM(Ldiv0).  */
FUNC_START aeabi_idivmod
cmp r1, #0
beq LSYM(Ldiv0)
/* Keep the operands and the return address across the call. */
push {r0, r1, lr}
bl LSYM(divsi3_skip_div0_test)
/* r1 = original dividend, r2 = original divisor, r3 = return address. */
POP {r1, r2, r3}
/* remainder = dividend - divisor * quotient. */
mul r2, r0
sub r1, r1, r2
bx r3
FUNC_END aeabi_idivmod
/* ------------------------------------------------------------------------ */

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2014 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Implementation of strcmp for ARMv6-M. This version is only used on
ARMv6-M when an efficient implementation is wanted. Otherwise, if
code size is preferred, strcmp-armv4t.S will be used. */
/* Mark the next symbol defined (strcmp) as a Thumb function, use unified
   syntax and restrict the assembler to the ARMv6-M instruction set.  */
.thumb_func
.syntax unified
.arch armv6-m
/* DoSub n, label: r0 = r0 - r1, then branch to label if that difference
   is non-zero or if any of the NUL flags in r4 that survive the shift by
   n are set (shifted left on little-endian, right on big-endian, which
   keeps the flags of the bytes that come first in memory).  r0 holds the
   difference when the branch is taken.  Clobbers r1 and the flags.  */
.macro DoSub n, label
subs r0, r0, r1
#ifdef __ARM_BIG_ENDIAN
lsrs r1, r4, \n
#else
lsls r1, r4, \n
#endif
orrs r1, r0
bne \label
.endm
/* Byte_Test n, label: compare r2 >> n with r3 >> n through DoSub.  Only
   the big-endian path of strcmp uses it, where the shift keeps the
   bytes that come first in memory.  Clobbers r0, r1 and the flags.  */
.macro Byte_Test n, label
lsrs r0, r2, \n
lsrs r1, r3, \n
DoSub \n, \label
.endm
/* strcmp
   In:  r0, r1 = addresses of the two NUL-terminated strings.
   Out: r0 is negative, zero or positive as the first string compares
        below, equal to or above the second, bytes being compared as
        unsigned values.
   r4-r6 are saved on the stack and restored; r1-r3 and the flags are
   clobbered.

   When both pointers are word-aligned the strings are compared a word
   at a time; otherwise a byte loop (unrolled twice) is used.  */
        .text
        .p2align 0
        .global strcmp
        .type strcmp, %function
strcmp:
        .cfi_startproc
        mov     r2, r0
        push    {r4, r5, r6, lr}
        /* Either pointer not word-aligned: use the byte loop at 6.  */
        orrs    r2, r1
        lsls    r2, r2, #30
        bne     6f
        /* r5 = 0x01010101 and r6 = 0x80808080 for the zero-byte test.  */
        ldr     r5, =0x01010101
        lsls    r6, r5, #7
1:
        /* Word loop.  r4 = (r2 - r5) & ~r2 & r6 is non-zero exactly when
           the word from the first string contains a NUL.  */
        ldmia   r0!, {r2}
        ldmia   r1!, {r3}
        subs    r4, r2, r5
        bics    r4, r2
        ands    r4, r6
        beq     3f
#ifdef __ARM_BIG_ENDIAN
        /* The test above is exact for the word as a whole, but a borrow
           can also flag a 0x01 byte that sits just above a zero byte.  On
           big-endian that byte comes earlier in memory, so a string ending
           in 0x01 would be cut short and could compare equal to a longer
           one.  Redo the test on the byte-reversed word, where any such
           spill lands on bytes after the NUL, and reverse the flags back.
           r0 is free to use: every path from here returns.  */
        rev     r0, r2
        subs    r4, r0, r5
        bics    r4, r0
        ands    r4, r6
        rev     r4, r4
        /* Compare the first byte, the first two and the first three,
           leaving through 4 with the difference in r0 as soon as they
           differ or a NUL has been reached.  */
        Byte_Test #24, 4f
        Byte_Test #16, 4f
        Byte_Test #8, 4f
        /* First three bytes are equal and non-zero: the word difference
           is the difference of the last bytes.  */
        b       7f
3:
        /* No NUL in r2.  Equal words: keep going.  Otherwise the unsigned
           word order is the string order.  */
        cmp     r2, r3
        beq     1b
        cmp     r2, r3
#else
        /* A NUL is in r2.  Compare the first byte, the first two, the
           first three and finally the last byte, leaving through 2 with
           the difference in r0 as soon as they differ or a NUL has been
           reached.  */
        uxtb    r0, r2
        uxtb    r1, r3
        DoSub   #24, 2f
        uxth    r0, r2
        uxth    r1, r3
        DoSub   #16, 2f
        lsls    r0, r2, #8
        lsls    r1, r3, #8
        lsrs    r0, r0, #8
        lsrs    r1, r1, #8
        DoSub   #8, 2f
        lsrs    r0, r2, #24
        lsrs    r1, r3, #24
        subs    r0, r0, r1
2:
        pop     {r4, r5, r6, pc}
3:
        /* No NUL in r2.  Equal words: keep going.  Otherwise byte-reverse
           both so that the unsigned word order is the string order.  */
        cmp     r2, r3
        beq     1b
        rev     r0, r2
        rev     r1, r3
        cmp     r0, r1
#endif
        /* Words differ: return -1 if the first is lower, else 1.  */
        bls     5f
        movs    r0, #1
4:
        pop     {r4, r5, r6, pc}
5:
        movs    r0, #0
        mvns    r0, r0
        pop     {r4, r5, r6, pc}
6:
        /* Byte loop, two bytes per iteration.  Leave as soon as the byte
           from the first string is NUL or the two bytes differ.  */
        ldrb    r2, [r0, #0]
        ldrb    r3, [r1, #0]
        adds    r0, #1
        adds    r1, #1
        cmp     r2, #0
        beq     7f
        cmp     r2, r3
        bne     7f
        ldrb    r2, [r0, #0]
        ldrb    r3, [r1, #0]
        adds    r0, #1
        adds    r1, #1
        cmp     r2, #0
        beq     7f
        cmp     r2, r3
        beq     6b
7:
        /* Return the difference of the last pair examined.  */
        subs    r0, r2, r3
        pop     {r4, r5, r6, pc}
        .cfi_endproc
        .size strcmp, . - strcmp