Import the Linaro Cortex Strings library into contrib.

Sponsored by: The FreeBSD Foundation
svn path=/head/; revision=305972
2016-09-19 13:12:09 +00:00 · 2016-09-19 13:12:09 +00:00 · 09a53ad8f1 · 2020-12-20 02:59:44 +00:00
commit 09a53ad8f1
parent bddfc749fa 5a194ab478
39 changed files with 7670 additions and 0 deletions
--- a/contrib/cortex-strings/.gitignore
+++ b/contrib/cortex-strings/.gitignore
@ -0,0 +1,11 @@
+*.a
+*.o
+*.la
+*.lo
+*.png
+*.pyc
+.deps
+.dirstamp
+.libs
+try-*
+cache.txt
--- a/contrib/cortex-strings/Makefile.am
+++ b/contrib/cortex-strings/Makefile.am
@ -0,0 +1,327 @@
+# Copyright (c) 2011, Linaro Limited
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the Linaro nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# Top level Makefile for cortex-strings
+
+# Used to record the compiler version in the executables
+COMPILER = $(shell $(CC) --version 2>&1 | head -n1)
+
+# The main library
+lib_LTLIBRARIES = \
+	libcortex-strings.la
+
+## Test suite
+check_PROGRAMS = \
+	tests/test-memchr \
+	tests/test-memcmp \
+	tests/test-memcpy \
+	tests/test-memmove \
+	tests/test-memset \
+	tests/test-strchr \
+	tests/test-strcmp \
+	tests/test-strcpy \
+	tests/test-strlen \
+	tests/test-strncmp \
+	tests/test-strnlen
+
+# Options for the tests.
+#
+# Every program listed in check_PROGRAMS needs its own LDADD/CFLAGS pair.
+# Without the LDADD line a test is linked only against the system C
+# library, so it would exercise the libc routine rather than the one
+# provided by libcortex-strings.la.
+tests_cflags = -I$(srcdir)/tests $(AM_CFLAGS)
+tests_ldadd = libcortex-strings.la
+tests_test_memchr_LDADD = $(tests_ldadd)
+tests_test_memchr_CFLAGS = $(tests_cflags)
+tests_test_memcmp_LDADD = $(tests_ldadd)
+tests_test_memcmp_CFLAGS = $(tests_cflags)
+tests_test_memcpy_LDADD = $(tests_ldadd)
+tests_test_memcpy_CFLAGS = $(tests_cflags)
+tests_test_memmove_LDADD = $(tests_ldadd)
+tests_test_memmove_CFLAGS = $(tests_cflags)
+tests_test_memset_LDADD = $(tests_ldadd)
+tests_test_memset_CFLAGS = $(tests_cflags)
+tests_test_strchr_LDADD = $(tests_ldadd)
+tests_test_strchr_CFLAGS = $(tests_cflags)
+tests_test_strcmp_LDADD = $(tests_ldadd)
+tests_test_strcmp_CFLAGS = $(tests_cflags)
+tests_test_strcpy_LDADD = $(tests_ldadd)
+tests_test_strcpy_CFLAGS = $(tests_cflags)
+tests_test_strlen_LDADD = $(tests_ldadd)
+tests_test_strlen_CFLAGS = $(tests_cflags)
+tests_test_strncmp_LDADD = $(tests_ldadd)
+tests_test_strncmp_CFLAGS = $(tests_cflags)
+tests_test_strnlen_LDADD = $(tests_ldadd)
+tests_test_strnlen_CFLAGS = $(tests_cflags)
+
+TESTS = $(check_PROGRAMS)
+
+## Benchmarks
+noinst_PROGRAMS = \
+	dhry \
+	dhry-native \
+	try-none \
+	try-this \
+	try-plain \
+	try-newlib-c \
+	try-bionic-c \
+	try-glibc-c
+
+# Good ol' Dhrystone
+dhry_SOURCES = \
+	benchmarks/dhry/dhry_1.c \
+	benchmarks/dhry/dhry_2.c \
+	benchmarks/dhry/dhry.h
+
+dhry_CFLAGS = -Dcompiler="\"$(COMPILER)\"" -Doptions="\"$(CFLAGS)\""
+dhry_LDADD = libcortex-strings.la
+
+dhry_native_SOURCES = $(dhry_SOURCES)
+dhry_native_CFLAGS = $(dhry_CFLAGS)
+
+# Benchmark harness
+noinst_LIBRARIES = \
+	libmulti.a \
+	libbionic-c.a \
+	libglibc-c.a \
+	libnewlib-c.a \
+	libplain.a
+
+libmulti_a_SOURCES = \
+	benchmarks/multi/harness.c
+
+libmulti_a_CFLAGS = -DVERSION=\"$(VERSION)\" $(AM_CFLAGS)
+
+## Other architecture independent implementations
+libbionic_c_a_SOURCES = \
+	reference/bionic-c/bcopy.c \
+	reference/bionic-c/memchr.c \
+	reference/bionic-c/memcmp.c \
+	reference/bionic-c/memcpy.c \
+	reference/bionic-c/memset.c \
+	reference/bionic-c/strchr.c \
+	reference/bionic-c/strcmp.c \
+	reference/bionic-c/strcpy.c \
+	reference/bionic-c/strlen.c
+
+libglibc_c_a_SOURCES = \
+	reference/glibc-c/memchr.c \
+	reference/glibc-c/memcmp.c \
+	reference/glibc-c/memcpy.c \
+	reference/glibc-c/memset.c \
+	reference/glibc-c/strchr.c \
+	reference/glibc-c/strcmp.c \
+	reference/glibc-c/strcpy.c \
+	reference/glibc-c/strlen.c \
+	reference/glibc-c/wordcopy.c \
+	reference/glibc-c/memcopy.h \
+	reference/glibc-c/pagecopy.h
+
+libnewlib_c_a_SOURCES = \
+	reference/newlib-c/memchr.c \
+	reference/newlib-c/memcmp.c \
+	reference/newlib-c/memcpy.c \
+	reference/newlib-c/memset.c \
+	reference/newlib-c/strchr.c \
+	reference/newlib-c/strcmp.c \
+	reference/newlib-c/strcpy.c \
+	reference/newlib-c/strlen.c \
+	reference/newlib-c/shim.h
+
+libplain_a_SOURCES = \
+	reference/plain/memset.c \
+	reference/plain/memcpy.c \
+	reference/plain/strcmp.c \
+	reference/plain/strcpy.c
+
+try_none_SOURCES =
+try_none_LDADD = libmulti.a -lrt
+try_this_SOURCES =
+try_this_LDADD = libmulti.a libcortex-strings.la -lrt
+try_bionic_c_SOURCES =
+try_bionic_c_LDADD = libmulti.a libbionic-c.a -lrt
+try_glibc_c_SOURCES =
+try_glibc_c_LDADD = libmulti.a libglibc-c.a -lrt
+try_newlib_c_SOURCES =
+try_newlib_c_LDADD = libmulti.a libnewlib-c.a -lrt
+try_plain_SOURCES =
+try_plain_LDADD = libmulti.a libplain.a -lrt
+
+# Architecture specific
+
+if HOST_AARCH32
+
+if WITH_NEON
+# Pull in the NEON specific files
+neon_bionic_a9_sources = \
+	reference/bionic-a9/memcpy.S \
+	reference/bionic-a9/memset.S
+neon_bionic_a15_sources = \
+	reference/bionic-a15/memcpy.S \
+	reference/bionic-a15/memset.S
+fpu_flags = -mfpu=neon
+else
+if WITH_VFP
+fpu_flags = -mfpu=vfp
+else
+fpu_flags = -msoft-float
+endif
+endif
+
+# Benchmarks and example programs
+noinst_PROGRAMS += \
+	try-bionic-a9 \
+	try-bionic-a15 \
+	try-csl \
+	try-glibc \
+	try-newlib \
+	try-newlib-xscale
+
+# Libraries used in the benchmarks and examples
+noinst_LIBRARIES += \
+	libbionic-a9.a \
+	libbionic-a15.a \
+	libcsl.a \
+	libglibc.a \
+	libnewlib.a \
+	libnewlib-xscale.a
+
+# Main library
+libcortex_strings_la_SOURCES = \
+	src/thumb-2/strcpy.c \
+	src/arm/memchr.S \
+	src/arm/strchr.S \
+	src/thumb-2/strlen.S \
+	src/arm/memset.S \
+	src/arm/memcpy.S \
+	src/arm/strcmp.S
+
+# Libraries containing the different reference versions
+libbionic_a9_a_SOURCES = \
+	$(neon_bionic_a9_sources) \
+	reference/bionic-a9/memcmp.S \
+	reference/bionic-a9/strcmp.S \
+	reference/bionic-a9/strcpy.S \
+	reference/bionic-a9/strlen.c
+
+libbionic_a9_a_CFLAGS = -Wa,-mimplicit-it=thumb
+
+libbionic_a15_a_SOURCES = \
+	$(neon_bionic_a15_sources) \
+	reference/bionic-a15/memcmp.S \
+	reference/bionic-a15/strcmp.S \
+	reference/bionic-a15/strcpy.S \
+	reference/bionic-a15/strlen.c
+
+libbionic_a15_a_CFLAGS = -Wa,-mimplicit-it=thumb
+
+libcsl_a_SOURCES = \
+	reference/csl/memcpy.c \
+	reference/csl/memset.c \
+	reference/csl/arm_asm.h
+
+libglibc_a_SOURCES = \
+	reference/glibc/memcpy.S \
+	reference/glibc/memset.S \
+	reference/glibc/strchr.S \
+	reference/glibc/strlen.S
+
+libnewlib_a_SOURCES = \
+	reference/newlib/memcpy.S \
+	reference/newlib/strcmp.S \
+	reference/newlib/strcpy.c \
+	reference/newlib/strlen.c \
+	reference/newlib/arm_asm.h \
+	reference/newlib/shim.h
+
+libnewlib_xscale_a_SOURCES = \
+	reference/newlib-xscale/memchr.c \
+	reference/newlib-xscale/memcpy.c \
+	reference/newlib-xscale/memset.c \
+	reference/newlib-xscale/strchr.c \
+	reference/newlib-xscale/strcmp.c \
+	reference/newlib-xscale/strcpy.c \
+	reference/newlib-xscale/strlen.c \
+	reference/newlib-xscale/xscale.h
+
+# Flags for the benchmark helpers
+try_bionic_a9_SOURCES =
+try_bionic_a9_LDADD = libmulti.a libbionic-a9.a -lrt
+try_bionic_a15_SOURCES =
+try_bionic_a15_LDADD = libmulti.a libbionic-a15.a -lrt
+try_csl_SOURCES =
+try_csl_LDADD = libmulti.a libcsl.a -lrt
+try_glibc_SOURCES =
+try_glibc_LDADD = libmulti.a libglibc.a -lrt
+try_newlib_SOURCES =
+try_newlib_LDADD = libmulti.a libnewlib.a -lrt
+try_newlib_xscale_SOURCES =
+try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt
+
+AM_CPPFLAGS = $(fpu_flags)
+AM_LDFLAGS = $(fpu_flags)
+
+endif
+
+# aarch64 specific
+if HOST_AARCH64
+
+libcortex_strings_la_SOURCES = \
+	src/aarch64/memchr.S \
+	src/aarch64/memcmp.S \
+	src/aarch64/memcpy.S \
+	src/aarch64/memmove.S \
+	src/aarch64/memset.S \
+	src/aarch64/strchr.S \
+	src/aarch64/strchrnul.S \
+	src/aarch64/strcmp.S \
+	src/aarch64/strcpy.S \
+	src/aarch64/strlen.S \
+	src/aarch64/strncmp.S \
+	src/aarch64/strnlen.S
+
+endif
+
+libcortex_strings_la_LDFLAGS = -version-info 1:0:0
+
+AM_CFLAGS = \
+	-std=gnu99 -Wall \
+	-fno-builtin -fno-stack-protector -U_FORTIFY_SOURCE \
+	$(AM_CPPFLAGS)
+
+if WITH_SUBMACHINE
+AM_CFLAGS += \
+	-mtune=$(submachine)
+endif
+
+EXTRA_DIST = \
+	tests/hp-timing.h \
+	tests/test-string.h \
+	tests/test-skeleton.c \
+	scripts/add-license.sh \
+	scripts/bench.py \
+	scripts/fixup.py \
+	scripts/libplot.py \
+	scripts/plot-align.py \
+	scripts/plot.py \
+	scripts/plot-sizes.py \
+	scripts/plot-top.py \
+	scripts/trim.sh \
+	autogen.sh
--- a/contrib/cortex-strings/README
+++ b/contrib/cortex-strings/README
@ -0,0 +1,111 @@
+= Cortex-A String Routines =
+
+This package contains optimised string routines including memcpy(), memset(),
+strcpy(), strlen() for the ARM Cortex-A series of cores.
+
+Various implementations of these routines are provided, including generic
+implementations for ARMv7-A cores with/without Neon, Thumb2 implementations
+and generic implementations for cores supporting AArch64.
+
+== Getting started ==
+First configure and then install libcortex-strings.so.  To make other
+applications use this library, either add -lcortex-strings to the link
+command or use LD_PRELOAD to load the library into existing applications.
+
+Our intent is to get these routines into the common C libraries such
+as GLIBC, Bionic, and Newlib.  Your system may already include them!
+
+== Contents ==
+ * src/  contains the routines themselves
+ * tests/  contains the unit tests
+ * reference/  contains reference copies of other ARM-focused
+   implementations gathered from around the Internet
+ * benchmarks/  contains various benchmarks, tools, and scripts used to
+   check and report on the different implementations.
+
+The src directory contains different variants organised by the
+implementation they run on and optional features used.  For example:
+  * src/thumb-2  contains generic non-NEON routines for AArch32 (with Thumb-2).
+  * src/arm  contains tuned routines for Cortex-A class processors.
+  * src/aarch64  contains generic routines for AArch64.
+  * src/thumb  contains generic routines for armv6-M (with Thumb).
+
+== Reference versions ==
+reference/ contains versions collected from various popular Open
+Source libraries.  These have been modified for use in benchmarking.
+Please refer to the individual files for any licensing terms.
+
+The routines were collected from the following releases:
+ * EGLIBC 2.13
+ * Newlib 1.19.0
+ * Bionic android-2.3.5_r1
+
+== Licensing ==
+All Linaro-authored routines are under the modified BSD license:
+
+Copyright (c) 2011, Linaro Limited
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Linaro nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+All ARM-authored routines are under the modified BSD license:
+
+Copyright (c) 2014 ARM Ltd
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Linaro nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+All third party routines are under a GPL compatible license.
+
+== Notes and Limitations ==
+Some of the implementations have been collected from other
+projects and have a variety of licenses and copyright holders.
+
+== Style ==
+Assembly code attempts to follow the GLIBC coding conventions.  They
+are:
+ * Copyright headers in C style comment blocks
+ * Instructions indented with one tab
+ * Operands indented with one tab
+ * Text is wrapped at 70 characters
+ * End of line comments are fine
--- a/contrib/cortex-strings/autogen.sh
+++ b/contrib/cortex-strings/autogen.sh
@ -0,0 +1,69 @@
+#!/bin/sh
+#
+# autogen.sh glue for hplip
+#
+# HPLIP used to have five or so different autotools trees.  Upstream
+# has reduced it to two.  Still, this script is capable of cleaning
+# just about any possible mess of autoconf files.
+#
+# BE CAREFUL with trees that are not completely automake-generated,
+# this script deletes all Makefile.in files it can find.
+#
+# Requires: automake 1.9, autoconf 2.57+
+# Conflicts: autoconf 2.13
+set -e
+
+# Refresh GNU autotools toolchain: delete every generated autotools
+# artefact so that the autoreconf run below recreates it from scratch.
+echo Cleaning autotools files...
+# The start directory is named explicitly because only GNU find falls
+# back to "." when no path operand is given.  xargs receives nothing but
+# the NUL-separated file names, so rm is not handed any extra operand.
+find . -type d -name autom4te.cache -print0 | xargs -0 rm -rf
+find . -type f \( -name missing -o -name install-sh -o -name mkinstalldirs \
+	-o -name depcomp -o -name ltmain.sh -o -name configure \
+	-o -name config.sub -o -name config.guess \
+	-o -name Makefile.in \) -print0 | xargs -0 rm -f
+
+echo Running autoreconf...
+autoreconf --force --install
+
+# For the Debian package build.  Everything inside the braces runs only
+# when a debian/ directory exists in the current directory.
+test -d debian && {
+	# link these in Debian builds
+	rm -f config.sub config.guess
+	ln -s /usr/share/misc/config.sub .
+	ln -s /usr/share/misc/config.guess .
+
+	# refresh list of executable scripts, to avoid possible breakage if
+	# upstream tarball does not include the file or if it is mispackaged
+	# for whatever reason.
+	[ "$1" = "updateexec" ] && {
+		echo Generating list of executable files...
+		rm -f debian/executable.files
+		# "-perm /111" matches files with any execute bit set.  The
+		# older "-perm +111" spelling is rejected as an invalid mode
+		# by current GNU findutils.
+		find . -type f -perm /111 ! -name '.*' -fprint debian/executable.files
+	}
+
+	# Remove any files in upstream tarball that we don't have in the Debian
+	# package (because diff cannot remove files)
+	version=`dpkg-parsechangelog | awk '/Version:/ { print $2 }' | sed -e 's/-[^-]\+$//'`
+	source=`dpkg-parsechangelog | awk '/Source:/ { print $2 }' | tr -d ' '`
+	if test -r ../${source}_${version}.orig.tar.gz ; then
+		echo Generating list of files that should be removed...
+		rm -f debian/deletable.files
+		touch debian/deletable.files
+		[ -e debian/tmp ] && rm -rf debian/tmp
+		mkdir debian/tmp
+		( cd debian/tmp ; tar -zxf ../../../${source}_${version}.orig.tar.gz )
+		# Unpack the pristine tarball and record every file in it that
+		# has no counterpart in the working tree.
+		find debian/tmp/ -type f ! -name '.*' -print0 | xargs -0 -ri echo '{}' | \
+		  while read -r i ; do
+			if test -e "${i}" ; then
+				filename=$(echo "${i}" | sed -e 's#.*debian/tmp/[^/]\+/##')
+				test -e "${filename}" || echo "${filename}" >>debian/deletable.files
+			fi
+		  done
+		rm -fr debian/tmp
+	else
+		echo Emptying list of files that should be deleted...
+		rm -f debian/deletable.files
+		touch debian/deletable.files
+	fi
+}
+
+exit 0
--- a/contrib/cortex-strings/benchmarks/dhry/dhry.h
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry.h
@ -0,0 +1,311 @@
+/*
+ **************************************************************************
+ *                       DHRYSTONE 2.1 BENCHMARK PC VERSION
+ **************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *                                                                            
+ *  Version:    C, Version 2.1
+ *                                                                            
+ *  File:       dhry.h (part 1 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *                      Siemens AG, AUT E 51
+ *                      Postfach 3220
+ *                      8520 Erlangen
+ *                      Germany (West)
+ *                              Phone:  [+49]-9131-7-20330
+ *                                      (8-17 Central European Time)
+ *                              Usenet: ..!mcsun!unido!estevax!weicker
+ *
+ *            Original Version (in Ada) published in
+ *            "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *            pp. 1013 - 1030, together with the statistics
+ *            on which the distribution of statements etc. is based.
+ *
+ *            In this C version, the following C library functions are used:
+ *            - strcpy, strcmp (inside the measurement loop)
+ *            - printf, scanf (outside the measurement loop)
+ *            In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *            are used for execution time measurement. For measurements
+ *            on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *              
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)               
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the
+ *      compiler; Dhrystone itself performs no OS calls in the measurement
+ *      loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ **************************************************************************
+ *
+ *  This version has changes made by Roy Longbottom to conform to a common
+ *  format for a series of standard benchmarks for PCs:
+ *
+ *  Running time greater than 5 seconds due to inaccuracy of the PC clock.
+ *
+ *  Automatic adjustment of run time, no manually inserted parameters.
+ *
+ *  Initial display of calibration times to confirm linearity.
+ *
+ *  Display of results within one screen (or at a slow speed as the test
+ *  progresses) so that it can be seen to have run successfully. 
+ *
+ *  Facilities to type in details of system used etc.
+ *
+ *  All results and details appended to a results file.
+ *
+ *
+ *  Roy Longbottom
+ *  101323.2241@compuserve.com
+ *
+ **************************************************************************
+ *
+ *  For details of history, changes, other defines, benchmark construction
+ *  statistics see official versions from ftp.nosc.mil/pub/aburto where
+ *  the latest table of results (dhry.tbl) are available. See also
+ *  netlib@ornl.gov
+ *
+ **************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *              -DREG=register          (default: Not defined)
+ *                      As an approximation to what an average C programmer
+ *                      might do, the "register" storage class is applied
+ *                      (if enabled by -DREG=register)
+ *                      - for local variables, if they are used (dynamically)
+ *                        five or more times
+ *                      - for parameters if they are used (dynamically)
+ *                        six or more times
+ *                      Note that an optimal "register" strategy is
+ *                      compiler-dependent, and that "register" declarations
+ *                      do not necessarily lead to faster execution.
+ *              -DNOSTRUCTASSIGN        (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      assignment of structures.
+ *              -DNOENUMS               (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      enumeration types.
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  This C version of Dhrystone consists of three files:
+ *  - dhry.h (this file, containing global definitions and comments)
+ *  - dhry_1.c (containing the code corresponding to Ada package Pack_1)
+ *  - dhry_2.c (containing the code corresponding to Ada package Pack_2)
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *                Examples of Pentium Results
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386 
+ * OptLevel             -otexan -zp8 -fp5 -5r
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ * 
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.     1600010
+ * Ptr_Glob->  
+ *   Ptr_Comp:       *  98008
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob-> 
+ *   Ptr_Comp:       *  98008 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Selected.
+ * 
+ * Microseconds 1 loop:          4.53
+ * Dhrystones / second:      220690
+ * VAX MIPS rating:            125.61
+ *
+ *
+ * Dhrystone Benchmark  Version 2.1 (Language: C)
+ *
+ * Month run            4/1996
+ * PC model             Escom
+ * CPU                  Pentium
+ * Clock MHz            100
+ * Cache                256K
+ * Options              Neptune chipset
+ * OS/DOS               Windows 95
+ * Compiler             Watcom C/ C++ 10.5 Win386 
+ * OptLevel                 No optimisation
+ * Run by               Roy Longbottom
+ * From                 UK
+ * Mail                 101323.2241@compuserve.com
+ * 
+ * Final values         (* implementation-dependent):
+ *
+ * Int_Glob:      O.K.  5
+ * Bool_Glob:     O.K.  1
+ * Ch_1_Glob:     O.K.  A
+ * Ch_2_Glob:     O.K.  B
+ * Arr_1_Glob[8]: O.K.  7
+ * Arr_2_Glob8/7: O.K.      320010
+ * Ptr_Glob->  
+ *   Ptr_Comp:       *  98004
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  2
+ *   Int_Comp:    O.K.  17
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Next_Ptr_Glob-> 
+ *   Ptr_Comp:       *  98004 same as above
+ *   Discr:       O.K.  0
+ *   Enum_Comp:   O.K.  1
+ *   Int_Comp:    O.K.  18
+ *   Str_Comp:    O.K.  DHRYSTONE PROGRAM, SOME STRING
+ * Int_1_Loc:     O.K.  5
+ * Int_2_Loc:     O.K.  13
+ * Int_3_Loc:     O.K.  7
+ * Enum_Loc:      O.K.  1
+ * Str_1_Loc:     O.K.  DHRYSTONE PROGRAM, 1'ST STRING
+ * Str_2_Loc:     O.K.  DHRYSTONE PROGRAM, 2'ND STRING
+ *
+ * Register option      Not selected.
+ *
+ * Microseconds 1 loop:         20.06
+ * Dhrystones / second:       49844
+ * VAX MIPS rating:             28.37
+ *
+ **************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+#ifndef TIME
+#define TIMES
+#endif
+                /* Use times(2) time function unless    */
+                /* explicitly defined otherwise         */
+
+#ifdef TIMES
+/* #include <sys/types.h> 
+   #include <sys/times.h> */
+                /* for "times" */
+#endif
+
+#define Mic_secs_Per_Second     1000000.0
+                /* Berkeley UNIX C returns process times in seconds/HZ */
+
+/*
+ * structassign(d, s): copy structure s into structure d.
+ *
+ * With -DNOSTRUCTASSIGN the copy is a memcpy of sizeof(d) bytes, for
+ * compilers that cannot assign structures; otherwise it is a plain
+ * structure assignment.  The arguments and the whole expansion are
+ * parenthesised so the macro behaves as a single expression whatever
+ * operators appear in the arguments or around the call.
+ */
+#ifdef  NOSTRUCTASSIGN
+#define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s)      ((d) = (s))
+#endif
+
+/*
+ * Enumeration: the five-valued identifier type Ident_1 .. Ident_5.
+ *
+ * For compilers without enum support the identifiers become integer
+ * macros 0..4 and Enumeration is a plain int.  The header comment of
+ * this file documents the switch as -DNOENUMS, while the code only
+ * tested NOENUM; both spellings are accepted so that the documented
+ * option takes effect.
+ */
+#if defined(NOENUM) || defined(NOENUMS)
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+  typedef int   Enumeration;
+#else
+  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
+                Enumeration;
+#endif
+        /* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+#include <stdio.h>
+#include <string.h>
+
+                /* for strcpy, strcmp */
+
+#define Null 0 
+                /* Value of a Null pointer */
+#define true  1
+#define false 0
+
+typedef int     One_Thirty;
+typedef int     One_Fifty;
+typedef char    Capital_Letter;
+typedef int     Boolean;
+typedef char    Str_30 [31];
+typedef int     Arr_1_Dim [50];
+typedef int     Arr_2_Dim [50] [50];
+
+/*
+ * Self-referential record type used by the benchmark.
+ *
+ * Rec_Type names the structure itself and Rec_Pointer a pointer to it.
+ * Each record holds a link to another record (Ptr_Comp), a discriminant
+ * (Discr) and a union with three alternative layouts.
+ * NOTE(review): Discr presumably selects the active union member, as in
+ * an Ada variant record; nothing in this header enforces that.
+ */
+typedef struct record 
+    {
+    struct record *Ptr_Comp;
+    Enumeration    Discr;
+    union {
+          /* Layout 1: enumeration, integer and a 31-byte string buffer
+             (the same size as Str_30). */
+          struct {
+                  Enumeration Enum_Comp;
+                  int         Int_Comp;
+                  char        Str_Comp [31];
+                  } var_1;
+          /* Layout 2: enumeration and a 31-byte string buffer. */
+          struct {
+                  Enumeration E_Comp_2;
+                  char        Str_2_Comp [31];
+                  } var_2;
+          /* Layout 3: two single characters. */
+          struct {
+                  char        Ch_1_Comp;
+                  char        Ch_2_Comp;
+                  } var_3;
+          } variant;
+      } Rec_Type, *Rec_Pointer;
+
+
+
--- a/contrib/cortex-strings/benchmarks/dhry/dhry_1.c
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry_1.c
@ -0,0 +1,778 @@
+/*
+ *************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_1.c (part 2 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ *************************************************************************
+ */
+
+ #include <time.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include "dhry.h"
+ /*COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER COMPILER*/
+               
+ /* Each configuration macro below selects the "compiler" and         */
+ /* "options" strings that main prints and writes to Dhry.txt.        */
+ #ifdef COW
+    #define compiler  "Watcom C/C++ 10.5 Win386"
+    #define options   "  -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNW
+    #define compiler  "Watcom C/C++ 10.5 Win386"
+    #define options   "   No optimisation"
+ #endif
+ #ifdef COD
+    #define compiler  "Watcom C/C++ 10.5 Dos4GW"
+    #define options   "  -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CND
+    #define compiler  "Watcom C/C++ 10.5 Dos4GW"
+    #define options   "   No optimisation"
+ #endif
+ #ifdef CONT
+    #define compiler  "Watcom C/C++ 10.5 Win32NT"
+    #define options   "  -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNNT
+    #define compiler  "Watcom C/C++ 10.5 Win32NT"
+    #define options   "   No optimisation"
+ #endif
+ #ifdef COO2
+    #define compiler  "Watcom C/C++ 10.5 OS/2-32"
+    #define options   "  -otexan -zp8 -5r -ms"
+ #endif
+ #ifdef CNO2
+    #define compiler  "Watcom C/C++ 10.5 OS/2-32"
+    #define options   "   No optimisation"
+ #endif
+
+ /* Fallback: keep the file compilable when none of the macros above  */
+ /* is defined and the build did not supply "compiler"/"options" on   */
+ /* the command line (e.g. -Dcompiler="\"cc\"").                      */
+ #ifndef compiler
+    #define compiler  "Unknown compiler"
+ #endif
+ #ifndef options
+    #define options   "Unknown options"
+ #endif
+ 
+
+/* Global Variables: */
+ 
+/* Benchmark state shared by the Proc_x / Func_x routines. main       */
+/* allocates the two records and checks the final values of all of    */
+/* these after the timed loop.                                        */
+Rec_Pointer     Ptr_Glob,
+                 Next_Ptr_Glob;
+int             Int_Glob;
+ Boolean         Bool_Glob;
+ char            Ch_1_Glob,
+                 Ch_2_Glob;
+ int             Arr_1_Glob [50];
+ int             Arr_2_Glob [50] [50];
+ /* 1: main prompts for system details before writing the results;    */
+ /* set to 0 when the first argument starts with 'N' or 'n'.          */
+ int             getinput = 1;
+
+ 
+ /* Line written to the results file; main overwrites it with the     */
+ /* "Not selected" text when ROPT is not defined.                     */
+ char Reg_Define[100] = "Register option      Selected.";
+ 
+ /* Func_1 is defined in dhry_2.c. */
+ Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
+                                           Capital_Letter Ch_2_Par_Val);
+   /* 
+   forward declaration necessary since Enumeration may not simply be int
+   */
+ 
+ /* REG: storage-class prefix for selected locals and parameters.     */
+ /* Defining ROPT at compile time turns it into "register".           */
+ #ifndef ROPT
+ #define REG
+         /* REG becomes defined as empty */
+         /* i.e. no register variables   */
+ #else
+ #define REG register
+ #endif
+
+ /* Prototypes of the benchmark procedures. Proc_1 .. Proc_5 are      */
+ /* defined later in this file; Proc_6 .. Proc_8 and Func_2 are       */
+ /* defined in dhry_2.c.                                              */
+ void Proc_1 (REG Rec_Pointer Ptr_Val_Par);
+ void Proc_2 (One_Fifty *Int_Par_Ref);
+ void Proc_3 (Rec_Pointer *Ptr_Ref_Par);
+ void Proc_4 (); 
+ void Proc_5 ();
+ void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par);
+ void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+                                              One_Fifty *Int_Par_Ref);
+ void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
+                               int Int_1_Par_Val, int Int_2_Par_Val);
+                               
+ Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);
+
+ 
+ /* variables for time measurement: */
+ 
+ /* Minimum run time, in seconds, below which main refuses to report */
+ /* a rating.                                                         */
+ #define Too_Small_Time 2
+                 /* Measurements should last at least 2 seconds */
+ 
+ /* Timestamps returned by dtime() around the timed loop, and their  */
+ /* difference (all in seconds).                                      */
+ double          Begin_Time,
+                 End_Time,
+                 User_Time;
+ 
+ /* Results derived from User_Time and the number of runs. */
+ double          Microseconds,
+                 Dhrystones_Per_Second,
+                 Vax_Mips;
+ 
+ /* end of variables for time measurement */
+ 
+ 
+ /*
+  * read_line: read one line of operator input from stdin into buf
+  * (capacity "size" bytes, including the terminator). The trailing
+  * newline is removed. Input longer than the buffer is truncated and
+  * the rest of the line is discarded, so it can neither overflow buf
+  * nor leak into the next prompt. On end of file with nothing read,
+  * buf is left as it was.
+  */
+ static void read_line (char *buf, size_t size)
+ {
+   size_t len;
+   int    ch;
+
+   if (fgets (buf, (int) size, stdin) == NULL)
+     return;
+   len = strlen (buf);
+   if (len > 0 && buf[len - 1] == '\n')
+     buf[len - 1] = '\0';
+   else
+     while ((ch = getchar ()) != EOF && ch != '\n')
+       ;   /* drop the unread remainder of an over-long line */
+ }
+
+
+ /*
+  * main: run the Dhrystone benchmark.
+  *
+  * An optional first argument starting with 'N' or 'n' disables the
+  * interactive questions. The timed loop is repeated with a growing
+  * number of runs (at most 10 attempts) until one attempt lasts more
+  * than 5 seconds. The final values of all benchmark variables are
+  * checked and printed; if the last attempt lasted at least
+  * Too_Small_Time seconds the rating is printed and everything is
+  * appended to Dhry.txt.
+  *
+  * Returns 0 on completion; exits with status 1 when Dhry.txt cannot
+  * be opened or the benchmark records cannot be allocated.
+  */
+ int main (int argc, char *argv[])
+ /*****/
+
+   /* main program, corresponds to procedures        */
+   /* Main and Proc_0 in the Ada version             */
+ {
+   double   dtime();
+
+         One_Fifty   Int_1_Loc;
+   REG   One_Fifty   Int_2_Loc;
+         One_Fifty   Int_3_Loc;
+   REG   char        Ch_Index;
+         Enumeration Enum_Loc;
+         Str_30      Str_1_Loc;
+         Str_30      Str_2_Loc;
+   REG   int         Run_Index;
+   REG   int         Number_Of_Runs;
+         int         endit, count = 10;
+         FILE        *Ap;
+         char        general[9][80] = {" "};
+
+   /* Initializations */
+    if (argc > 1)
+     {
+        switch (argv[1][0])
+         {
+             case 'N':
+                getinput = 0;
+                break;
+             case 'n':
+                getinput = 0;
+                break;
+         }
+      }
+
+   if ((Ap = fopen("Dhry.txt","a+")) == NULL)
+     {
+        printf("Can not open Dhry.txt\n\n");
+        printf("Press any key\n");
+        exit(1);
+     }
+
+/***********************************************************************
+ *         Change for compiler and optimisation used                   *
+ ***********************************************************************/
+
+   Next_Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
+   Ptr_Glob = (Rec_Pointer) malloc (sizeof (Rec_Type));
+   if (Next_Ptr_Glob == NULL || Ptr_Glob == NULL)
+     {
+        /* Without both records every later access would be invalid. */
+        printf("Can not allocate benchmark records\n\n");
+        fclose(Ap);
+        exit(1);
+     }
+
+   Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
+   Ptr_Glob->Discr                       = Ident_1;
+   Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
+   Ptr_Glob->variant.var_1.Int_Comp      = 40;
+   strcpy (Ptr_Glob->variant.var_1.Str_Comp,
+           "DHRYSTONE PROGRAM, SOME STRING");
+   strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+   Arr_2_Glob [8][7] = 10;
+         /* Was missing in published program. Without this statement,   */
+         /* Arr_2_Glob [8][7] would have an undefined value.            */
+         /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */
+         /* overflow may occur for this array element.                  */
+
+   printf ("\n");
+   printf ("Dhrystone Benchmark, Version 2.1 (Language: C or C++)\n");
+   printf ("\n");
+
+   if (getinput == 0)
+    {
+        printf ("No run time input data\n\n");
+    }
+   else
+    {
+         printf ("With run time input data\n\n");
+    }
+
+   printf ("Compiler        %s\n", compiler);
+   printf ("Optimisation    %s\n", options);
+   #ifdef ROPT
+       printf ("Register option selected\n\n");
+   #else
+       printf ("Register option not selected\n\n");
+       strcpy(Reg_Define, "Register option      Not selected.");
+   #endif
+
+ /*
+   if (Reg)
+   {
+     printf ("Program compiled with 'register' attribute\n");
+     printf ("\n");
+   }
+   else
+   {
+     printf ("Program compiled without 'register' attribute\n");
+     printf ("\n");
+   }
+
+   printf ("Please give the number of runs through the benchmark: ");
+   {
+     int n;
+     scanf ("%d", &n);
+     Number_Of_Runs = n;
+   }
+   printf ("\n");
+   printf ("Execution starts, %d runs through Dhrystone\n",
+                                                 Number_Of_Runs);
+ */
+
+   Number_Of_Runs = 5000;
+
+   do
+     {
+
+       Number_Of_Runs = Number_Of_Runs * 2;
+       count = count - 1;
+       Arr_2_Glob [8][7] = 10;
+
+       /***************/
+       /* Start timer */
+       /***************/
+
+       Begin_Time = dtime();
+
+       for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
+       {
+
+         Proc_5();
+         Proc_4();
+           /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+         Int_1_Loc = 2;
+         Int_2_Loc = 3;
+         strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+         Enum_Loc = Ident_2;
+         Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
+           /* Bool_Glob == 1 */
+         while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
+         {
+           Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+             /* Int_3_Loc == 7 */
+           Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+             /* Int_3_Loc == 7 */
+           Int_1_Loc += 1;
+         }   /* while */
+            /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+         Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+           /* Int_Glob == 5 */
+         Proc_1 (Ptr_Glob);
+         for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+                              /* loop body executed twice */
+         {
+           if (Enum_Loc == Func_1 (Ch_Index, 'C'))
+               /* then, not executed */
+             {
+               Proc_6 (Ident_1, &Enum_Loc);
+               strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+               Int_2_Loc = Run_Index;
+               Int_Glob = Run_Index;
+             }
+         }
+           /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+         Int_2_Loc = Int_2_Loc * Int_1_Loc;
+         Int_1_Loc = Int_2_Loc / Int_3_Loc;
+         Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+           /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+         Proc_2 (&Int_1_Loc);
+           /* Int_1_Loc == 5 */
+
+       }   /* loop "for Run_Index" */
+
+       /**************/
+       /* Stop timer */
+       /**************/
+
+       End_Time = dtime();
+       User_Time = End_Time - Begin_Time;
+
+       printf ("%12.0f runs %6.2f seconds \n",(double) Number_Of_Runs, User_Time);
+       if (User_Time > 5)
+         {
+             count = 0;
+         }
+       else
+         {
+             if (User_Time < 0.1)
+               {
+                  Number_Of_Runs = Number_Of_Runs * 5;
+               }
+         }
+     }   /* calibrate/run do while */
+   while (count >0);
+
+   printf ("\n");
+   printf ("Final values (* implementation-dependent):\n");
+   printf ("\n");
+   printf ("Int_Glob:      ");
+   if (Int_Glob == 5)  printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d  ", Int_Glob);
+
+   printf ("Bool_Glob:     ");
+   if (Bool_Glob == 1) printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d\n", Bool_Glob);
+
+   printf ("Ch_1_Glob:     ");
+   if (Ch_1_Glob == 'A')  printf ("O.K.  ");
+   else                   printf ("WRONG ");
+   printf ("%c  ", Ch_1_Glob);
+
+   printf ("Ch_2_Glob:     ");
+   if (Ch_2_Glob == 'B')  printf ("O.K.  ");
+   else                   printf ("WRONG ");
+   printf ("%c\n",  Ch_2_Glob);
+
+   printf ("Arr_1_Glob[8]: ");
+   if (Arr_1_Glob[8] == 7)  printf ("O.K.  ");
+   else                     printf ("WRONG ");
+   printf ("%d  ", Arr_1_Glob[8]);
+
+   printf ("Arr_2_Glob8/7: ");
+   if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+                          printf ("O.K.  ");
+   else                   printf ("WRONG ");
+   printf ("%10d\n", Arr_2_Glob[8][7]);
+
+   printf ("Ptr_Glob->            ");
+         /* %p: a pointer does not necessarily fit in an int */
+   printf ("  Ptr_Comp:       *    %p\n", (void *) Ptr_Glob->Ptr_Comp);
+
+   printf ("  Discr:       ");
+   if (Ptr_Glob->Discr == 0)  printf ("O.K.  ");
+   else                       printf ("WRONG ");
+   printf ("%d  ", Ptr_Glob->Discr);
+
+   printf ("Enum_Comp:     ");
+   if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+   printf ("  Int_Comp:    ");
+   if (Ptr_Glob->variant.var_1.Int_Comp == 17)  printf ("O.K.  ");
+   else                                         printf ("WRONG ");
+   printf ("%d ", Ptr_Glob->variant.var_1.Int_Comp);
+
+   printf ("Str_Comp:      ");
+   if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+                        "DHRYSTONE PROGRAM, SOME STRING") == 0)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+   printf ("Next_Ptr_Glob->       ");
+   printf ("  Ptr_Comp:       *    %p", (void *) Next_Ptr_Glob->Ptr_Comp);
+   printf (" same as above\n");
+
+   printf ("  Discr:       ");
+   if (Next_Ptr_Glob->Discr == 0)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d  ", Next_Ptr_Glob->Discr);
+
+   printf ("Enum_Comp:     ");
+   if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+   printf ("  Int_Comp:    ");
+   if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d ", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+   printf ("Str_Comp:      ");
+   if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+                        "DHRYSTONE PROGRAM, SOME STRING") == 0)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+   printf ("Int_1_Loc:     ");
+   if (Int_1_Loc == 5)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d  ", Int_1_Loc);
+
+   printf ("Int_2_Loc:     ");
+   if (Int_2_Loc == 13)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d\n", Int_2_Loc);
+
+   printf ("Int_3_Loc:     ");
+   if (Int_3_Loc == 7)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d  ", Int_3_Loc);
+
+   printf ("Enum_Loc:      ");
+   if (Enum_Loc == 1)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%d\n", Enum_Loc);
+
+   printf ("Str_1_Loc:                             ");
+   if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%s\n", Str_1_Loc);
+
+   printf ("Str_2_Loc:                             ");
+   if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+                        printf ("O.K.  ");
+   else                printf ("WRONG ");
+   printf ("%s\n", Str_2_Loc);
+
+   printf ("\n");
+
+
+   if (User_Time < Too_Small_Time)
+   {
+     printf ("Measured time too small to obtain meaningful results\n");
+     printf ("Please increase number of runs\n");
+     printf ("\n");
+     fclose (Ap);      /* nothing is written on this path */
+   }
+   else
+   {
+     Microseconds = User_Time * Mic_secs_Per_Second
+                         / (double) Number_Of_Runs;
+     Dhrystones_Per_Second = (double) Number_Of_Runs / User_Time;
+     Vax_Mips = Dhrystones_Per_Second / 1757.0;
+
+     printf ("Microseconds for one run through Dhrystone: ");
+     printf ("%12.2lf \n", Microseconds);
+     printf ("Dhrystones per Second:                      ");
+     printf ("%10.0lf \n", Dhrystones_Per_Second);
+     printf ("VAX  MIPS rating =                          ");
+     printf ("%12.2lf \n",Vax_Mips);
+     printf ("\n");
+
+/************************************************************************
+ *             Type details of hardware, software etc.                  *
+ ************************************************************************/
+
+   if (getinput == 1)
+     {
+        printf ("Enter the following which will be added with results to file DHRY.TXT\n");
+        printf ("When submitting a number of results you need only provide details once\n");
+        printf ("but a cross reference such as an abbreviated CPU type would be useful.\n");
+        printf ("You can kill (exit or close) the program now and no data will be added.\n\n");
+
+        /* Each answer is read with a bounded read; see read_line. */
+        printf ("PC Supplier/model     ? ");
+        read_line (general[1], sizeof general[1]);
+
+        printf ("CPU chip              ? ");
+        read_line (general[2], sizeof general[2]);
+
+        printf ("Clock MHz             ? ");
+        read_line (general[3], sizeof general[3]);
+
+        printf ("Cache size            ? ");
+        read_line (general[4], sizeof general[4]);
+
+        printf ("Chipset & H/W options ? ");
+        read_line (general[5], sizeof general[5]);
+
+        printf ("OS/DOS version        ? ");
+        read_line (general[6], sizeof general[6]);
+
+        printf ("Your name             ? ");
+        read_line (general[7], sizeof general[7]);
+
+        printf ("Company/Location      ? ");
+        read_line (general[8], sizeof general[8]);
+
+        printf ("E-mail address        ? ");
+        read_line (general[0], sizeof general[0]);
+     }
+/************************************************************************
+ *                Add results to output file Dhry.txt                   *
+ ************************************************************************/
+   fprintf (Ap, "-------------------- -----------------------------------"
+                         "\n");
+   fprintf (Ap, "Dhrystone Benchmark  Version 2.1 (Language: C++)\n\n");
+   fprintf (Ap, "PC model             %s\n", general[1]);
+   fprintf (Ap, "CPU                  %s\n", general[2]);
+   fprintf (Ap, "Clock MHz            %s\n", general[3]);
+   fprintf (Ap, "Cache                %s\n", general[4]);
+   fprintf (Ap, "Options              %s\n", general[5]);
+   fprintf (Ap, "OS/DOS               %s\n", general[6]);
+   fprintf (Ap, "Compiler             %s\n", compiler);
+   fprintf (Ap, "OptLevel             %s\n", options);
+   fprintf (Ap, "Run by               %s\n", general[7]);
+   fprintf (Ap, "From                 %s\n", general[8]);
+   fprintf (Ap, "Mail                 %s\n\n", general[0]);
+
+   fprintf (Ap, "Final values         (* implementation-dependent):\n");
+   fprintf (Ap, "\n");
+   fprintf (Ap, "Int_Glob:      ");
+   if (Int_Glob == 5)  fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Int_Glob);
+
+   fprintf (Ap, "Bool_Glob:     ");
+   if (Bool_Glob == 1) fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Bool_Glob);
+
+   fprintf (Ap, "Ch_1_Glob:     ");
+   if (Ch_1_Glob == 'A')  fprintf (Ap, "O.K.  ");
+   else                   fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%c\n", Ch_1_Glob);
+
+   fprintf (Ap, "Ch_2_Glob:     ");
+   if (Ch_2_Glob == 'B')  fprintf (Ap, "O.K.  ");
+   else                   fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%c\n",  Ch_2_Glob);
+
+   fprintf (Ap, "Arr_1_Glob[8]: ");
+   if (Arr_1_Glob[8] == 7)  fprintf (Ap, "O.K.  ");
+   else                     fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Arr_1_Glob[8]);
+
+   fprintf (Ap, "Arr_2_Glob8/7: ");
+   if (Arr_2_Glob[8][7] == Number_Of_Runs + 10)
+                          fprintf (Ap, "O.K.  ");
+   else                   fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%10d\n", Arr_2_Glob[8][7]);
+
+   fprintf (Ap, "Ptr_Glob->  \n");
+   fprintf (Ap, "  Ptr_Comp:       *  %p\n", (void *) Ptr_Glob->Ptr_Comp);
+
+   fprintf (Ap, "  Discr:       ");
+   if (Ptr_Glob->Discr == 0)  fprintf (Ap, "O.K.  ");
+   else                       fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Ptr_Glob->Discr);
+
+   fprintf (Ap, "  Enum_Comp:   ");
+   if (Ptr_Glob->variant.var_1.Enum_Comp == 2)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+
+   fprintf (Ap, "  Int_Comp:    ");
+   if (Ptr_Glob->variant.var_1.Int_Comp == 17)  fprintf (Ap, "O.K.  ");
+   else                                         fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Ptr_Glob->variant.var_1.Int_Comp);
+
+   fprintf (Ap, "  Str_Comp:    ");
+   if (strcmp(Ptr_Glob->variant.var_1.Str_Comp,
+                        "DHRYSTONE PROGRAM, SOME STRING") == 0)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%s\n", Ptr_Glob->variant.var_1.Str_Comp);
+
+   fprintf (Ap, "Next_Ptr_Glob-> \n");
+   fprintf (Ap, "  Ptr_Comp:       *  %p", (void *) Next_Ptr_Glob->Ptr_Comp);
+   fprintf (Ap, " same as above\n");
+
+   fprintf (Ap, "  Discr:       ");
+   if (Next_Ptr_Glob->Discr == 0)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Next_Ptr_Glob->Discr);
+
+   fprintf (Ap, "  Enum_Comp:   ");
+   if (Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+
+   fprintf (Ap, "  Int_Comp:    ");
+   if (Next_Ptr_Glob->variant.var_1.Int_Comp == 18)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
+
+   fprintf (Ap, "  Str_Comp:    ");
+   if (strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp,
+                        "DHRYSTONE PROGRAM, SOME STRING") == 0)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+
+   fprintf (Ap, "Int_1_Loc:     ");
+   if (Int_1_Loc == 5)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Int_1_Loc);
+
+   fprintf (Ap, "Int_2_Loc:     ");
+   if (Int_2_Loc == 13)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Int_2_Loc);
+
+   fprintf (Ap, "Int_3_Loc:     ");
+   if (Int_3_Loc == 7)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Int_3_Loc);
+
+   fprintf (Ap, "Enum_Loc:      ");
+   if (Enum_Loc == 1)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%d\n", Enum_Loc);
+
+   fprintf (Ap, "Str_1_Loc:     ");
+   if (strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%s\n", Str_1_Loc);
+
+   fprintf (Ap, "Str_2_Loc:     ");
+   if (strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)
+                        fprintf (Ap, "O.K.  ");
+   else                fprintf (Ap, "WRONG ");
+   fprintf (Ap, "%s\n", Str_2_Loc);
+
+
+   fprintf (Ap, "\n");
+   fprintf(Ap,"%s\n",Reg_Define);
+   fprintf (Ap, "\n");
+   fprintf(Ap,"Microseconds 1 loop:  %12.2lf\n",Microseconds);
+   fprintf(Ap,"Dhrystones / second:  %10.0lf\n",Dhrystones_Per_Second);
+   fprintf(Ap,"VAX MIPS rating:      %12.2lf\n\n",Vax_Mips);
+   fclose(Ap);
+   }
+
+    printf ("\n");
+    printf ("A new results file will have been created in the same directory as the\n");
+    printf (".EXE files if one did not already exist. If you made a mistake on input, \n");
+    printf ("you can use a text editor to correct it, delete the results or copy \n");
+    printf ("them to a different file name. If you intend to run multiple tests you\n");
+    printf ("you may wish to rename DHRY.TXT with a more informative title.\n\n");
+    printf ("Please submit feedback and results files as a posting in Section 12\n");
+    printf ("or to Roy_Longbottom@compuserve.com\n\n");
+
+    if (getinput == 1)
+     {
+        printf("Press any key to exit\n");
+        printf ("\nIf this is displayed you must close the window in the normal way\n");
+     }
+
+    return 0;
+ }
+ 
+ 
+ /* Proc_1: record-manipulation step of the benchmark. Copies        */
+ /* *Ptr_Glob into the record that Ptr_Val_Par links to, then        */
+ /* updates fields of both records through Proc_3, Proc_6 and        */
+ /* Proc_7. main calls it once per run with Ptr_Glob.                */
+ void Proc_1 (REG Rec_Pointer Ptr_Val_Par)
+ /******************/
+ 
+     /* executed once */
+ {
+   REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;  
+                                         /* == Next_Ptr_Glob */
+   /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+   /* corresponds to "rename" in Ada, "with" in Pascal           */
+   
+   structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+   Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+   Next_Record->variant.var_1.Int_Comp 
+         = Ptr_Val_Par->variant.var_1.Int_Comp;
+   Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+   Proc_3 (&Next_Record->Ptr_Comp);
+     /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp 
+                         == Ptr_Glob->Ptr_Comp */
+   if (Next_Record->Discr == Ident_1)
+     /* then, executed */
+   {
+     Next_Record->variant.var_1.Int_Comp = 6;
+     Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp, 
+            &Next_Record->variant.var_1.Enum_Comp);
+     Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+     /* Proc_7 stores 6 + 2 + 10 == 18 back into Int_Comp */
+     Proc_7 (Next_Record->variant.var_1.Int_Comp, 10, 
+            &Next_Record->variant.var_1.Int_Comp);
+   }
+   else /* not executed */
+     structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+ } /* Proc_1 */
+ 
+ 
+ /* Proc_2: when Ch_1_Glob is 'A', stores                            */
+ /* (*Int_Par_Ref + 10 - 1) - Int_Glob back through Int_Par_Ref.     */
+ void Proc_2 (One_Fifty *Int_Par_Ref)
+ /******************/
+     /* executed once */
+     /* *Int_Par_Ref == 1, becomes 5 (1 + 10 - 1 - Int_Glob, with  */
+     /* Int_Glob == 5 as left by Proc_8)                           */
+ 
+ {
+   One_Fifty  Int_Loc;
+   Enumeration   Enum_Loc;
+ 
+   Int_Loc = *Int_Par_Ref + 10;
+   do /* executed once */
+     if (Ch_1_Glob == 'A')
+       /* then, executed */
+     {
+       Int_Loc -= 1;
+       *Int_Par_Ref = Int_Loc - Int_Glob;
+       Enum_Loc = Ident_1;
+     } /* if */
+   /* NOTE(review): Enum_Loc is only assigned inside the if above; */
+   /* if Ch_1_Glob were not 'A' this test would read an            */
+   /* uninitialized variable.                                      */
+   while (Enum_Loc != Ident_1); /* loop ends once Enum_Loc == Ident_1 */
+ } /* Proc_2 */
+ 
+ 
+ /* Proc_3: stores Ptr_Glob->Ptr_Comp through Ptr_Ref_Par (when      */
+ /* Ptr_Glob is not Null) and sets Ptr_Glob's Int_Comp via Proc_7.   */
+ void Proc_3 (Rec_Pointer *Ptr_Ref_Par)
+ /******************/
+     /* executed once */
+     /* *Ptr_Ref_Par becomes Ptr_Glob->Ptr_Comp */
+ 
+ {
+   if (Ptr_Glob != Null)
+     /* then, executed */
+     *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+   /* Int_Comp becomes 10 + 2 + Int_Glob (17 when Int_Glob == 5) */
+   Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+ } /* Proc_3 */
+ 
+ 
+/* Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob  */
+/* to 'B'.                                                           */
+void Proc_4 () /* without parameters */
+ /*******/
+     /* executed once */
+ {
+   Boolean Bool_Loc;
+ 
+   Bool_Loc = Ch_1_Glob == 'A';
+   Bool_Glob = Bool_Loc | Bool_Glob;   /* bitwise OR of two 0/1 values */
+   Ch_2_Glob = 'B';
+ } /* Proc_4 */
+ 
+ 
+ /* Proc_5: resets Ch_1_Glob to 'A' and Bool_Glob to false; main     */
+ /* calls it first in every run.                                      */
+ void Proc_5 () /* without parameters */
+ /*******/
+     /* executed once */
+ {
+   Ch_1_Glob = 'A';
+   Bool_Glob = false;
+ } /* Proc_5 */
+ 
+ 
+         /* Procedure for the assignment of structures,          */
+         /* if the C compiler doesn't support this feature       */
+ #ifdef  NOSTRUCTASSIGN
+ /* Bytewise copy of l bytes from s to d, used by the structassign   */
+ /* macro. Old-style (K&R) definition with implicit int return type; */
+ /* no value is returned.                                             */
+ /* NOTE(review): dhry.h includes <string.h>, which already declares */
+ /* memcpy with a different prototype - this definition presumably   */
+ /* conflicts with it on current compilers; confirm before enabling  */
+ /* NOSTRUCTASSIGN.                                                   */
+ memcpy (d, s, l)
+ register char   *d;
+ register char   *s;
+ register int    l;
+ {
+         while (l--) *d++ = *s++;
+ }
+ #endif
+
+
+/* dtime: return the value of clock() converted to seconds, as a    */
+/* double. main subtracts two such values to time the benchmark.    */
+double dtime()
+{
+  /* HZ: clock() ticks per second. The macro stays defined for the  */
+  /* rest of this file.                                              */
+  #define HZ CLOCKS_PER_SEC
+
+  return (double) clock() / (double) HZ;
+}
--- a/contrib/cortex-strings/benchmarks/dhry/dhry_2.c
+++ b/contrib/cortex-strings/benchmarks/dhry/dhry_2.c
@ -0,0 +1,186 @@
+ /*
+  *************************************************************************
+  *
+  *                   "DHRYSTONE" Benchmark Program
+  *                   -----------------------------
+  *
+  *  Version:    C, Version 2.1
+  *
+  *  File:       dhry_2.c (part 3 of 3)
+  *
+  *  Date:       May 25, 1988
+  *
+  *  Author:     Reinhold P. Weicker
+  *
+  *************************************************************************
+  */
+
+ #include "dhry.h"
+ 
+ /* REG: storage-class prefix for selected locals in this file.      */
+ /* It becomes "register" when ROPT is defined (the switch dhry_1.c  */
+ /* uses) or when REG was already defined by the build; otherwise it */
+ /* is empty. The #undef avoids redefining an existing REG with a    */
+ /* different replacement text.                                       */
+ #if defined(ROPT) || defined(REG)
+ #undef REG
+ #define REG register
+ #else
+ #define REG
+         /* REG becomes defined as empty */
+         /* i.e. no register variables   */
+ #endif
+ 
+ extern  int     Int_Glob;
+ extern  char    Ch_1_Glob;
+ 
+ Boolean Func_3 (Enumeration Enum_Par_Val); 
+ 
+ /* Proc_6: maps Enum_Val_Par to a result stored through             */
+ /* Enum_Ref_Par. The switch assigns a result for every value except */
+ /* Ident_4, which keeps the value set before the switch.            */
+ void Proc_6 (Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par)
+ /*********************************/
+     /* executed once */
+     /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+ 
+ {
+   *Enum_Ref_Par = Enum_Val_Par;
+   if (! Func_3 (Enum_Val_Par))
+     /* then, not executed */
+     *Enum_Ref_Par = Ident_4;
+   switch (Enum_Val_Par)
+   {
+     case Ident_1: 
+       *Enum_Ref_Par = Ident_1;
+       break;
+     case Ident_2: 
+       if (Int_Glob > 100)
+         /* then */
+       *Enum_Ref_Par = Ident_1;
+       else *Enum_Ref_Par = Ident_4;
+       break;
+     case Ident_3: /* executed */
+       *Enum_Ref_Par = Ident_2;
+       break;
+     case Ident_4: break;   /* keeps the Ident_4 stored above */
+     case Ident_5: 
+       *Enum_Ref_Par = Ident_3;
+       break;
+   } /* switch */
+ } /* Proc_6 */
+ 
+ 
+ /* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through         */
+ /* Int_Par_Ref.                                                      */
+ void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
+                                              One_Fifty *Int_Par_Ref)
+ /**********************************************/
+     /* executed three times                                      */
+     /* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */
+     /*                  Int_Par_Ref becomes 7                    */
+     /* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
+     /*                  Int_Par_Ref becomes 17                   */
+     /* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
+     /*                  Int_Par_Ref becomes 18                   */
+
+ {
+   One_Fifty Int_Loc;
+ 
+   Int_Loc = Int_1_Par_Val + 2;
+   *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+ } /* Proc_7 */
+ 
+ 
+ /* Proc_8: array-access step. With Int_Loc = Int_1_Par_Val + 5 it   */
+ /* writes a few elements of both arrays around index Int_Loc,       */
+ /* increments Arr_2_Par_Ref[Int_Loc][Int_Loc-1] and sets Int_Glob   */
+ /* to 5. The callers must pass arrays large enough for indices      */
+ /* Int_Loc+30 and Int_Loc+20; no bounds are checked here.           */
+ void Proc_8 (Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
+                                  int Int_1_Par_Val, int Int_2_Par_Val)
+ /*********************************************************************/
+     /* executed once      */
+     /* Int_1_Par_Val == 3 */
+     /* Int_2_Par_Val == 7 */
+
+ {
+   REG One_Fifty Int_Index;
+   REG One_Fifty Int_Loc;
+ 
+   Int_Loc = Int_1_Par_Val + 5;
+   Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
+   Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
+   Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
+   for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
+     Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
+   /* This element accumulates one increment per call; main compares */
+   /* Arr_2_Glob[8][7] against Number_Of_Runs + 10.                  */
+   Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
+   Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
+   Int_Glob = 5;
+ } /* Proc_8 */
+ 
+ 
+ /* Func_1: returns Ident_1 when the two characters differ;          */
+ /* otherwise stores the character in Ch_1_Glob and returns Ident_2. */
+ Enumeration Func_1 (Capital_Letter Ch_1_Par_Val,
+                                           Capital_Letter Ch_2_Par_Val)
+ /*************************************************/
+     /* executed three times                                         */
+     /* first call (from Func_2, which passes Str_1_Par_Ref[2] and   */
+     /*   Str_2_Par_Ref[3] of strings starting "DHRYSTONE"):         */
+     /*                  Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y'    */
+     /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+     /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+ 
+ {
+   Capital_Letter        Ch_1_Loc;
+   Capital_Letter        Ch_2_Loc;
+ 
+   Ch_1_Loc = Ch_1_Par_Val;
+   Ch_2_Loc = Ch_1_Loc;
+   if (Ch_2_Loc != Ch_2_Par_Val)
+     /* then, executed */
+     return (Ident_1);
+   else  /* not executed */
+   {
+     Ch_1_Glob = Ch_1_Loc;
+     return (Ident_2);
+    }
+ } /* Func_1 */
+ 
+ 
+ /* Func_2: string-comparison step. Returns true when                */
+ /* strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0 (also storing a value  */
+ /* in Int_Glob), and false otherwise; the Ch_Loc == 'R' branch      */
+ /* cannot be taken because Ch_Loc is always 'A' after the loop.     */
+ Boolean Func_2 (Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref)
+ /*************************************************/
+     /* executed once */
+     /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+     /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+ 
+ {
+   REG One_Thirty        Int_Loc;
+       Capital_Letter    Ch_Loc;
+ 
+   Int_Loc = 2;
+   /* The loop only ends after the body below has run, so Ch_Loc is  */
+   /* always assigned before it is read.                             */
+   while (Int_Loc <= 2) /* loop body executed once */
+     if (Func_1 (Str_1_Par_Ref[Int_Loc],
+                 Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
+       /* then, executed */
+     {
+       Ch_Loc = 'A';
+       Int_Loc += 1;
+     } /* if, while */
+   if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+     /* then, not executed */
+     Int_Loc = 7;
+   if (Ch_Loc == 'R')
+     /* then, not executed */
+     return (true);
+   else /* executed */
+   {
+     if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+       /* then, not executed */
+     {
+       Int_Loc += 7;
+       Int_Glob = Int_Loc;
+       return (true);
+     }
+     else /* executed */
+       return (false);
+   } /* if Ch_Loc */
+ } /* Func_2 */
+ 
+ 
+ /* Func_3: returns true when Enum_Par_Val is Ident_3, false for any */
+ /* other value.                                                      */
+ Boolean Func_3 (Enumeration Enum_Par_Val)
+ /***************************/
+     /* executed once        */
+     /* Enum_Par_Val == Ident_3 */
+     
+ {
+   Enumeration Enum_Loc;
+ 
+   Enum_Loc = Enum_Par_Val;
+   if (Enum_Loc == Ident_3)
+     /* then, executed */
+     return (true);
+   else /* not executed */
+     return (false);
+ } /* Func_3 */
--- a/contrib/cortex-strings/benchmarks/multi/harness.c
+++ b/contrib/cortex-strings/benchmarks/multi/harness.c
@ -0,0 +1,407 @@
+/*
+ * Copyright (c) 2011, Linaro Limited
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Linaro nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/** A simple harness that times how long a string function takes to
+ * run.
+ */
+
+/* PENDING: Add EPL */
+
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <unistd.h>
+#include <errno.h>
+#include <limits.h>
+
+/** Number of elements in an array: its total size over the size of one
+ * element.  Only meaningful for a real array, not for a pointer. */
+#define NUM_ELEMS(_x) (sizeof(_x) / sizeof((_x)[0]))
+
+/* Fallback used when the build does not define VERSION itself */
+#ifndef VERSION
+#define VERSION "(unknown version)"
+#endif
+
+/** Make sure a function is called by using the return value.
+ * Expands to the declaration of a volatile local named x followed by an
+ * expression, so it fits once per scope and the caller supplies the
+ * closing semicolon. */
+#define SPOIL(_x)  volatile long x = (long)(_x); (void)x
+
+/** Type of functions that can be tested.  Every stub takes the same
+ * three arguments whether or not the wrapped routine needs them all. */
+typedef void (*stub_t)(void *dest, void *src, size_t n);
+
+/** Meta data about one test */
+struct test
+{
+  /** Test name */
+  const char *name;
+  /** Function to test */
+  stub_t stub;
+};
+
+/** Flush the cache by reading a chunk of memory.
+ *
+ * The caller must provide at least 816 KiB of readable memory at
+ * against, since the sweep starts 768 KiB in and covers 48 KiB.
+ */
+static void empty(volatile char *against)
+{
+  /* Sized for a 16 k cache with 64 byte lines: sweep three times the
+     cache size and touch one byte per line, 768 reads in total.  The
+     pointer is volatile, so each read really happens. */
+  const int line = 64;
+  const int first = (1024 - 256)*1024;
+  const int limit = first + 16*1024*3;
+  int pos = first;
+
+  while (pos < limit)
+    {
+      (void)against[pos];
+      pos += line;
+    }
+}
+
+/** Stub that does nothing.  Used for calibrating: it costs one call
+ * through the stub pointer plus the SPOIL store.  All arguments are
+ * ignored. */
+static void xbounce(void *dest, void *src, size_t n)
+{
+  SPOIL(0);
+}
+
+/** Stub that calls memcpy: copies n bytes from src to dest */
+static void xmemcpy(void *dest, void *src, size_t n)
+{
+  SPOIL(memcpy(dest, src, n));
+}
+
+/** Stub that calls memset: zeroes n bytes of dest.  src is unused. */
+static void xmemset(void *dest, void *src, size_t n)
+{
+  SPOIL(memset(dest, 0, n));
+}
+
+/** Stub that calls memcmp on the first n bytes of dest and src */
+static void xmemcmp(void *dest, void *src, size_t n)
+{
+  SPOIL(memcmp(dest, src, n));
+}
+
+/** Stub that calls strcpy from src to dest.  n is unused; the amount
+ * copied is set by where src is terminated. */
+static void xstrcpy(void *dest, void *src, size_t n)
+{
+  SPOIL(strcpy(dest, src));
+}
+
+/** Stub that calls strlen on dest.  src and n are unused. */
+static void xstrlen(void *dest, void *src, size_t n)
+{
+  SPOIL(strlen(dest));
+}
+
+/** Stub that calls strcmp on dest and src.  n is unused. */
+static void xstrcmp(void *dest, void *src, size_t n)
+{
+  SPOIL(strcmp(dest, src));
+}
+
+/** Stub that calls strchr.
+ *
+ * Plants a space as the last of the n bytes of src so the search has to
+ * walk the whole string before it matches.  dest is unused.  No
+ * terminator is written here; src must already be terminated.
+ */
+static void xstrchr(void *dest, void *src, size_t n)
+{
+  /* Put the character at the end of the string and before the null.
+     With n == 0 there is no such position: n-1 would wrap around and
+     the store would land outside the buffer, so skip it. */
+  if (n > 0)
+    {
+      ((char *)src)[n-1] = 32;
+    }
+
+  SPOIL(strchr(src, 32));
+}
+
+/** Stub that calls memchr.
+ *
+ * Plants a space as the last of the n bytes of src so the search has to
+ * scan the whole block before it matches.  dest is unused.
+ */
+static void xmemchr(void *dest, void *src, size_t n)
+{
+  /* Put the character at the end of the block.  An empty block has no
+     last byte: n-1 would wrap around and the store would land outside
+     the buffer, so skip it. */
+  if (n > 0)
+    {
+      ((char *)src)[n-1] = 32;
+    }
+
+  SPOIL(memchr(src, 32, n));
+}
+
+/** All functions that can be tested.
+ *
+ * The closing { NULL } entry ends the table: usage() and find_test()
+ * both stop at the first NULL name.  Entry 0 is the one find_test()
+ * hands back when no test name was given.
+ */
+static const struct test tests[] =
+  {
+    { "bounce", xbounce },
+    { "memchr", xmemchr },
+    { "memcpy", xmemcpy },
+    { "memset", xmemset },
+    { "memcmp", xmemcmp },
+    { "strchr", xstrchr },
+    { "strcmp", xstrcmp },
+    { "strcpy", xstrcpy },
+    { "strlen", xstrlen },
+    { NULL }
+  };
+
+/** Show basic usage, list the known test names, then exit.
+ *
+ * name is the executable name to show in the message.  Does not
+ * return: the process exits with status -1.
+ */
+static void usage(const char* name)
+{
+  size_t index;
+
+  printf("%s %s: run a string related benchmark.\n", name, VERSION);
+  printf("usage: %s [-c block-size] [-l loop-count] [-a alignment|src_alignment:dst_alignment] [-f] [-t test-name] [-r run-id]\n",
+         name);
+
+  fputs("Tests:", stdout);
+
+  /* The table ends at the entry whose name is NULL */
+  for (index = 0; tests[index].name != NULL; index++)
+    {
+      printf(" %s", tests[index].name);
+    }
+
+  fputs("\n", stdout);
+
+  exit(-1);
+}
+
+/** Find the test by name.
+ *
+ * A NULL name selects the first entry of the table.  Returns NULL when
+ * no entry carries the given name.
+ */
+static const struct test *find_test(const char *name)
+{
+  const struct test *cursor = tests;
+
+  if (name == NULL)
+    {
+      return cursor;
+    }
+
+  /* Walk up to the terminating entry, whose name is NULL */
+  while (cursor->name != NULL)
+    {
+      if (strcmp(cursor->name, name) == 0)
+        {
+          return cursor;
+        }
+
+      cursor++;
+    }
+
+  return NULL;
+}
+
+/* Smallest size of the source and destination buffers, in bytes.
+   Parenthesised so the product stays a single operand wherever the
+   macro expands. */
+#define MIN_BUFFER_SIZE (1024*1024)
+/* Boundary that realign() rounds a buffer up to before the requested
+   offset is added */
+#define MAX_ALIGNMENT	256
+
+/** Take a pointer and ensure that the lower bits == alignment.
+ *
+ * The pointer is rounded up to the next MAX_ALIGNMENT boundary and then
+ * moved forward by alignment bytes, so the result can lie up to
+ * MAX_ALIGNMENT - 1 + alignment bytes past p.
+ */
+static char *realign(char *p, int alignment)
+{
+  const uintptr_t low_bits = MAX_ALIGNMENT - 1;
+  uintptr_t address = ((uintptr_t)p + low_bits) & ~low_bits;
+
+  return (char *)(address + alignment);
+}
+
+/** Parse a whole command line argument as an int.
+ *
+ * The number may be decimal, octal (leading 0) or hexadecimal (leading
+ * 0x).  Anything that is not entirely a number, or that does not fit in
+ * an int, ends in usage(exe_name), which exits.
+ */
+static int parse_int_arg(const char *arg, const char *exe_name)
+{
+  long int ret;
+  char *endptr;
+
+  errno = 0;
+  ret = strtol(arg, &endptr, 0);
+
+  /* errno only reports overflow.  An argument without digits, or with
+     junk after them, still converts without error, so also check where
+     the conversion stopped. */
+  if (errno || endptr == arg || *endptr != '\0')
+    {
+      usage(exe_name);
+    }
+
+  /* The cast below would silently truncate a long that is out of range */
+  if (ret < INT_MIN || ret > INT_MAX)
+    {
+      usage(exe_name);
+    }
+
+  return (int)ret;
+}
+
+/** Parse one alignment value starting at text.
+ *
+ * Accepts 1 to 256 and returns the byte offset to apply: 256 is the
+ * same as no offset and comes back as 0.  *endptr is set to the first
+ * character after the number.  Invalid input ends in usage(), which
+ * exits.
+ */
+static int parse_one_alignment(const char *text, const char *exe_name,
+                               char **endptr)
+{
+  long int ret;
+
+  errno = 0;
+  ret = strtol(text, endptr, 0);
+
+  if (errno)
+    {
+      usage(exe_name);
+    }
+
+  if (ret > 256 || ret < 1)
+    {
+      printf("Alignment should be in the range [1, 256].\n");
+      usage(exe_name);
+    }
+
+  return (ret == 256) ? 0 : (int)ret;
+}
+
+/** Parse the -a argument: either "alignment" or
+ * "src_alignment:dst_alignment".
+ *
+ * A single value applies to both buffers.  Both results are stored in
+ * the same folded form, so 256 is reported as 0 for the source as well
+ * as for the destination.  Invalid input ends in usage(), which exits.
+ */
+static void parse_alignment_arg(const char *arg, const char *exe_name,
+                                int *src_alignment, int *dst_alignment)
+{
+  char *endptr;
+  int src = parse_one_alignment(arg, exe_name, &endptr);
+  int dst = src;
+
+  if (*endptr == ':')
+    {
+      dst = parse_one_alignment(endptr + 1, exe_name, &endptr);
+    }
+
+  /* Whatever follows the last number is a typo, not an alignment */
+  if (*endptr != '\0')
+    {
+      usage(exe_name);
+    }
+
+  *src_alignment = src;
+  *dst_alignment = dst;
+}
+
+/** Setup and run a test.
+ *
+ * Options: -c bytes per call, -l number of calls, -a alignment or
+ * src:dst alignment, -f flush the cache after every call, -t test name,
+ * -r run id, -h help.  Prints one result line and returns 0, or
+ * returns 1 when the buffers cannot be allocated.  Bad arguments end in
+ * usage(), which exits.  The executable name has to contain "try-";
+ * the text after it is reported as the variant.
+ */
+int main(int argc, char **argv)
+{
+  /* Size of src and dest buffers */
+  size_t buffer_size = MIN_BUFFER_SIZE;
+
+  /* Number of bytes per call */
+  int count = 31;
+  /* Number of times to run */
+  int loops = 10000000;
+  /* True to flush the cache each time */
+  int flush = 0;
+  /* Name of the test */
+  const char *name = NULL;
+  /* Alignment of buffers */
+  int src_alignment = 8;
+  int dst_alignment = 8;
+  /* Name of the run */
+  const char *run_id = "0";
+
+  int opt;
+
+  while ((opt = getopt(argc, argv, "c:l:ft:r:hva:")) > 0)
+    {
+      switch (opt)
+        {
+        case 'c':
+          count = parse_int_arg(optarg, argv[0]);
+          break;
+        case 'l':
+          loops = parse_int_arg(optarg, argv[0]);
+          break;
+        case 'a':
+          parse_alignment_arg(optarg, argv[0], &src_alignment, &dst_alignment);
+          break;
+        case 'f':
+          flush = 1;
+          break;
+        case 't':
+          name = strdup(optarg);
+          break;
+        case 'r':
+          run_id = strdup(optarg);
+          break;
+        case 'h':
+          usage(argv[0]);
+          break;
+        default:
+          usage(argv[0]);
+          break;
+        }
+    }
+
+  /* Find the test by name */
+  const struct test *ptest = find_test(name);
+
+  if (ptest == NULL)
+    {
+      usage(argv[0]);
+    }
+
+  /* A negative size would index in front of the buffers further down */
+  if (count < 0)
+    {
+      printf("Block size must not be negative.\n");
+      usage(argv[0]);
+    }
+
+  if ((size_t)count + MAX_ALIGNMENT * 2 > MIN_BUFFER_SIZE)
+    {
+      buffer_size = (size_t)count + MAX_ALIGNMENT * 2;
+    }
+
+  /* Buffers to read and write from.  realign() moves a pointer forward
+     by up to MAX_ALIGNMENT - 1 bytes to reach the boundary and then by
+     the requested offset, which is at most MAX_ALIGNMENT.  Allocate
+     that much extra so buffer_size bytes remain valid afterwards. */
+  size_t alloc_size = buffer_size + MAX_ALIGNMENT * 2;
+  char *src_base = malloc(alloc_size);
+  char *dest_base = malloc(alloc_size);
+
+  if (src_base == NULL || dest_base == NULL)
+    {
+      fprintf(stderr, "%s: cannot allocate the test buffers\n", argv[0]);
+      free(src_base);
+      free(dest_base);
+      return 1;
+    }
+
+  char *src = realign(src_base, src_alignment);
+  char *dest = realign(dest_base, dst_alignment);
+
+  /* Fill the buffer with non-zero, reproducible random data */
+  srandom(1539);
+
+  for (size_t i = 0; i < buffer_size; i++)
+    {
+      src[i] = (char)random() | 1;
+      dest[i] = src[i];
+    }
+
+  /* Make sure the buffers are null terminated for any string tests */
+  src[count] = 0;
+  dest[count] = 0;
+
+  struct timespec start, end;
+  int err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
+  assert(err == 0);
+
+  /* Preload */
+  stub_t stub = ptest->stub;
+
+  /* Run two variants to reduce the cost of testing for the flush */
+  if (flush == 0)
+    {
+      for (int i = 0; i < loops; i++)
+        {
+          (*stub)(dest, src, count);
+        }
+    }
+  else
+    {
+      for (int i = 0; i < loops; i++)
+        {
+          (*stub)(dest, src, count);
+          empty(dest);
+        }
+    }
+
+  err = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
+  assert(err == 0);
+
+  /* Drop any leading path and pull the variant name out of the executable */
+  char *variant = strrchr(argv[0], '/');
+
+  if (variant == NULL)
+    {
+      variant = argv[0];
+    }
+
+  variant = strstr(variant, "try-");
+  assert(variant != NULL);
+
+  double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) * 1e-9;
+  /* Estimate the bounce time.  Measured on a Panda. */
+  double bounced = 0.448730 * loops / 50000000;
+
+  /* Dump both machine and human readable versions */
+  printf("%s:%s:%u:%u:%d:%d:%s:%.6f: took %.6f s for %u calls to %s of %u bytes.  ~%.3f MB/s corrected.\n",
+         variant + 4, ptest->name,
+         count, loops, src_alignment, dst_alignment, run_id,
+         elapsed,
+         elapsed, loops, ptest->name, count,
+         (double)loops*count/(elapsed - bounced)/(1024*1024));
+
+  free(src_base);
+  free(dest_base);
+
+  return 0;
+}
--- a/contrib/cortex-strings/configure.ac
+++ b/contrib/cortex-strings/configure.ac
@ -0,0 +1,88 @@
+# Copyright (c) 2011-2012, Linaro Limited
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the Linaro nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Package name and version
+AC_INIT(cortex-strings, 1.1-2012.06~dev)
+AM_INIT_AUTOMAKE(foreign subdir-objects color-tests dist-bzip2)
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES(Makefile)
+# Sets $host, which the architecture selection below switches on
+AC_CANONICAL_HOST
+# Tools: assembler, C compiler, libtool
+AM_PROG_AS
+AC_PROG_CC
+AC_PROG_LIBTOOL
+
+# Pick the architecture family from the canonical host name.  Only the
+# 32-bit ARM family comes with a default CPU variant.
+default_submachine=
+
+case $host in
+aarch64*-*-*)
+  arch=aarch64
+  ;;
+arm*-*-*)
+  arch=aarch32
+  default_submachine=cortex-a9
+  ;;
+# A canonical name is only sure to contain two dashes, as in
+# x86_64-unknown-freebsd11.0, so do not insist on a third one.
+x86_64-*-*)
+  arch=generic
+  ;;
+*)
+  AC_MSG_ERROR([unknown architecture $host])
+  ;;
+esac
+
+# Exactly one of these is true after the case above
+AM_CONDITIONAL([HOST_AARCH32], [test x$arch = xaarch32])
+AM_CONDITIONAL([HOST_AARCH64], [test x$arch = xaarch64])
+AM_CONDITIONAL([HOST_GENERIC], [test x$arch = xgeneric])
+
+# --with-cpu=CPU selects the CPU variant.  Without the option the
+# per-architecture default from above is used; --without-cpu leaves
+# submachine unset.
+AC_ARG_WITH([cpu],
+	    AS_HELP_STRING([--with-cpu=CPU],
+                           [select code for CPU variant @<:@default=cortex-a9@:>@]),
+	    [dnl
+  case "$withval" in
+  yes|'') AC_MSG_ERROR([--with-cpu requires an argument]) ;;
+  no) ;;
+  *) submachine="$withval" ;;
+  esac
+],
+[submachine=$default_submachine])
+
+AC_SUBST(submachine)
+AM_CONDITIONAL([WITH_SUBMACHINE], [test x$submachine != x])
+
+# --with-neon / --without-neon: include the NEON routines (on by default)
+AC_ARG_WITH([neon],
+            AS_HELP_STRING([--with-neon],
+                           [include NEON specific routines @<:@default=yes@:>@]),
+	    [with_neon=$withval],
+	    [with_neon=yes])
+AC_SUBST(with_neon)
+AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes)
+
+# --with-vfp / --without-vfp: include the VFP routines (on by default)
+AC_ARG_WITH([vfp],
+            AS_HELP_STRING([--with-vfp],
+                           [include VFP specific routines @<:@default=yes@:>@]),
+	    [with_vfp=$withval],
+	    [with_vfp=yes])
+AC_SUBST(with_vfp)
+AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes)
+
+AC_OUTPUT
--- a/contrib/cortex-strings/scripts/add-license.sh
+++ b/contrib/cortex-strings/scripts/add-license.sh
@ -0,0 +1,79 @@
+#!/bin/bash
+#
+# Add the modified BSD license to a file
+#
+
+# Scratch directory for the license templates, removed again on exit.
+# Stop right away if it cannot be created: every path below is built
+# from it.
+f=`mktemp -d` || exit 1
+trap 'rm -rf "$f"' EXIT
+
+year=`date +%Y`
+cat > "$f/original" <<EOF
+Copyright (c) $year, Linaro Limited
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Linaro nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+EOF
+
+# Translate it to C style: one block comment, trailing blanks stripped
+echo "/*" > "$f/c"
+sed -r 's/(.*)/ * \1/' "$f/original" | sed -r 's/ +$//' >> "$f/c"
+echo " */" >> "$f/c"
+echo >> "$f/c"
+
+# ...and shell style: every line behind a '#', closed by a bare '#'
+sed -r 's/(.*)/# \1/' "$f/original" | sed -r 's/ +$//' >> "$f/shell"
+echo '#' >> "$f/shell"
+echo >> "$f/shell"
+
+# Put the matching template in front of every file named on the command
+# line.  Files that already mention a copyright and files without an
+# explicit license are left alone.
+for name in "$@"; do
+    if grep -q Copyright "$name"; then
+        echo "$name already has some type of copyright"
+        continue
+    fi
+
+    case $name in
+        # These files don't have an explicit license
+        *autogen.sh*)
+            continue;;
+        *reference/newlib/*)
+            continue;;
+        *reference/newlib-xscale/*)
+            continue;;
+        */dhry/*)
+            continue;;
+
+        *.c)
+            src=$f/c
+            ;;
+        *.sh|*.am|*.ac)
+            src=$f/shell
+            ;;
+        *)
+            echo "Unrecognized extension on $name"
+            continue
+            ;;
+    esac
+
+    # A "#!" line only works as the very first line, so keep it there
+    # and put the license right after it
+    if head -n 1 "$name" | grep -q '^#!'; then
+        { head -n 1 "$name"; cat "$src"; tail -n +2 "$name"; } > "$f/next"
+    else
+        cat "$src" "$name" > "$f/next"
+    fi
+
+    # Overwrite in place instead of moving the temporary file over it,
+    # so that the mode of the file (its executable bit) is kept
+    cat "$f/next" > "$name"
+    echo "Updated $name"
+done
--- a/contrib/cortex-strings/scripts/bench.py
+++ b/contrib/cortex-strings/scripts/bench.py
@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+"""Simple harness that benchmarks different variants of the routines,
+caches the results, and emits all of the records at the end.
+
+Results are generated for different values of:
+ * Source
+ * Routine
+ * Length
+ * Alignment
+"""
+
+import argparse
+import subprocess
+import math
+import sys
+
+# Prefix to the executables; run() appends the variant name to it
+build = '../build/try-'
+
+# Shorthand for variants that provide every routine
+ALL = 'memchr memcmp memcpy memset strchr strcmp strcpy strlen'
+
+# Variant name -> space separated routines that variant provides.
+# run_many() skips every variant/routine pair that is not listed here.
+HAS = {
+    'this': 'bounce memchr memcpy memset strchr strcmp strcpy strlen',
+    'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen',
+    'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen',
+    'bionic-c': ALL,
+    'csl': 'memcpy memset',
+    'glibc': 'memcpy memset strchr strlen',
+    'glibc-c': ALL,
+    'newlib': 'memcpy strcmp strcpy strlen',
+    'newlib-c': ALL,
+    'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen',
+    'plain': 'memset memcpy strcmp strcpy',
+}
+
+# Values handed to the harness through -a: a single alignment, or
+# source:destination
+BOUNCE_ALIGNMENTS = ['1']
+SINGLE_BUFFER_ALIGNMENTS = ['1', '2', '4', '8', '16', '32']
+DUAL_BUFFER_ALIGNMENTS = ['1:32', '2:32', '4:32', '8:32', '16:32', '32:32']
+
+# Routine -> the alignments it is benchmarked with
+ALIGNMENTS = {
+    'bounce': BOUNCE_ALIGNMENTS,
+    'memchr': SINGLE_BUFFER_ALIGNMENTS,
+    'memset': SINGLE_BUFFER_ALIGNMENTS,
+    'strchr': SINGLE_BUFFER_ALIGNMENTS,
+    'strlen': SINGLE_BUFFER_ALIGNMENTS,
+    'memcmp': DUAL_BUFFER_ALIGNMENTS,
+    'memcpy': DUAL_BUFFER_ALIGNMENTS,
+    'strcmp': DUAL_BUFFER_ALIGNMENTS,
+    'strcpy': DUAL_BUFFER_ALIGNMENTS,
+}
+
+# Choices and defaults for the command line options
+VARIANTS = sorted(HAS.keys())
+FUNCTIONS = sorted(ALIGNMENTS.keys())
+
+# How often each size is measured; the pass number becomes the run id
+NUM_RUNS = 5
+
+def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False):
+    """Perform a single run, exercising the cache as appropriate.
+
+    Returns the elapsed time in seconds, taken from the eighth colon
+    separated field of the result line.  The line comes from `cache`
+    when the same run is already in there, otherwise from running the
+    harness executable of `variant`.  Either way it is stored in
+    `cache`, and printed unless `quiet` is set.
+    """
+    # A result line always carries a source and a destination alignment,
+    # and the saved cache is keyed on that form.  Spell a single
+    # alignment the same way so that saved results are found again.
+    alignment_key = '%s' % (alignment,)
+    if ':' not in alignment_key:
+        alignment_key = '%s:%s' % (alignment_key, alignment_key)
+
+    key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment_key, run_id))
+
+    if key in cache:
+        got = cache[key]
+    else:
+        xbuild = build
+        cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals()
+
+        try:
+            # universal_newlines makes the output text on Python 3 too
+            got = subprocess.check_output(cmd.split(), universal_newlines=True).strip()
+        except OSError as ex:
+            assert False, 'Error %s while running %s' % (ex, cmd)
+
+    parts = got.split(':')
+    took = float(parts[7])
+
+    cache[key] = got
+
+    if not quiet:
+        print(got)
+        sys.stdout.flush()
+
+    return took
+
+def run_many(cache, variants, bytes, all_functions):
+    """Benchmark each routine, alignment and variant over all sizes.
+
+    The order is fixed so the data comes out usefully: routine first,
+    then alignment, then variant, then every size in ascending order
+    with NUM_RUNS runs each.  When `all_functions` is empty the routines
+    of 'this' come first, followed by any others found in HAS.
+    """
+    sizes = sorted(bytes)
+    # The size used for the calibration run
+    probe_size = sizes[int(len(sizes)/1.5)]
+
+    if not all_functions:
+        # Use the ordering in 'this' as the default
+        all_functions = HAS['this'].split()
+
+        # Find all other functions
+        for listed in HAS.values():
+            for candidate in listed.split():
+                if candidate not in all_functions:
+                    all_functions.append(candidate)
+
+    for function in all_functions:
+        # Only variants that provide this routine take part
+        providers = [v for v in variants if function in HAS[v].split()]
+
+        for alignment in ALIGNMENTS[function]:
+            for variant in providers:
+                # Run a tracer through and see how long it takes and
+                # adjust the number of loops based on that.  Not great
+                # for memchr() and similar which are O(n), but it will
+                # do
+                budget = 50000000
+                target_seconds = 5.0
+
+                probe_loops = int(budget / math.sqrt(max(1, probe_size)))
+                elapsed = run(cache, variant, function, probe_size,
+                              probe_loops, alignment, 0, quiet=True)
+                # Keep it reasonable for silly routines like bounce
+                budget = budget * min(20, max(0.05, target_seconds/elapsed))
+
+                # Round the budget to a few significant figures
+                magnitude = 10**int(math.log10(budget) - 1)
+                budget = magnitude*int(budget/magnitude)
+
+                for size in sizes:
+                    # Figure out the number of loops to give a roughly consistent run
+                    loops = int(budget / math.sqrt(max(1, size)))
+                    for run_id in range(NUM_RUNS):
+                        run(cache, variant, function, size, loops, alignment,
+                            run_id)
+
+def run_top(cache):
+    """Read the command line, build the list of sizes and run it all.
+
+    The sizes are the integer parts of the powers of 2 and of the powers
+    of 1.4, each series reaching up to about the --limit value.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v", "--variants", nargs="+",
+                        help="library variant to run (run all if not specified)",
+                        default = VARIANTS, choices = VARIANTS)
+    parser.add_argument("-f", "--functions", nargs="+",
+                        help="function to run (run all if not specified)",
+                        default = FUNCTIONS, choices = FUNCTIONS)
+    parser.add_argument("-l", "--limit", type=int,
+                        help="upper limit to test to (in bytes)",
+                        default = 512*1024)
+    options = parser.parse_args()
+
+    sizes = []
+
+    # All powers of 2 first, then the intermediate powers of 1.4
+    for ratio in (2.0, 1.4):
+        # Figure out how many steps get us up to the top
+        top_exponent = int(round(math.log(options.limit) / math.log(ratio)))
+
+        for exponent in range(top_exponent + 1):
+            sizes.append(int(ratio**exponent))
+
+    run_many(cache, options.variants, sizes, options.functions)
+
+def main():
+    """Load the saved results, run the benchmarks, save the results.
+
+    The cache file holds one result line per run.  It is rewritten when
+    the benchmarks finish and also when they are interrupted, so that
+    completed runs are not measured again.
+    """
+    cachename = 'cache.txt'
+
+    cache = {}
+
+    try:
+        with open(cachename) as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                # The first seven fields identify the run
+                parts = line.split(':')
+                cache[':'.join(parts[:7])] = line
+    except (IOError, OSError):
+        # No readable cache file yet: start with an empty cache
+        pass
+
+    try:
+        run_top(cache)
+    finally:
+        with open(cachename, 'w') as f:
+            for line in sorted(cache.values()):
+                f.write(line + '\n')
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/fixup.py
+++ b/contrib/cortex-strings/scripts/fixup.py
@ -0,0 +1,27 @@
+"""Simple script that enables target specific blocks based on the first argument.
+
+Matches comment blocks like this:
+
+/* For Foo: abc
+def
+*/
+
+and de-comments them giving:
+abc
+def
+"""
+import re
+import sys
+
+def main():
+    """De-comment the blocks marked for the target named in argv[1].
+
+    Every file named after the target is rewritten in place.
+    """
+    key = sys.argv[1]
+    # The target name is literal text, not a pattern: escape it so that
+    # characters such as '+' or '.' only match themselves
+    expr = re.compile(r'/\* For %s:\s([^*]+)\*/' % re.escape(key), re.M)
+
+    for arg in sys.argv[2:]:
+        with open(arg) as f:
+            body = f.read()
+        with open(arg, 'w') as f:
+            f.write(expr.sub(r'\1', body))
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/libplot.py
+++ b/contrib/cortex-strings/scripts/libplot.py
@ -0,0 +1,78 @@
+"""Shared routines for the plotters."""
+
+import fileinput
+import collections
+
+# One result line: a field for every colon separated column, in order
+Record = collections.namedtuple('Record', 'variant function bytes loops src_alignment dst_alignment run_id elapsed rest')
+
+
+def make_colours():
+    """Return a fresh iterator over the colour names used for plotting."""
+    palette = 'm b g r c y k pink orange brown grey'
+    return iter(palette.split())
+
+def parse_value(v):
+    """Turn text into a primitive.
+
+    Text containing a '.' is read as a float, other text as an int.
+    Text that does not convert is returned unchanged.
+    """
+    convert = float if '.' in v else int
+
+    try:
+        return convert(v)
+    except ValueError:
+        return v
+
+def create_column_tuple(record, names):
+    """Return the named attributes of a record as a tuple, in order."""
+    return tuple(getattr(record, name) for name in names)
+
+def unique(records, name, prefer=''):
+    """Return the unique values of a column in the records.
+
+    `name` is a column name, or a tuple of column names to get tuples of
+    values.  The result is sorted.  Strings that occur at the start of
+    `prefer` sort ahead of strings that do not occur in it.
+    """
+    if type(name) == tuple:
+        pick = lambda record: create_column_tuple(record, name)
+    else:
+        pick = lambda record: getattr(record, name)
+
+    distinct = list(set(pick(x) for x in records))
+
+    if not distinct:
+        return distinct
+
+    if type(distinct[0]) == str:
+        def rank(value):
+            return '%-06d|%s' % (-prefer.find(value), value)
+
+        return sorted(distinct, key=rank)
+
+    return sorted(distinct)
+
+def alignments_equal(alignments):
+    """Return True when every pair has the same first and second entry."""
+    return all(pair[0] == pair[1] for pair in alignments)
+
+def parse_row(line):
+    """Turn one colon separated line into a Record of parsed values."""
+    fields = line.split(':')
+    return Record(*map(parse_value, fields))
+
+def parse():
+    """Parse a record file into named tuples, correcting for loop
+    overhead along the way.
+
+    The 'bounce' records give the overhead for a number of bytes and
+    loops.  They are left out of the result, and their elapsed time is
+    subtracted from every other record with the same bytes and loops.
+    """
+    records = [parse_row(x) for x in fileinput.input()]
+
+    # Pull out any bounce values
+    overhead = {}
+
+    for record in records:
+        if record.function == 'bounce':
+            overhead[(record.bytes, record.loops)] = record.elapsed
+
+    # Fix up all of the records for cost
+    corrected = []
+
+    for record in records:
+        if record.function == 'bounce':
+            continue
+
+        cost = overhead.get((record.bytes, record.loops), None)
+
+        if cost:
+            # A namedtuple cannot be changed: make a copy with the new time
+            record = record._replace(elapsed=record.elapsed - cost)
+
+        corrected.append(record)
+
+    return corrected
--- a/contrib/cortex-strings/scripts/plot-align.py
+++ b/contrib/cortex-strings/scripts/plot-align.py
@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+"""Plot the performance of different variants of one routine versus alignment.
+"""
+
+import libplot
+
+import pylab
+
+
+def plot(records, bytes, function):
+    """Draw a bar chart of one routine at one block size.
+
+    There is one group of bars per alignment and one bar per variant.
+    The height is the mean rate in MB/s over the matching records, or 0
+    where a variant has none.  The figure is saved as
+    alignment-<function>-<bytes>.png.
+    """
+    records = [x for x in records if x.bytes==bytes and x.function==function]
+
+    variants = libplot.unique(records, 'variant', prefer='this')
+    alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
+
+    X = pylab.arange(len(alignments))
+    # Leave one bar width free between the groups
+    width = 1.0/(len(variants)+1)
+
+    colours = libplot.make_colours()
+
+    pylab.figure(1).set_size_inches((16, 12))
+    pylab.clf()
+
+    for i, variant in enumerate(variants):
+        heights = []
+
+        for alignment in alignments:
+            matches = [x for x in records if x.variant==variant and x.src_alignment==alignment[0] and x.dst_alignment==alignment[1]]
+
+            if matches:
+                vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
+                        match in matches]
+                mean = sum(vals)/len(vals)
+                heights.append(mean)
+            else:
+                heights.append(0)
+
+        # next() works on Python 2 and 3; the .next() method is Python 2 only
+        pylab.bar(X+i*width, heights, width, color=next(colours), label=variant)
+
+
+    axes = pylab.axes()
+    if libplot.alignments_equal(alignments):
+        alignment_labels = ["%s" % x[0] for x in alignments]
+    else:
+        alignment_labels = ["%s:%s" % (x[0], x[1]) for x in alignments]
+    axes.set_xticklabels(alignment_labels)
+    axes.set_xticks(X + 0.5)
+
+    pylab.title('Performance of different variants of %(function)s for %(bytes)d byte blocks' % locals())
+    pylab.xlabel('Alignment')
+    pylab.ylabel('Rate (MB/s)')
+    pylab.legend(loc='lower right', ncol=3)
+    pylab.grid()
+    pylab.savefig('alignment-%(function)s-%(bytes)d.png' % locals(), dpi=72)
+
+def main():
+    """Plot every routine at every block size found in the input."""
+    data = libplot.parse()
+    sizes = libplot.unique(data, 'bytes')
+
+    for routine in libplot.unique(data, 'function'):
+        for size in sizes:
+            plot(data, size, routine)
+
+    pylab.show()
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/plot-sizes.py
+++ b/contrib/cortex-strings/scripts/plot-sizes.py
@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+"""Plot the performance for different block sizes of one function across
+variants.
+"""
+
+import libplot
+
+import pylab
+import pdb
+import math
+
+def pretty_kb(v):
+    """Format a byte count as an axis label.
+
+    Values below 1024 are shown as they are, whole multiples of 1024 as
+    'N k', and everything else as kilobytes with one decimal.
+    """
+    if v < 1024:
+        return '%d' % v
+    if v % 1024 == 0:
+        return '%d k' % (v//1024)
+    # Divide by a float: dividing two ints drops the fraction on Python 2
+    return '%.1f k' % (v/1024.0)
+
+def plot(records, function, alignment=None, scale=1):
+    """Plot the rate of one routine against block size, one line per variant.
+
+    `alignment` is a (source, destination) pair to restrict the records
+    to; `scale` multiplies the figure size.  Returns False without a
+    usable plot when the records do not come down to exactly one
+    alignment or when there is nothing to draw, True otherwise.  Saving
+    the figure is left to the caller.
+    """
+    variants = libplot.unique(records, 'variant', prefer='this')
+    records = [x for x in records if x.function==function]
+
+    if alignment != None:
+        records = [x for x in records if x.src_alignment==alignment[0] and
+                   x.dst_alignment==alignment[1]]
+
+    alignments = libplot.unique(records, ('src_alignment', 'dst_alignment'))
+    if len(alignments) != 1:
+        return False
+    if libplot.alignments_equal(alignments):
+        aalignment = alignments[0][0]
+    else:
+        aalignment = "%s:%s" % (alignments[0][0], alignments[0][1])
+
+    colours = libplot.make_colours()
+    all_x = []
+
+    pylab.figure(1).set_size_inches((6.4*scale, 4.8*scale))
+    pylab.clf()
+
+    if 'str' in function:
+        # The harness fills out to 16k.  Anything past that is an
+        # early match
+        top = 16384
+    else:
+        top = 2**31
+
+    for variant in variants:
+        matches = [x for x in records if x.variant==variant and x.bytes <= top]
+        matches.sort(key=lambda x: x.bytes)
+
+        X = sorted(list(set([x.bytes for x in matches])))
+        Y = []
+        # Distance from the mean down to the minimum and up to the
+        # maximum.  There is an entry for every size, also for sizes
+        # with a single sample, so the lists stay as long as X.
+        below = []
+        above = []
+        have_spread = False
+
+        for xbytes in X:
+            vals = [x.bytes*x.loops/x.elapsed/(1024*1024) for x in matches if x.bytes == xbytes]
+            if len(vals) > 1:
+                mean = sum(vals)/len(vals)
+                Y.append(mean)
+                have_spread = True
+                # Rounding can put the mean a hair outside the samples;
+                # never let that turn into a negative error
+                below.append(max(0.0, mean - min(vals)))
+                above.append(max(0.0, max(vals) - mean))
+            else:
+                Y.append(vals[0])
+                below.append(0.0)
+                above.append(0.0)
+
+        all_x.extend(X)
+        # next() works on Python 2 and 3; the .next() method is Python 2 only
+        colour = next(colours)
+
+        if X:
+            pylab.plot(X, Y, c=colour)
+            if have_spread:
+                pylab.errorbar(X, Y, yerr=[below, above], c=colour, label=variant, fmt='o')
+            else:
+                pylab.scatter(X, Y, c=colour, label=variant, edgecolors='none')
+
+    if not all_x:
+        # Every record was past the size limit: nothing to show
+        return False
+
+    pylab.legend(loc='upper left', ncol=3, prop={'size': 'small'})
+    pylab.grid()
+    pylab.title('%(function)s of %(aalignment)s byte aligned blocks' % locals())
+    pylab.xlabel('Size (B)')
+    pylab.ylabel('Rate (MB/s)')
+
+    # Figure out how high the range goes
+    top = max(all_x)
+
+    power = int(round(math.log(top) / math.log(2)))
+
+    pylab.semilogx()
+
+    pylab.axes().set_xticks([2**x for x in range(0, power+1)])
+    pylab.axes().set_xticklabels([pretty_kb(2**x) for x in range(0, power+1)])
+    pylab.xlim(0, top)
+    pylab.ylim(0, pylab.ylim()[1])
+    return True
+
+def main():
+    """Save a size plot per routine and alignment, at two figure scales."""
+    data = libplot.parse()
+
+    routines = libplot.unique(data, 'function')
+    pairs = libplot.unique(data, ('src_alignment', 'dst_alignment'))
+
+    for routine in routines:
+        for src, dst in pairs:
+            for zoom in (1, 2.5):
+                # plot() says whether there is anything worth saving
+                if not plot(data, routine, (src, dst), zoom):
+                    continue
+
+                target = 'sizes-%s-%02d-%02d-%.1f.png' % (routine, src, dst, zoom)
+                pylab.savefig(target, dpi=72)
+
+    pylab.show()
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/plot-top.py
+++ b/contrib/cortex-strings/scripts/plot-top.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+"""Plot the performance of different variants of the string routines
+for one size.
+"""
+
+import libplot
+
+import pylab
+
+
+def plot(records, bytes):
+    """Draw a bar chart of all routines at one block size.
+
+    There is one group of bars per routine and one bar per variant.
+    Only records with a source alignment of 8 are used.  The height is
+    the mean rate in MB/s, or 0 where a variant has no matching record.
+    The figure is saved as top-<bytes>.png.
+    """
+    records = [x for x in records if x.bytes==bytes]
+
+    variants = libplot.unique(records, 'variant', prefer='this')
+    functions = libplot.unique(records, 'function')
+
+    X = pylab.arange(len(functions))
+    # Leave one bar width free between the groups
+    width = 1.0/(len(variants)+1)
+
+    colours = libplot.make_colours()
+
+    pylab.figure(1).set_size_inches((16, 12))
+    pylab.clf()
+
+    for i, variant in enumerate(variants):
+        heights = []
+
+        for function in functions:
+            matches = [x for x in records if x.variant==variant and x.function==function and x.src_alignment==8]
+
+            if matches:
+                vals = [match.bytes*match.loops/match.elapsed/(1024*1024) for
+                        match in matches]
+                mean = sum(vals)/len(vals)
+                heights.append(mean)
+            else:
+                heights.append(0)
+
+        # next() works on Python 2 and 3; the .next() method is Python 2 only
+        pylab.bar(X+i*width, heights, width, color=next(colours), label=variant)
+
+    axes = pylab.axes()
+    axes.set_xticklabels(functions)
+    axes.set_xticks(X + 0.5)
+
+    pylab.title('Performance of different variants for %d byte blocks' % bytes)
+    pylab.ylabel('Rate (MB/s)')
+    pylab.legend(loc='upper left', ncol=3)
+    pylab.grid()
+    pylab.savefig('top-%06d.png' % bytes, dpi=72)
+
+def main():
+    """Plot one chart for every block size found in the input."""
+    data = libplot.parse()
+    sizes = libplot.unique(data, 'bytes')
+
+    for size in sizes:
+        plot(data, size)
+
+    pylab.show()
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/plot.py
+++ b/contrib/cortex-strings/scripts/plot.py
@ -0,0 +1,123 @@
+"""Plot the results for each test.  Spits out a set of images into the
+current directory.
+"""
+
+import libplot
+
+import fileinput
+import collections
+import pprint
+
+import pylab
+
+# One benchmark result.  parse() fills the first nine fields from a
+# ':'-separated input row and computes the last three (time, bytes, rate).
+Record = collections.namedtuple('Record', 'variant test size loops src_alignment dst_alignment run_id rawtime comment time bytes rate')
+
+def unique(rows, name):
+    """Return the distinct values of attribute `name` found on the
+    items of `rows`, sorted in ascending order.
+    """
+    seen = set()
+    for row in rows:
+        seen.add(getattr(row, name))
+    return sorted(seen)
+
+def to_float(v):
+    """Convert a string into a better type.
+
+    Text containing a '.' is converted with float(), anything else with
+    int().  A value that cannot be converted is returned unchanged.
+
+    >>> to_float('foo')
+    'foo'
+    >>> to_float('1.23')
+    1.23
+    >>> to_float('45')
+    45
+    """
+    try:
+        if '.' in v:
+            return float(v)
+        else:
+            return int(v)
+    except (TypeError, ValueError):
+        # Only conversion failures mean "leave the value alone"; a bare
+        # except would also swallow KeyboardInterrupt and SystemExit.
+        return v
+
+def parse():
+    """Read ':'-separated benchmark rows via fileinput and return them as
+    a list of Records.
+
+    A straight line is fitted to the (loops, rawtime) points of the
+    'bounce' test to estimate the per-call overhead.  For every row that
+    overhead is subtracted from rawtime to give `time`; `bytes` is
+    size * loops and `rate` is bytes / time.
+    """
+    # Split the input up.  Blank lines (such as a trailing newline) are
+    # skipped: they would give a one-field row that Record() rejects.
+    rows = [x.strip().split(':') for x in fileinput.input() if x.strip()]
+    # Automatically turn numbers into the base type
+    rows = [[to_float(y) for y in x] for x in rows]
+
+    # Scan once to calculate the overhead
+    r = [Record(*(x + [0, 0, 0])) for x in rows]
+    bounces = pylab.array([(x.loops, x.rawtime) for x in r if x.test == 'bounce'])
+    fit = pylab.polyfit(bounces[:,0], bounces[:,1], 1)
+
+    records = []
+
+    for row in rows:
+        # Make a dummy record so we can use the names
+        r1 = Record(*(row + [0, 0, 0]))
+
+        bytes = r1.size * r1.loops
+        # Calculate the bounce time
+        delta = pylab.polyval(fit, [r1.loops])
+        time = r1.rawtime - delta
+        rate = bytes / time
+
+        records.append(Record(*(row + [time, bytes, rate])))
+
+    return records
+
+def plot(records, field, scale, ylabel):
+    """Plot `field` against block size for every test and save the charts.
+
+    records -- list of Records as returned by parse().
+    field   -- name of the Record attribute to plot on the y axis.
+    scale   -- divisor applied to the y values before plotting.
+    ylabel  -- label for the y axis.
+
+    For each test one line per variant is drawn, and the figure is saved
+    twice: '<test>-<field>.png' and, after switching the x axis to log
+    base 2, '<test>-<field>-semilog.png'.
+    """
+    variants = unique(records, 'variant')
+    tests = unique(records, 'test')
+
+    colours = libplot.make_colours()
+
+    # A little hack.  We want the 'all' record to be drawn last so
+    # that it's obvious on the graph.  Assume that no tests come
+    # before it alphabetically
+    variants.reverse()
+
+    for test in tests:
+        for variant in variants:
+            v = [x for x in records if x.test==test and x.variant==variant]
+            v.sort(key=lambda x: x.size)
+            V = pylab.array([(x.size, getattr(x, field)) for x in v])
+
+            # Ensure our results appear
+            order = 1 if variant == 'this' else 0
+
+            try:
+                # A little hack.  We want the 'all' to be obvious on
+                # the graph
+                if variant == 'all':
+                    pylab.scatter(V[:,0], V[:,1]/scale, label=variant)
+                    pylab.plot(V[:,0], V[:,1]/scale)
+                else:
+                    # next() is valid on Python 2.6+ and Python 3.
+                    pylab.plot(V[:,0], V[:,1]/scale, label=variant,
+                            zorder=order, c = next(colours))
+
+            except Exception as ex:
+                # michaelh1 likes to run this script while the test is
+                # still running which can lead to bad data
+                print('%s on %s of %s' % (ex, variant, test))
+
+        pylab.legend(loc='lower right', ncol=2, prop={'size': 'small'})
+        pylab.xlabel('Block size (B)')
+        pylab.ylabel(ylabel)
+        pylab.title('%s %s' % (test, field))
+        pylab.grid()
+
+        pylab.savefig('%s-%s.png' % (test, field), dpi=100)
+        pylab.semilogx(basex=2)
+        pylab.savefig('%s-%s-semilog.png' % (test, field), dpi=100)
+        pylab.clf()
+
+def test():
+    """Run the doctests embedded in this module's docstrings."""
+    from doctest import testmod
+    testmod()
+
+def main():
+    """Parse the results and emit the rate and total-time charts."""
+    parsed = parse()
+
+    charts = (
+        ('rate', 1024**2, 'Rate (MB/s)'),
+        ('time', 1, 'Total time (s)'),
+    )
+    for field, scale, ylabel in charts:
+        plot(parsed, field, scale, ylabel)
+
+if __name__ == '__main__':
+    main()
--- a/contrib/cortex-strings/scripts/trim.sh
+++ b/contrib/cortex-strings/scripts/trim.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+#
+# Trims the whitespace from around any given images
+#
+
+# Quote "$@" and "$i" so that file names containing spaces or glob
+# characters reach convert and mv as single, unmodified arguments.
+for i in "$@"; do
+    convert "$i" -bordercolor white -border 1x1 -trim +repage -alpha off +dither -colors 32 "PNG8:next-$i"
+    mv "next-$i" "$i"
+done
--- a/contrib/cortex-strings/src/aarch64/memchr.S
+++ b/contrib/cortex-strings/src/aarch64/memchr.S
@ -0,0 +1,172 @@
+/*
+ * memchr - find a character in a memory zone
+ *
+ * Copyright (c) 2014, ARM Limited
+ * All rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the company nor the names of its contributors
+ *       may be used to endorse or promote products derived from this
+ *       software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+#define cntin		x2
+
+#define result		x0
+
+#define src		x3
+#define	tmp		x4
+#define wtmp2		w5
+#define synd		x6
+#define soff		x9
+#define cntrem		x10
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_chr1	v3
+#define vhas_chr2	v4
+#define vrepmask	v5
+#define vend		v6
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+ * per byte. For each tuple, bit 0 is set if the relevant byte matched the
+ * requested character and bit 1 is not used (faster than using a 32bit
+ * syndrome). Since the bits in the syndrome reflect exactly the order in which
+ * things occur in the original string, counting trailing zeros allows us to
+ * identify exactly which byte has matched.
+ */
+
+/*
+ * def_fn f [p2align=N]: switch to .text, apply .p2align N (default 0)
+ * and emit the global, function-typed label \f.
+ */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/*
+ * void *memchr(const void *s, int c, size_t n)
+ *
+ * In:    x0 (srcin) = s, w1 (chrin) = c, x2 (cntin) = n
+ * Out:   x0 (result) = address of the first matching byte, or NULL
+ * Uses:  x3, x4, w5, x6, x9, x10, v0-v6 and the condition flags
+ */
+def_fn memchr
+	/* Do not dereference srcin if no bytes to compare.  */
+	cbz	cntin, .Lzero_length
+	/*
+	 * Magic constant 0x40100401 allows us to identify which lane matches
+	 * the requested byte.
+	 */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	/* Work with aligned 32-byte chunks */
+	bic	src, srcin, #31
+	dup	vrepmask.4s, wtmp2
+	/* soff = offset of srcin inside its chunk; Z is set when aligned */
+	ands	soff, srcin, #31
+	/* cntrem = n % 32, used later to mask the final chunk */
+	and	cntrem, cntin, #31
+	b.eq	.Lloop
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	/*
+	 * cntin -= (32 - soff), i.e. the bytes left after this chunk.  The
+	 * flags set here are consumed by the b.ls below; nothing in between
+	 * modifies them.
+	 */
+	sub	tmp, soff, #32
+	adds	cntin, cntin, tmp
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Clear the soff*2 lower bits */
+	lsl	tmp, soff, #1
+	lsr	synd, synd, tmp
+	lsl	synd, synd, tmp
+	/* The first block can also be the last */
+	b.ls	.Lmasklast
+	/* Have we found something already? */
+	cbnz	synd, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	/* Flags from this subs are tested by b.ls here and b.hi in .Lend */
+	subs	cntin, cntin, #32
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* If we're out of data we finish regardless of the result */
+	b.ls	.Lend
+	/* Use a fast check for the termination condition */
+	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend.2d, vend.2d, vend.2d
+	mov	synd, vend.2d[0]
+	/* We're not out of data, loop if we haven't found the character */
+	cbz	synd, .Lloop
+
+.Lend:
+	/* Termination condition found, let's calculate the syndrome value */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
+	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
+	mov	synd, vend.2d[0]
+	/* Only do the clear for the last possible block */
+	b.hi	.Ltail
+
+.Lmasklast:
+	/*
+	 * Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits.  Register
+	 * shifts take the amount modulo 64, so when the remainder is 0 (the
+	 * whole chunk is valid) the shift of 64 clears nothing.
+	 */
+	add	tmp, cntrem, soff
+	and	tmp, tmp, #31
+	sub	tmp, tmp, #32
+	neg	tmp, tmp, lsl #1
+	lsl	synd, synd, tmp
+	lsr	synd, synd, tmp
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Check that we have found a character */
+	cmp	synd, #0
+	/* And count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result (two syndrome bits per byte) */
+	add	result, src, synd, lsr #1
+	/* Select result or NULL */
+	csel	result, xzr, result, eq
+	ret
+
+.Lzero_length:
+	mov	result, #0
+	ret
+
+	.size	memchr, . - memchr
--- a/contrib/cortex-strings/src/aarch64/memcmp.S
+++ b/contrib/cortex-strings/src/aarch64/memcmp.S
@ -0,0 +1,162 @@
+/* memcmp - compare memory
+
+   Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* def_fn f [p2align=N]: switch to .text, apply .p2align N (default 0)
+   and emit the global, function-typed label \f.  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define has_nul		x5
+#define diff		x6
+#define endloop		x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define pos		x11
+#define limit_wd	x12
+#define mask		x13
+
+/* int memcmp(const void *s1, const void *s2, size_t n)
+
+   In:    x0 (src1) = s1, x1 (src2) = s2, x2 (limit) = n
+   Out:   x0 (result) = 0 if equal, otherwise the difference between the
+	  first pair of differing bytes, compared as unsigned values
+   Uses:  x0-x4, x6-x9, x11-x13 and the condition flags.  */
+def_fn memcmp p2align=6
+	cbz	limit, .Lret0
+	/* Byte loop if the two pointers differ in their low three bits.  */
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	b.ne	.Lmisaligned8
+	/* Same alignment, but not on an 8-byte boundary.  */
+	ands	tmp1, src1, #7
+	b.ne	.Lmutual_align
+	/* limit_wd = number of Dwords to examine, rounded up.  */
+	add	limit_wd, limit, #7
+	lsr	limit_wd, limit_wd, #3
+	/* Start of performance-critical section  -- one 64B cache line.  */
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
+	cbz	endloop, .Lloop_aligned
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+	/* Not reached the limit, must have found a diff.  */
+	cbnz	limit_wd, .Lnot_limit
+
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lnot_limit
+
+	/* The last Dword is only partly valid: build a mask covering the
+	   bytes past the limit, clear them in both data words and flag them
+	   in DIFF so they mark the end of the significant data.  */
+	lsl	limit, limit, #3	/* Bytes -> bits.  */
+	mov	mask, #~0
+#ifdef __AARCH64EB__
+	lsr	mask, mask, limit
+#else
+	lsl	mask, mask, limit
+#endif
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	orr	diff, diff, mask
+.Lnot_limit:
+
+#ifndef	__AARCH64EB__
+	/* Little-endian: byte-reverse so the earliest byte is at the MSB.  */
+	rev	diff, diff
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	/* The MS-non-zero bit of DIFF marks either the first bit
+	   that is different, or the end of the significant data.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, diff
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+.Lmutual_align:
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	add	limit_wd, limit, #7
+	/* Force the bytes before the start point to be equal (all ones).  */
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	lsr	limit_wd, limit_wd, #3
+	b	.Lstart_realigned
+
+.Lret0:
+	mov	result, #0
+	ret
+
+	.p2align 6
+.Lmisaligned8:
+	/* Bias the count by one so that the subs in the loop borrows
+	   (clears C) while the last byte is being processed.  */
+	sub	limit, limit, #1
+1:
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+	/* More bytes left (C set): compare them.  Otherwise force NE so the
+	   loop exits.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	1b
+	sub	result, data1, data2
+	ret
+	.size memcmp, . - memcmp
--- a/contrib/cortex-strings/src/aarch64/memcpy.S
+++ b/contrib/cortex-strings/src/aarch64/memcpy.S
@ -0,0 +1,225 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x9
+
+#define L(l) .L ## l
+
+/* def_fn f [p2align=N]: switch to .text, apply .p2align N (default 0)
+   and emit the global, function-typed label \f.  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.
+*/
+
+/* void *memcpy(void *dst, const void *src, size_t n)
+
+   In:    x0 (dstin) = dst, x1 (src) = src, x2 (count) = n
+   Out:   x0 = dst (dstin is never written)
+   Uses:  x1-x13 and the condition flags.
+
+   E_l/E_h/F_l/F_h are aliases of src/count/srcend/dst, and tmp1 shares
+   x9 with B_h, so loads into those names must come after the last use
+   of the register they overwrite.  */
+def_fn memcpy p2align=6
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	/* Bits 6 and 5 of count - 1 select the 65..96 and 33..64 cases.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	/* 8..16 bytes: two possibly overlapping 8-byte moves.  */
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	/* 4..7 bytes: two possibly overlapping 4-byte moves.  */
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	/* These two loads overwrite src, count, srcend and dst.  */
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	/* Move src back by the same amount dst was rounded down.  */
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	2f
+1:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+2:
+	/* This load overwrites src and count, which are no longer needed.  */
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.size	memcpy, . - memcpy
--- a/contrib/cortex-strings/src/aarch64/memmove.S
+++ b/contrib/cortex-strings/src/aarch64/memmove.S
@ -0,0 +1,150 @@
+/* Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ */
+
+/* def_fn f [p2align=N]: switch to .text, apply .p2align N (default 0)
+   and emit the global, function-typed label \f.  The alignment may also
+   be passed positionally, as in "def_fn memmove, 6".  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* Parameters and result.  */
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define srcend	x3
+#define dstend	x4
+#define tmp1	x5
+#define A_l	x6
+#define A_h	x7
+#define B_l	x8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	count
+#define E_h	tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+   Larger backwards copies are also handled by memcpy. The only remaining
+   case is forward large copies.  The destination is aligned, and an
+   unrolled loop processes 64 bytes per iteration.
+*/
+
+/* void *memmove(void *dst, const void *src, size_t n)
+
+   In:    x0 (dstin) = dst, x1 (src) = src, x2 (count) = n
+   Out:   x0 = dst (dstin is never written)
+   Uses:  x2-x13 and the condition flags.
+
+   E_l and E_h are aliases of count and tmp1, so they may only be loaded
+   once those values are dead.  */
+def_fn memmove, 6
+	/* Tail-call memcpy when count <= 96 (the ccmp then forces C set) or
+	   when the unsigned distance dst - src is at least count.  */
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.hs	memcpy
+
+	/* dst == src: nothing to move.  */
+	cbz	tmp1, 3f
+	add	dstend, dstin, count
+	add	srcend, src, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	/* Step srcend and count back by the bytes dstend will be rounded down.  */
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+	nop
+1:
+	/* Copy downwards: store the 64 bytes already loaded while loading
+	   the next 64 below them.  */
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	/* This load overwrites count and tmp1, which are no longer needed.  */
+	ldp	E_l, E_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	E_l, E_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+	.size	memmove, . - memmove
--- a/contrib/cortex-strings/src/aarch64/memset.S
+++ b/contrib/cortex-strings/src/aarch64/memset.S
@ -0,0 +1,235 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses
+ *
+ */
+
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
+/* def_fn f [p2align=N]: switch to .text, apply .p2align N (default 0)
+   and emit the global, function-typed label \f.  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* void *memset(void *dst, int c, size_t n)
+
+   In:    x0 (dstin) = dst, w1 (valw) = c, x2 (count) = n
+   Out:   x0 = dst (dstin is never written)
+   Uses:  x1-x7, v0 and the condition flags.  */
+def_fn memset p2align=6
+
+	/* v0 = the fill byte replicated into all 16 lanes.  */
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	/* val = the fill byte replicated into all 8 bytes.  */
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	/* 8..15 bytes: two possibly overlapping 8-byte stores.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+	/* 4..7 bytes: two possibly overlapping 4-byte stores.  */
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+	/* 0..3 bytes: one byte at the start, plus the last two if count >= 2.  */
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 16..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 3
+	nop
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	/* Try DC ZVA only when count >= 256 and the fill byte is zero; for
+	   count < 256 the ccmp forces NE.  */
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+L(tail64):
+	subs	count, count, 64
+	b.hi	1b
+	/* Finish with 64 bytes ending exactly at dstend.  */
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 3
+L(try_zva):
+	/* DCZID_EL0 bit 4 set means DC ZVA must not be used; the low four
+	   bits hold log2 of the block size in 4-byte words.  */
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 15
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	 L(zva_128)
+
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+L(zva_64):
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 3
+L(zva_128):
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(zva_other):
+	/* zva_len = 4 << tmp1w, the ZVA block size in bytes.  */
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+	/* Fill up to the first ZVA-aligned address with ordinary stores.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+	/* Undo the bias and let the store loop write what is left.  */
+4:	add	count, count, zva_len
+	b	L(tail64)
+
+	.size	memset, . - memset
--- a/contrib/cortex-strings/src/aarch64/strchr.S
+++ b/contrib/cortex-strings/src/aarch64/strchr.S
@ -0,0 +1,159 @@
+/*
+   strchr - find a character in a string
+
+   Copyright (c) 2014, ARM Limited
+   All rights Reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the company nor the names of its contributors
+         may be used to endorse or promote products derived from this
+         software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define	tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask_0	v7
+#define vrepmask_c	v16
+#define vend1		v17
+#define vend2		v18
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character; bit 1 is set
+   iff the relevant byte matched the NUL end of string (we trigger
+   off bit0 for the special case of looking for NUL).  Since the bits
+   in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination, and why.  */
+
+/* def_fn F [P2ALIGN]: begin global function F in .text, aligned to 2^P2ALIGN bytes.  */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn strchr	/* In: srcin (x0) = string, chrin (w1) = character.  Out: result (x0) = first match, or 0 if NUL comes first.  */
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the requested byte.  Magic constant 0x80200802 used
+	   similarly for NUL termination.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin	/* Replicate the low byte of chrin into all 16 lanes.  */
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask_c.4s, wtmp2
+	ands	tmp1, srcin, #31	/* tmp1 = misalignment in bytes; Z set if already aligned.  */
+	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+	b.eq	.Lloop
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * misalignment (two syndrome bits per byte).  */
+	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
+	mov	tmp3, #~0
+	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64 (only the low 64 bits are read below)
+	lsr	tmp1, tmp3, tmp1	/* Shift is taken mod 64: ones in the low 2 * misalignment bits.  */
+
+	mov	tmp3, vend1.2d[0]
+	bic	tmp1, tmp3, tmp1	// Mask padding bits.
+	cbnz	tmp1, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* Use a fast check for the termination condition.  */
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	orr	vend1.16b, vend1.16b, vend2.16b
+	addp	vend1.2d, vend1.2d, vend1.2d	/* Fold the two 64-bit halves into one value.  */
+	mov	tmp1, vend1.2d[0]
+	cbz	tmp1, .Lloop		/* Nothing found in this hunk: keep scanning.  */
+
+	/* Termination condition found.  Now need to establish exactly why
+	   we terminated.  */
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	orr	vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+	addp	vend1.16b, vend1.16b, vend2.16b		// 256->128
+	addp	vend1.16b, vend1.16b, vend2.16b		// 128->64 (only the low 64 bits are read below)
+
+	mov	tmp1, vend1.2d[0]
+.Ltail:
+	/* Count the trailing zeros, by bit reversing...  */
+	rbit	tmp1, tmp1
+	/* Re-bias source: the post-indexed ld1 left src 32 bytes past the hunk.  */
+	sub	src, src, #32
+	clz	tmp1, tmp1	/* And counting the leading zeros.  */
+	/* Tmp1 is even if the target character was found first.  Otherwise
+	   we've found the end of string and we weren't looking for NUL.  */
+	tst	tmp1, #1
+	add	result, src, tmp1, lsr #1	/* Two syndrome bits per byte: byte offset = tmp1 / 2.  */
+	csel	result, result, xzr, eq		/* Odd bit index => NUL came first: return 0.  */
+	ret
+
+	.size	strchr, . - strchr
--- a/contrib/cortex-strings/src/aarch64/strchrnul.S
+++ b/contrib/cortex-strings/src/aarch64/strchrnul.S
@ -0,0 +1,144 @@
+/*
+   strchrnul - find a character or nul in a string
+
+   Copyright (c) 2014, ARM Limited
+   All rights Reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the company nor the names of its contributors
+         may be used to endorse or promote products derived from this
+         software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define	tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask	v7
+#define vend1		v16
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character or nul.  Since the
+   bits in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination.  */
+
+/* def_fn F [P2ALIGN]: begin global function F in .text, aligned to 2^P2ALIGN bytes.  */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+def_fn strchrnul	/* In: srcin (x0) = string, chrin (w1) = character.  Out: result (x0) = first match or the terminating NUL.  */
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the termination condition.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin	/* Replicate the low byte of chrin into all 16 lanes.  */
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask.4s, wtmp2
+	ands	tmp1, srcin, #31	/* tmp1 = misalignment in bytes; Z set if already aligned.  */
+	b.eq	.Lloop
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b	/* Character match and NUL are treated alike.  */
+	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * misalignment (two syndrome bits per byte).  */
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	mov	tmp3, #~0
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+	lsr	tmp1, tmp3, tmp1	/* Shift is taken mod 64: ones in the low 2 * misalignment bits.  */
+
+	mov	tmp3, vend1.2d[0]
+	bic	tmp1, tmp3, tmp1	// Mask padding bits.
+	cbnz	tmp1, .Ltail
+
+.Lloop:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	/* Use a fast check for the termination condition.  */
+	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+	addp	vend1.2d, vend1.2d, vend1.2d	/* Fold the two 64-bit halves into one value.  */
+	mov	tmp1, vend1.2d[0]
+	cbz	tmp1, .Lloop		/* Nothing found in this hunk: keep scanning.  */
+
+	/* Termination condition found.  Now need to establish exactly where
+	   in the hunk it occurred.  */
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
+	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
+
+	mov	tmp1, vend1.2d[0]
+.Ltail:
+	/* Count the trailing zeros, by bit reversing...  */
+	rbit	tmp1, tmp1
+	/* Re-bias source: the post-indexed ld1 left src 32 bytes past the hunk.  */
+	sub	src, src, #32
+	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
+	/* tmp1 is twice the offset into the fragment.  */
+	add	result, src, tmp1, lsr #1
+	ret
+
+	.size	strchrnul, . - strchrnul
--- a/contrib/cortex-strings/src/aarch64/strcmp.S
+++ b/contrib/cortex-strings/src/aarch64/strcmp.S
@ -0,0 +1,166 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+	.macro def_fn f p2align=0	/* Begin global function F in .text, aligned to 2^P2ALIGN bytes.  */
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x2
+#define data1w		w2
+#define data2		x3
+#define data2w		w3
+#define has_nul		x4
+#define diff		x5
+#define syndrome	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define zeroones	x10
+#define pos		x11
+
+	/* Start of performance-critical section  -- one 64B cache line.  */
+def_fn strcmp p2align=6	/* In: src1 (x0), src2 (x1) = strings.  Out: result (x0) = difference of the first differing bytes, 0 if equal.  */
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7		/* Do the low three address bits agree?  */
+	b.ne	.Lmisaligned8		/* No: the pointers can never both be 8-byte aligned.  */
+	ands	tmp1, src1, #7		/* tmp1 = bytes past the previous 8-byte boundary.  */
+	b.ne	.Lmutual_align
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, .Lloop_aligned
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+#ifndef	__AARCH64EB__
+	rev	syndrome, syndrome	/* Byte-reverse so the earliest string byte is most significant.  */
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne
+	cneg	result, result, lo	/* -1 if data1 < data2 (unsigned), else +1.  */
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+.Lmutual_align:
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2	/* Force the leading bytes to 0xff in both words: equal and non-NUL.  */
+	orr	data2, data2, tmp2
+	b	.Lstart_realigned
+
+.Lmisaligned8:
+	/* Byte-at-a-time fallback.  We can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1		/* Carry set iff the src1 byte is not NUL.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	.Lmisaligned8		/* Loop while non-NUL and equal.  */
+	sub	result, data1, data2
+	ret	/* NOTE(review): unlike the sibling routines, no ".size strcmp, . - strcmp" follows; consider adding one.  */
--- a/contrib/cortex-strings/src/aarch64/strcpy.S
+++ b/contrib/cortex-strings/src/aarch64/strcpy.S
@ -0,0 +1,336 @@
+/*
+   strcpy/stpcpy - copy a string returning pointer to start/end.
+
+   Copyright (c) 2013, 2014, 2015 ARM Ltd.
+   All Rights Reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the company nor the names of its contributors
+         may be used to endorse or promote products derived from this
+         software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
+
+   To test the page crossing code path more thoroughly, compile with
+   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
+   entry path.  This option is not intended for production use.  */
+
+/* Arguments and results.  */
+#define dstin		x0
+#define srcin		x1
+
+/* Locals and temporaries.  */
+#define src		x2
+#define dst		x3
+#define data1		x4
+#define data1w		w4
+#define data2		x5
+#define data2w		w5
+#define has_nul1	x6
+#define has_nul2	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define tmp4		x11
+#define zeroones	x12
+#define data1a		x13
+#define data2a		x14
+#define pos		x15
+#define len		x16
+#define to_align	x17
+
+#ifdef BUILD_STPCPY
+#define STRCPY stpcpy
+#else
+#define STRCPY strcpy
+#endif
+
+	.macro def_fn f p2align=0	/* Begin global function F in .text, aligned to 2^P2ALIGN bytes.  */
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
+	   page size check for crossing this boundary on entry and if we
+	   do not, then we can short-circuit much of the entry code.  We
+	   expect early page-crossing strings to be rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
+	   predictable, even with random strings.
+
+	   We don't bother checking for larger page sizes, the cost of setting
+	   up the correct page size is just not worth the extra gain from
+	   a small reduction in the cases taking the slow path.  Note that
+	   we only care about whether the first fetch, which may be
+	   misaligned, crosses a page boundary - after that we move to aligned
+	   fetches for the remainder of the string.  */
+
+#ifdef STRCPY_TEST_PAGE_CROSS
+	/* Make everything that isn't Qword aligned look like a page cross.  */
+#define MIN_PAGE_P2 4
+#else
+#define MIN_PAGE_P2 12
+#endif
+
+#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+
+def_fn STRCPY p2align=6	/* In: dstin (x0), srcin (x1).  Out: x0 = dstin (strcpy), or the address of the copied NUL (BUILD_STPCPY).  */
+	/* For moderately short strings, the fastest way to do the copy is to
+	   calculate the length of the string in the same way as strlen, then
+	   essentially do a memcpy of the result.  This avoids the need for
+	   multiple byte copies and further means that by the time we
+	   reach the bulk copy loop we know we can always use DWord
+	   accesses.  We expect strcpy to rarely be called repeatedly
+	   with the same source string, so branch prediction is likely to
+	   always be difficult - we mitigate against this by preferring
+	   conditional select operations over branches whenever this is
+	   feasible.  */
+	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)	/* Offset of srcin within its (minimum-size) page.  */
+	mov	zeroones, #REP8_01
+	and	to_align, srcin, #15			/* Bytes past the previous 16-byte boundary.  */
+	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
+	neg	tmp1, to_align
+	/* The first fetch will straddle a (possible) page boundary iff
+	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
+	   aligned string will never fail the page align check, so will
+	   always take the fast path.  */
+	b.gt	.Lpage_cross
+
+.Lpage_cross_ok:
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* Because we expect the end to be found within 16 characters
+	   (profiling shows this is the most common case), it's worth
+	   swapping the bytes now to save having to recalculate the
+	   termination syndrome later.  We preserve data1 and data2
+	   so that we can re-use the values later on.  */
+	rev	tmp2, data1
+	sub	tmp1, tmp2, zeroones
+	orr	tmp2, tmp2, #REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	b.ne	.Lfp_le8
+	rev	tmp4, data2
+	sub	tmp3, tmp4, zeroones
+	orr	tmp4, tmp4, #REP8_7f
+#else
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	b.ne	.Lfp_le8
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+#endif
+	bics	has_nul2, tmp3, tmp4
+	b.eq	.Lbulk_entry		/* No NUL in the first 16 bytes.  */
+
+	/* The string is short (<=16 bytes).  We don't know exactly how
+	   short though, yet.  Work out the exact length so that we can
+	   quickly select the optimal copy strategy.  */
+.Lfp_gt8:
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2		/* pos = 8 * (index of the NUL byte within data2).  */
+	mov	tmp2, #56
+	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
+	sub	pos, tmp2, pos
+#ifdef __AARCH64EB__
+	lsr	data2, data2, pos
+#else
+	lsl	data2, data2, pos
+#endif
+	str	data2, [dst, #1]	/* Overlapping store: the 8 bytes ending at the NUL.  */
+	str	data1, [dstin]
+#ifdef BUILD_STPCPY
+	add	dstin, dst, #8
+#endif
+	ret
+
+.Lfp_le8:
+	rev	has_nul1, has_nul1
+	clz	pos, has_nul1		/* pos = 8 * (index of the NUL byte within data1).  */
+	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
+	subs	tmp2, pos, #24			/* Pos in bits. */
+	b.lt	.Lfp_lt4		/* NUL is within the first three bytes.  */
+#ifdef __AARCH64EB__
+	mov	tmp2, #56
+	sub	pos, tmp2, pos
+	lsr	data2, data1, pos
+	lsr	data1, data1, #32
+#else
+	lsr	data2, data1, tmp2
+#endif
+	/* 4->7 bytes to copy.  */
+	str	data2w, [dst, #-3]	/* Overlapping store: the 4 bytes ending at the NUL.  */
+	str	data1w, [dstin]
+#ifdef BUILD_STPCPY
+	mov	dstin, dst
+#endif
+	ret
+.Lfp_lt4:
+	cbz	pos, .Lfp_lt2		/* NUL is the first byte: empty string.  */
+	/* 2->3 bytes to copy.  */
+#ifdef __AARCH64EB__
+	lsr	data1, data1, #48
+#endif
+	strh	data1w, [dstin]
+	/* Fall-through, one byte (max) to go.  */
+.Lfp_lt2:
+	/* Null-terminated string.  Last character must be zero!  */
+	strb	wzr, [dst]
+#ifdef BUILD_STPCPY
+	mov	dstin, dst
+#endif
+	ret
+
+	.p2align 6
+	/* Aligning here ensures that the entry code and main loop all lies
+	   within one 64-byte cache line.  */
+.Lbulk_entry:
+	sub	to_align, to_align, #16	/* to_align = (srcin & 15) - 16, i.e. negative.  */
+	stp	data1, data2, [dstin]	/* Store the first 16 bytes, already known to be NUL-free.  */
+	sub	src, srcin, to_align	/* src = next 16-byte boundary strictly above srcin.  */
+	sub	dst, dstin, to_align	/* Advance dst by the same distance.  */
+	b	.Lentry_no_page_cross
+
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+.Lmain_loop:
+	stp	data1, data2, [dst], #16
+.Lentry_no_page_cross:
+	ldp	data1, data2, [src], #16
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	.Lmain_loop		/* Neither dword contains a NUL.  */
+
+	/* Since we know we are copying at least 16 bytes, the fastest way
+	   to deal with the tail is to determine the location of the
+	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
+	cmp	has_nul1, #0		/* These flags stay live until the csel on pos below.  */
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, ne
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, ne
+#endif
+	rev	has_nul1, has_nul1
+	clz	pos, has_nul1
+	add	tmp1, pos, #72		/* NUL in data2: 64 extra bits, plus 8 to step past the NUL.  */
+	add	pos, pos, #8		/* NUL in data1: 8 bits to step past the NUL.  */
+	csel	pos, pos, tmp1, ne
+	add	src, src, pos, lsr #3
+	add	dst, dst, pos, lsr #3
+	ldp	data1, data2, [src, #-32]	/* src was post-incremented by 16: the 16 bytes ending at the NUL.  */
+	stp	data1, data2, [dst, #-16]
+#ifdef BUILD_STPCPY
+	sub	dstin, dst, #1
+#endif
+	ret
+
+.Lpage_cross:
+	bic	src, srcin, #15
+	/* Start by loading two words at [srcin & ~15], then forcing the
+	   bytes that precede srcin to 0xff.  This means they never look
+	   like termination bytes.  */
+	ldp	data1, data2, [src]
+	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
+	tst	to_align, #7
+	csetm	tmp2, ne	/* All-ones mask unless to_align is a multiple of 8.  */
+#ifdef __AARCH64EB__
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+	cmp	to_align, #8
+	csinv	data1, data1, xzr, lt	/* to_align >= 8: all of data1 precedes srcin, force it to ones.  */
+	csel	data2, data2, data2a, lt	/* to_align >= 8: the partial mask applies to data2 instead.  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bics	has_nul2, tmp3, tmp4
+	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
+	b.eq	.Lpage_cross_ok		/* No NUL before the boundary: the unaligned fetch is safe.  */
+	/* We now need to make data1 and data2 look like they've been
+	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
+	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
+	neg	tmp2, to_align, lsl #3
+#ifdef __AARCH64EB__
+	lsl	data1a, data1, tmp1
+	lsr	tmp4, data2, tmp2
+	lsl	data2, data2, tmp1
+	orr	tmp4, tmp4, data1a
+	cmp	to_align, #8
+	csel	data1, tmp4, data2, lt
+	rev	tmp2, data1
+	rev	tmp4, data2
+	sub	tmp1, tmp2, zeroones
+	orr	tmp2, tmp2, #REP8_7f
+	sub	tmp3, tmp4, zeroones
+	orr	tmp4, tmp4, #REP8_7f
+#else
+	lsr	data1a, data1, tmp1
+	lsl	tmp4, data2, tmp2
+	lsr	data2, data2, tmp1
+	orr	tmp4, tmp4, data1a
+	cmp	to_align, #8
+	csel	data1, tmp4, data2, lt
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+#endif
+	bic	has_nul1, tmp1, tmp2
+	cbnz	has_nul1, .Lfp_le8
+	bic	has_nul2, tmp3, tmp4
+	b	.Lfp_gt8
+
+	.size	STRCPY, . - STRCPY
--- a/contrib/cortex-strings/src/aarch64/strlen.S
+++ b/contrib/cortex-strings/src/aarch64/strlen.S
@ -0,0 +1,233 @@
+/* Copyright (c) 2013-2015, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+	 notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+	 notice, this list of conditions and the following disclaimer in the
+	 documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+	 names of its contributors may be used to endorse or promote products
+	 derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ */
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.	 */
+
+/* Arguments and results.  */
+#define srcin		x0	/* In: address of the string.  */
+#define len		x0	/* Out: result; shares x0 with srcin.  */
+
+/* Locals and temporaries.  */
+#define src		x1	/* Base address used by the aligned loads.  */
+#define data1		x2	/* First Dword of the current 16-byte block.  */
+#define data2		x3	/* Second Dword of the current 16-byte block.  */
+#define has_nul1	x4	/* NUL syndrome of data1 (same register as tmp1).  */
+#define has_nul2	x5	/* NUL syndrome of data2 (same register as tmp2).  */
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8	/* Loaded with REP8_01 at function entry.  */
+
+#define L(l) .L ## l
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align	/* Align the entry point to 2^p2align bytes.  */
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word. A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.	*/
+
+#define REP8_01 0x0101010101010101	/* 0x01 repeated in all 8 bytes.  */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f repeated in all 8 bytes.  */
+#define REP8_80 0x8080808080808080	/* 0x80 repeated in all 8 bytes.  */
+
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15	/* Test only: the entry check in strlen then always branches.  */
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
+def_fn strlen p2align=6
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1	/* Offset of srcin within its page.  */
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)	/* A 16-byte load at srcin would cross a page.  */
+	ldp	data1, data2, [srcin]	/* First 16 bytes; srcin may be unaligned.  */
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	sub	tmp1, data1, zeroones	/* The (X - 1) term of the NUL test.  */
+	orr	tmp2, data1, REP8_7f	/* The (X | 0x7f) term of the NUL test.  */
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2	/* Z is set when data1 holds no NUL.  */
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq	/* EQ afterwards only if both syndromes are 0.  */
+	beq	L(main_loop_entry)	/* No NUL in the first 16 bytes.  */
+
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc	/* Syndrome of the word with the first NUL.  */
+	mov	len, 8
+	rev	has_nul1, has_nul1	/* Put the earliest string byte at the MSB end.  */
+	clz	tmp1, has_nul1	/* Bit position of the first NUL marker.  */
+	csel	len, xzr, len, cc	/* Base is 0 if the NUL is in data1, else 8.  */
+	add	len, len, tmp1, lsr 3	/* Add the byte index (bits / 8).  */
+	ret
+
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15	/* Round srcin down to a 16-byte boundary.  */
+	sub	src, src, 16	/* Bias for the pre-indexed load below.  */
+L(main_loop):
+	ldp	data1, data2, [src, 32]!	/* src += 32, then load 16 bytes from src.  */
+.Lpage_cross_entry:
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7	/* Fast check against REP8_01 << 7.  */
+	bne	1f
+	ldp	data1, data2, [src, 16]	/* Next 16 bytes; src itself is not updated.  */
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16	/* Make src address the block that hit.  */
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.	 */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)	/* False hit: no NUL in this block.  */
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
+#endif
+	sub	len, src, srcin	/* Offset of the block at src relative to srcin.  */
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8	/* Offset to use when the NUL is in data2.  */
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc	/* CC (C clear) means the NUL is in data1.  */
+	add	len, len, tmp1, lsr 3	/* Add the byte index within the word.  */
+	ret
+
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!	/* src += 16, then load 16 bytes.  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!	/* Second copy of the check (loop unrolled twice).  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]	/* 16-byte aligned load, so it stays within one page.  */
+	lsl	tmp1, srcin, 3	/* Byte offset -> bit count; the shift uses the low 6 bits.  */
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.	Early bytes are at MSB.	 */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1	/* OR 0x7f into the bytes selected by ~tmp1.  */
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8	/* Bit 3 set: the string starts in the second Dword.  */
+	csel	data1, data1, tmp4, eq	/* If so, data1 becomes all ones ...  */
+	csel	data2, data2, tmp2, eq	/* ... and data2 takes the masked copy.  */
+	b	L(page_cross_entry)
+
+	.size	strlen, . - strlen
--- a/contrib/cortex-strings/src/aarch64/strncmp.S
+++ b/contrib/cortex-strings/src/aarch64/strncmp.S
@ -0,0 +1,222 @@
+/* Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align	/* Align the entry point to 2^p2align bytes.  */
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define REP8_01 0x0101010101010101	/* 0x01 repeated in all 8 bytes.  */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f repeated in all 8 bytes.  */
+#define REP8_80 0x8080808080808080	/* 0x80 repeated in all 8 bytes.  */
+
+/* Parameters and result.  */
+#define src1		x0	/* In: first string.  */
+#define src2		x1	/* In: second string.  */
+#define limit		x2	/* In: maximum number of bytes to compare.  */
+#define result		x0	/* Out: shares x0 with src1.  */
+
+/* Internal variables.  */
+#define data1		x3	/* Current Dword (or byte) from src1.  */
+#define data1w		w3
+#define data2		x4	/* Current Dword (or byte) from src2.  */
+#define data2w		w4
+#define has_nul		x5	/* NUL syndrome of data1.  */
+#define diff		x6	/* data1 ^ data2.  */
+#define syndrome	x7	/* diff | has_nul.  */
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define zeroones	x11	/* Loaded with REP8_01 at function entry.  */
+#define pos		x12	/* Bit position taken from the syndrome.  */
+#define limit_wd	x13	/* Dwords still to compare, minus one.  */
+#define mask		x14	/* Selects the bytes beyond the limit.  */
+#define endloop		x15	/* Non-zero on a difference or on the last Dword.  */
+
+	.text
+	.p2align 6
+	.rep 7
+	nop	/* Pad so that the loop below fits a cache line.  */
+	.endr
+def_fn strncmp
+	cbz	limit, .Lret0	/* A zero limit compares equal.  */
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7	/* Do both pointers have the same offset mod 8?  */
+	b.ne	.Lmisaligned8
+	ands	tmp1, src1, #7	/* tmp1 = bytes past the previous 8-byte boundary.  */
+	b.ne	.Lmutual_align
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* Start of performance-critical section  -- one 64B cache line.  */
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	subs	limit_wd, limit_wd, #1	/* Goes negative on the last Dword.  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
+	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	ccmp	endloop, #0, #0, eq	/* EQ only if no NUL and endloop == 0.  */
+	b.eq	.Lloop_aligned
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+	/* Not reached the limit, must have found the end or a diff.  */
+	tbz	limit_wd, #63, .Lnot_limit	/* Sign bit clear: the count did not run out.  */
+
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lnot_limit
+
+	lsl	limit, limit, #3	/* Bytes -> bits.  */
+	mov	mask, #~0
+#ifdef __AARCH64EB__
+	lsr	mask, mask, limit
+#else
+	lsl	mask, mask, limit
+#endif
+	bic	data1, data1, mask	/* Clear the bytes that lie beyond the limit.  */
+	bic	data2, data2, mask
+
+	/* Make sure that the NUL byte is marked in the syndrome.  */
+	orr	has_nul, has_nul, mask
+
+.Lnot_limit:
+	orr	syndrome, diff, has_nul
+
+#ifndef	__AARCH64EB__
+	rev	syndrome, syndrome
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne	/* 0 when equal, otherwise 1 ...  */
+	cneg	result, result, lo	/* ... negated when data1 is lower (unsigned).  */
+	ret
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   perform a signed 32-bit subtraction.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+#endif
+
+.Lmutual_align:
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.
+	   We also need to adjust the limit calculations, but without
+	   overflowing if the limit is near ULONG_MAX.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	ldr	data1, [src1], #8
+	neg	tmp3, tmp1, lsl #3	/* 64 - bits(bytes beyond align). */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp3	/* Shift (tmp3 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp3	/* Shift (tmp3 & 63).  */
+#endif
+	and	tmp3, limit_wd, #7	/* Low 3 bits of (limit - 1), kept for the carry below.  */
+	lsr	limit_wd, limit_wd, #3
+	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
+	add	limit, limit, tmp1
+	add	tmp3, tmp3, tmp1
+	orr	data1, data1, tmp2	/* Same mask on both words: leading bytes compare equal and non-zero.  */
+	orr	data2, data2, tmp2
+	add	limit_wd, limit_wd, tmp3, lsr #3	/* Carry from the low bits adds one Dword.  */
+	b	.Lstart_realigned
+
+.Lret0:
+	mov	result, #0
+	ret
+
+	.p2align 6
+.Lmisaligned8:
+	sub	limit, limit, #1	/* Pre-bias so the SUBS below borrows on the last permitted byte.  */
+1:
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+	ccmp	data1w, #1, #0, cs	/* NZCV = 0b0000.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	1b
+	sub	result, data1, data2	/* Bytes were zero-extended by LDRB.  */
+	ret
+	.size strncmp, . - strncmp
--- a/contrib/cortex-strings/src/aarch64/strnlen.S
+++ b/contrib/cortex-strings/src/aarch64/strnlen.S
@ -0,0 +1,181 @@
+/* strnlen - calculate the length of a string with limit.
+
+   Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results.  */
+#define srcin		x0	/* In: address of the string.  */
+#define len		x0	/* Out: result; shares x0 with srcin.  */
+#define limit		x1	/* In: maximum length to report.  */
+
+/* Locals and temporaries.  */
+#define src		x2	/* Address used by the aligned 16-byte loads.  */
+#define data1		x3	/* First Dword of the current block.  */
+#define data2		x4	/* Second Dword of the current block.  */
+#define data2a		x5	/* Masked copy of data2 (misaligned entry only).  */
+#define has_nul1	x6	/* NUL syndrome of data1.  */
+#define has_nul2	x7	/* NUL syndrome of data2.  */
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define tmp4		x11
+#define zeroones	x12	/* Loaded with REP8_01 at function entry.  */
+#define pos		x13	/* Bit position of the first NUL marker.  */
+#define limit_wd	x14	/* 16-byte blocks still to scan, minus one.  */
+
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align	/* Align the entry point to 2^p2align bytes.  */
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#define REP8_01 0x0101010101010101	/* 0x01 repeated in all 8 bytes.  */
+#define REP8_7f 0x7f7f7f7f7f7f7f7f	/* 0x7f repeated in all 8 bytes.  */
+#define REP8_80 0x8080808080808080	/* 0x80 repeated in all 8 bytes.  */
+
+	.text
+	.p2align	6
+.Lstart:
+	/* Pre-pad to ensure critical loop begins an icache line.  */
+	.rep 7
+	nop
+	.endr
+	/* Put this code here to avoid wasting more space with pre-padding.  */
+.Lhit_limit:
+	mov	len, limit	/* No NUL found within the limit: return limit.  */
+	ret
+
+def_fn strnlen
+	cbz	limit, .Lhit_limit	/* A zero limit returns 0 without reading memory.  */
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15	/* Round srcin down to a 16-byte boundary.  */
+	ands	tmp1, srcin, #15	/* tmp1 = bytes past that boundary.  */
+	b.ne	.Lmisaligned
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+.Lloop:
+	ldp	data1, data2, [src], #16	/* Load 16 bytes, then src += 16.  */
+.Lrealigned:
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1	/* Goes negative on the last Qword.  */
+	orr	tmp1, has_nul1, has_nul2
+	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
+	b.eq	.Lloop	/* Continue while blocks remain and no NUL was seen.  */
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	orr	tmp1, has_nul1, has_nul2
+	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+	sub	len, src, srcin	/* src already points past the block just tested.  */
+	cbz	has_nul1, .Lnul_in_data2
+#ifdef __AARCH64EB__
+	mov	data2, data1
+#endif
+	sub	len, len, #8	/* The NUL is in data1: step back one more Dword.  */
+	mov	has_nul2, has_nul1
+.Lnul_in_data2:
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	sub	len, len, #8	/* len = offset of the Dword that holds the NUL.  */
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	ret
+
+.Lmisaligned:
+	/* Deal with a partial first word.
+	   We're doing two things in parallel here;
+	   1) Calculate the number of words (but avoiding overflow if
+	      limit is near ULONG_MAX) - to do this we need to work out
+	      limit + tmp1 - 1 as a 65-bit value before shifting it;
+	   2) Load and mask the initial data words - we force the bytes
+	      before the ones we are interested in to 0xff - this ensures
+	      early bytes will not hit any zero detection.  */
+	sub	limit_wd, limit, #1
+	neg	tmp4, tmp1
+	cmp	tmp1, #8	/* Flags are consumed by the CSINV/CSEL at the end.  */
+
+	and	tmp3, limit_wd, #15	/* Low 4 bits of (limit - 1), kept for the carry below.  */
+	lsr	limit_wd, limit_wd, #4
+	mov	tmp2, #~0
+
+	ldp	data1, data2, [src], #16
+	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
+	add	tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
+#endif
+	add	limit_wd, limit_wd, tmp3, lsr #4	/* Carry from the low bits adds one Qword.  */
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	csinv	data1, data1, xzr, le	/* tmp1 > 8: data1 is entirely before the string, so all ones.  */
+	csel	data2, data2, data2a, le	/* tmp1 > 8: use the masked copy of data2.  */
+	b	.Lrealigned
+	.size	strnlen, . - .Lstart	/* Include pre-padding in size.  */
--- a/contrib/cortex-strings/src/arm/memchr.S
+++ b/contrib/cortex-strings/src/arm/memchr.S
@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This memchr routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.   It has a fast path for short sizes, and has
+   an optimised path for large data sets; the worst case is finding the
+   match early in a large data set.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@    Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@    Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@    Removed unneeded cbz from align loop
+
+	.syntax unified
+	.arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+	.thumb_func
+	.align 2
+	.p2align 4,,15
+	.global memchr
+	.type memchr,%function
+memchr:
+	@ r0 = start of memory to scan
+	@ r1 = character to look for
+	@ r2 = length
+	@ returns r0 = pointer to character or NULL if not found
+	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
+
+	cmp	r2,#16		@ If it's short don't bother with anything clever
+	blo	20f		@ unsigned test: the length is a size_t, so 2^31 and above is not short
+
+	tst	r0, #7		@ If it's already aligned skip the next bit
+	beq	10f
+
+	@ Work up to an aligned point
+5:
+	ldrb	r3, [r0],#1
+	subs	r2, r2, #1
+	cmp	r3, r1
+	beq	50f		@ If it matches exit found
+	tst	r0, #7
+	bne	5b		@ If not aligned yet then do next byte
+	
+10:
+	@ At this point, we are aligned, we know we have at least 8 bytes to work with
+	push	{r4,r5,r6,r7}
+	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
+	orr	r1, r1, r1, lsl #16
+	bic	r4, r2, #7	@ Number of double words to work with
+	mvns	r7, #0		@ all F's
+	movs	r3, #0
+	
+15:
+	ldmia	r0!,{r5,r6}
+	subs	r4, r4, #8
+	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
+	eor	r6,r6, r1
+	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	cbnz	r6, 60f
+	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
+
+	pop	{r4,r5,r6,r7}
+	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
+	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
+ 
+20:
+	cbz	r2, 40f		@ 0 length or hit the end already then not found
+
+21:  @ Post aligned section, or just a short call
+	ldrb	r3,[r0],#1
+	subs	r2,r2,#1
+	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
+	cbz	r3, 50f
+	bne	21b		@ on r2 flags
+
+40:
+	movs	r0,#0		@ not found
+	bx	lr
+
+50:
+	subs	r0,r0,#1	@ found
+	bx	lr
+
+60:  @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+	@ r0 points to the start of the double word after the one that was tested
+	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+	cmp	r5, #0
+	itte	eq
+	moveq	r5, r6		@ the end is in the 2nd word
+	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
+	subne	r0,r0,#7	@ or 2nd byte of 1st word
+
+	@ r0 currently points to the 2nd byte of the word containing the hit
+	tst	r5, # CHARTSTMASK(0)	@ 1st character
+	bne	61f
+	adds	r0,r0,#1
+	tst	r5, # CHARTSTMASK(1)	@ 2nd character
+	ittt	eq
+	addeq	r0,r0,#1
+	tsteq	r5, # (3<<15)		@ 3rd character: bit 16 little-endian, bit 15 big-endian
+	@ If not the 3rd must be the last one
+	addeq	r0,r0,#1
+
+61:
+	pop	{r4,r5,r6,r7}
+	subs	r0,r0,#1	@ r0 was left one past the matching byte
+	bx	lr
--- a/contrib/cortex-strings/src/arm/memcpy.S
+++ b/contrib/cortex-strings/src/arm/memcpy.S
@ -0,0 +1,617 @@
+/* Copyright (c) 2013, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
+   of VFP or NEON when built with the appropriate flags.
+
+   Assumptions:
+
+    ARMv6 (ARMv7-a if using Neon)
+    ARM state
+    Unaligned accesses
+
+ */
+
+	.syntax unified
+	/* This implementation requires ARM state.  */
+	.arm
+
+/* Pick the build variant from the compiler's predefined macros.
+   FRAME_SIZE is the number of bytes memcpy reserves below SP when it
+   takes the not-short path.  tmp2 (r10) is always saved at [sp]; the
+   GP-register bulk loops additionally park r4-r9 at [sp, #8] to
+   [sp, #31], which is why the non-NEON variants need 32 bytes.  */
+#ifdef __ARM_NEON__
+
+	.fpu	neon
+	.arch	armv7-a
+# define FRAME_SIZE	4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+	.arch	armv6
+	.fpu	vfpv2
+# define FRAME_SIZE	32
+# define USE_VFP
+
+#else
+	.arch	armv6
+# define FRAME_SIZE    32
+
+#endif
+
+/* Old versions of GAS incorrectly implement the NEON align semantics.  */
+/* ALIGN (addr, align) builds the address-with-alignment operand used by
+   the vst1 instructions below, in whichever spelling the assembler
+   accepts.  */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
+#else
+#define ALIGN(addr, align) addr:align
+#endif
+
+/* Both constants feed the computed "add pc, pc, ..." jumps into the
+   unrolled tail sequences.  */
+#define PC_OFFSET	8	/* PC pipeline compensation.  */
+#define INSN_SIZE	4
+
+/* Call parameters.  */
+#define dstin	r0
+#define src	r1
+#define count	r2
+
+/* Locals.  */
+#define tmp1	r3
+#define dst	ip
+#define tmp2	r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers.  */
+/* Note that A_l is the same register as count and A_h the same register
+   as tmp1, so the remaining length must live in tmp2 while these are in
+   use.  B_l..D_h are saved to and restored from the stack frame by the
+   loops that use them.  */
+#define	A_l	r2		/* Call-clobbered.  */
+#define	A_h	r3		/* Call-clobbered.  */
+#define	B_l	r4
+#define	B_h	r5
+#define	C_l	r6
+#define	C_h	r7
+#define	D_l	r8
+#define	D_h	r9
+#endif
+
+/* Number of lines ahead to pre-fetch data.  If you change this the code
+   below will need adjustment to compensate.  */
+
+#define prefetch_lines	5
+
+#ifdef USE_VFP
+	/* cpy_line_vfp vreg, base
+	   Software-pipelined copy of one 64-byte line at offset BASE.
+	   Every vstr writes a value that an earlier vldr put in the
+	   register, and the vldr that follows refills that register for
+	   a later store.  The only caller (.Lcpy_body_long) pre-loads
+	   VREG and d0-d2 and advances src by 32 first, so the loads run
+	   32 bytes ahead of the stores.  VREG is refilled twice: once
+	   for the second half of this line and once from
+	   prefetch_lines * 64 - 32 further on, which is the read-ahead
+	   that replaces PLD.  */
+	.macro	cpy_line_vfp vreg, base
+	vstr	\vreg, [dst, #\base]
+	vldr	\vreg, [src, #\base]
+	vstr	d0, [dst, #\base + 8]
+	vldr	d0, [src, #\base + 8]
+	vstr	d1, [dst, #\base + 16]
+	vldr	d1, [src, #\base + 16]
+	vstr	d2, [dst, #\base + 24]
+	vldr	d2, [src, #\base + 24]
+	vstr	\vreg, [dst, #\base + 32]
+	/* Read ahead: refill VREG from prefetch_lines lines further on.  */
+	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
+	vstr	d0, [dst, #\base + 40]
+	vldr	d0, [src, #\base + 40]
+	vstr	d1, [dst, #\base + 48]
+	vldr	d1, [src, #\base + 48]
+	vstr	d2, [dst, #\base + 56]
+	vldr	d2, [src, #\base + 56]
+	.endm
+
+	/* cpy_tail_vfp vreg, base
+	   Same store/load interleave as cpy_line_vfp for one 64-byte
+	   line at offset BASE, but without the far read-ahead: after the
+	   store at BASE + 32, VREG is not reloaded.  Used to drain the
+	   pipeline at the end of the long copy.  */
+	.macro	cpy_tail_vfp vreg, base
+	vstr	\vreg, [dst, #\base]
+	vldr	\vreg, [src, #\base]
+	vstr	d0, [dst, #\base + 8]
+	vldr	d0, [src, #\base + 8]
+	vstr	d1, [dst, #\base + 16]
+	vldr	d1, [src, #\base + 16]
+	vstr	d2, [dst, #\base + 24]
+	vldr	d2, [src, #\base + 24]
+	vstr	\vreg, [dst, #\base + 32]
+	/* No reload of VREG here, unlike cpy_line_vfp.  */
+	vstr	d0, [dst, #\base + 40]
+	vldr	d0, [src, #\base + 40]
+	vstr	d1, [dst, #\base + 48]
+	vldr	d1, [src, #\base + 48]
+	vstr	d2, [dst, #\base + 56]
+	vldr	d2, [src, #\base + 56]
+	.endm
+#endif
+
+	/* def_fn f [p2align=N]
+	   Open a global function symbol F in .text, aligned to 2^N bytes
+	   (N defaults to 0), and place its label.  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* void *memcpy (void *dstin, const void *src, size_t count)
+
+   In:   r0 = dstin, r1 = src, r2 = count.
+   Out:  r0 = dstin (r0 is never written; the working pointer is ip).
+   Uses: r1-r3, ip and the flags; d0-d7 when USE_VFP or USE_NEON is set.
+	 r10 (tmp2) and, in the GP-register bulk loops, r4-r9 are saved
+	 in a FRAME_SIZE-byte stack frame and restored before returning.
+
+   COUNT is a size_t, so every test on the remaining length uses the
+   unsigned conditions (HS/LO).  With the signed forms (GE/LT/MI/PL) a
+   length of 2GB or more looked negative, was treated as a short copy
+   and only its low six bits' worth of bytes were copied.  */
+def_fn memcpy p2align=6
+
+	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
+	cmp	count, #64
+	bhs	.Lcpy_not_short	/* Unsigned: count is a size_t.  */
+	/* Deal with small copies quickly by dropping straight into the
+	   exit block.  */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+	and	tmp1, count, #0x38
+	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+	add	pc, pc, tmp1
+	vld1.8	{d0}, [src]!	/* 14 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 12 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 10 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 8 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 6 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 4 words to go.  */
+	vst1.8	{d0}, [dst]!
+	vld1.8	{d0}, [src]!	/* 2 words to go.  */
+	vst1.8	{d0}, [dst]!
+
+	tst	count, #4
+	ldrne	tmp1, [src], #4
+	strne	tmp1, [dst], #4
+#else
+	/* Copy up to 15 full words of data.  May not be aligned.  */
+	/* Cannot use VFP for unaligned data.  */
+	and	tmp1, count, #0x3c
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+	/* Jump directly into the sequence below at the correct offset.  */
+	add	pc, pc, tmp1, lsl #1
+
+	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
+	str	tmp1, [dst, #-60]
+
+	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
+	str	tmp1, [dst, #-56]
+	ldr	tmp1, [src, #-52]
+	str	tmp1, [dst, #-52]
+
+	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
+	str	tmp1, [dst, #-48]
+	ldr	tmp1, [src, #-44]
+	str	tmp1, [dst, #-44]
+
+	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
+	str	tmp1, [dst, #-40]
+	ldr	tmp1, [src, #-36]
+	str	tmp1, [dst, #-36]
+
+	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
+	str	tmp1, [dst, #-32]
+	ldr	tmp1, [src, #-28]
+	str	tmp1, [dst, #-28]
+
+	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
+	str	tmp1, [dst, #-24]
+	ldr	tmp1, [src, #-20]
+	str	tmp1, [dst, #-20]
+
+	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
+	str	tmp1, [dst, #-16]
+	ldr	tmp1, [src, #-12]
+	str	tmp1, [dst, #-12]
+
+	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
+	str	tmp1, [dst, #-8]
+	ldr	tmp1, [src, #-4]
+	str	tmp1, [dst, #-4]
+#endif
+
+	/* Bit 1 of count goes to C, bit 0 decides Z: copy the trailing
+	   halfword and/or byte.  */
+	lsls	count, count, #31
+	ldrhcs	tmp1, [src], #2
+	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
+	strhcs	tmp1, [dst], #2
+	strbne	src, [dst]
+	bx	lr
+
+.Lcpy_not_short:
+	/* At least 64 bytes to copy, but don't know the alignment yet.  */
+	str	tmp2, [sp, #-FRAME_SIZE]!
+	and	tmp2, src, #7
+	and	tmp1, dst, #7
+	cmp	tmp1, tmp2
+	bne	.Lcpy_notaligned
+
+#ifdef USE_VFP
+	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
+	   that the FP pipeline is much better at streaming loads and
+	   stores.  This is outside the critical loop.  */
+	vmov.f32	s0, s0
+#endif
+
+	/* SRC and DST have the same mutual 64-bit alignment, but we may
+	   still need to pre-copy some bytes to get to natural alignment.
+	   We bring SRC and DST into full 64-bit alignment.  */
+	lsls	tmp2, dst, #29
+	beq	1f
+	rsbs	tmp2, tmp2, #0
+	sub	count, count, tmp2, lsr #29
+	ldrmi	tmp1, [src], #4
+	strmi	tmp1, [dst], #4
+	lsls	tmp2, tmp2, #2
+	ldrhcs	tmp1, [src], #2
+	ldrbne	tmp2, [src], #1
+	strhcs	tmp1, [dst], #2
+	strbne	tmp2, [dst], #1
+
+1:
+	subs	tmp2, count, #64	/* Use tmp2 for count.  */
+	blo	.Ltail63aligned		/* Unsigned: fewer than 64 left.  */
+
+	cmp	tmp2, #512
+	bhs	.Lcpy_body_long		/* Unsigned compare.  */
+
+.Lcpy_body_medium:			/* Count in tmp2.  */
+#ifdef USE_VFP
+1:
+	vldr	d0, [src, #0]
+	subs	tmp2, tmp2, #64
+	vldr	d1, [src, #8]
+	vstr	d0, [dst, #0]
+	vldr	d0, [src, #16]
+	vstr	d1, [dst, #8]
+	vldr	d1, [src, #24]
+	vstr	d0, [dst, #16]
+	vldr	d0, [src, #32]
+	vstr	d1, [dst, #24]
+	vldr	d1, [src, #40]
+	vstr	d0, [dst, #32]
+	vldr	d0, [src, #48]
+	vstr	d1, [dst, #40]
+	vldr	d1, [src, #56]
+	vstr	d0, [dst, #48]
+	add	src, src, #64
+	vstr	d1, [dst, #56]
+	add	dst, dst, #64
+	bhs	1b			/* Flags from the subs above.  */
+	tst	tmp2, #0x3f
+	beq	.Ldone
+
+.Ltail63aligned:			/* Count in tmp2.  */
+	and	tmp1, tmp2, #0x38
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+	add	pc, pc, tmp1
+
+	vldr	d0, [src, #-56]	/* 14 words to go.  */
+	vstr	d0, [dst, #-56]
+	vldr	d0, [src, #-48]	/* 12 words to go.  */
+	vstr	d0, [dst, #-48]
+	vldr	d0, [src, #-40]	/* 10 words to go.  */
+	vstr	d0, [dst, #-40]
+	vldr	d0, [src, #-32]	/* 8 words to go.  */
+	vstr	d0, [dst, #-32]
+	vldr	d0, [src, #-24]	/* 6 words to go.  */
+	vstr	d0, [dst, #-24]
+	vldr	d0, [src, #-16]	/* 4 words to go.  */
+	vstr	d0, [dst, #-16]
+	vldr	d0, [src, #-8]	/* 2 words to go.  */
+	vstr	d0, [dst, #-8]
+#else
+	sub	src, src, #8
+	sub	dst, dst, #8
+1:
+	ldrd	A_l, A_h, [src, #8]
+	strd	A_l, A_h, [dst, #8]
+	ldrd	A_l, A_h, [src, #16]
+	strd	A_l, A_h, [dst, #16]
+	ldrd	A_l, A_h, [src, #24]
+	strd	A_l, A_h, [dst, #24]
+	ldrd	A_l, A_h, [src, #32]
+	strd	A_l, A_h, [dst, #32]
+	ldrd	A_l, A_h, [src, #40]
+	strd	A_l, A_h, [dst, #40]
+	ldrd	A_l, A_h, [src, #48]
+	strd	A_l, A_h, [dst, #48]
+	ldrd	A_l, A_h, [src, #56]
+	strd	A_l, A_h, [dst, #56]
+	ldrd	A_l, A_h, [src, #64]!
+	strd	A_l, A_h, [dst, #64]!
+	subs	tmp2, tmp2, #64
+	bhs	1b			/* Unsigned: no borrow, go again.  */
+	tst	tmp2, #0x3f
+	bne	1f
+	ldr	tmp2,[sp], #FRAME_SIZE
+	bx	lr
+1:
+	add	src, src, #8
+	add	dst, dst, #8
+
+.Ltail63aligned:			/* Count in tmp2.  */
+	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
+	   we know that the src and dest are 64-bit aligned so we can use
+	   LDRD/STRD to improve efficiency.  */
+	/* TMP2 is now negative, but we don't care about that.  The bottom
+	   six bits still tell us how many bytes are left to copy.  */
+
+	and	tmp1, tmp2, #0x38
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+	add	pc, pc, tmp1
+	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
+	strd	A_l, A_h, [dst, #-56]
+	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
+	strd	A_l, A_h, [dst, #-48]
+	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
+	strd	A_l, A_h, [dst, #-40]
+	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
+	strd	A_l, A_h, [dst, #-32]
+	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
+	strd	A_l, A_h, [dst, #-24]
+	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
+	strd	A_l, A_h, [dst, #-16]
+	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
+	strd	A_l, A_h, [dst, #-8]
+
+#endif
+	tst	tmp2, #4
+	ldrne	tmp1, [src], #4
+	strne	tmp1, [dst], #4
+	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
+	ldrhcs	tmp1, [src], #2
+	ldrbne	tmp2, [src]
+	strhcs	tmp1, [dst], #2
+	strbne	tmp2, [dst]
+
+.Ldone:
+	ldr	tmp2, [sp], #FRAME_SIZE
+	bx	lr
+
+.Lcpy_body_long:			/* Count in tmp2.  */
+
+	/* Long copy.  We know that there's at least (prefetch_lines * 64)
+	   bytes to go.  */
+#ifdef USE_VFP
+	/* Don't use PLD.  Instead, read some data in advance of the current
+	   copy position into a register.  This should act like a PLD
+	   operation but we won't have to repeat the transfer.  */
+
+	vldr	d3, [src, #0]
+	vldr	d4, [src, #64]
+	vldr	d5, [src, #128]
+	vldr	d6, [src, #192]
+	vldr	d7, [src, #256]
+
+	vldr	d0, [src, #8]
+	vldr	d1, [src, #16]
+	vldr	d2, [src, #24]
+	add	src, src, #32
+
+	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
+	blo	2f			/* Unsigned: too short for the loop.  */
+1:
+	cpy_line_vfp	d3, 0
+	cpy_line_vfp	d4, 64
+	cpy_line_vfp	d5, 128
+	add	dst, dst, #3 * 64
+	add	src, src, #3 * 64
+	cpy_line_vfp	d6, 0
+	cpy_line_vfp	d7, 64
+	add	dst, dst, #2 * 64
+	add	src, src, #2 * 64
+	subs	tmp2, tmp2, #prefetch_lines * 64
+	bhs	1b			/* Unsigned: no borrow, go again.  */
+
+2:
+	cpy_tail_vfp	d3, 0
+	cpy_tail_vfp	d4, 64
+	cpy_tail_vfp	d5, 128
+	add	src, src, #3 * 64
+	add	dst, dst, #3 * 64
+	cpy_tail_vfp	d6, 0
+	vstr	d7, [dst, #64]
+	vldr	d7, [src, #64]
+	vstr	d0, [dst, #64 + 8]
+	vldr	d0, [src, #64 + 8]
+	vstr	d1, [dst, #64 + 16]
+	vldr	d1, [src, #64 + 16]
+	vstr	d2, [dst, #64 + 24]
+	vldr	d2, [src, #64 + 24]
+	vstr	d7, [dst, #64 + 32]
+	add	src, src, #96
+	vstr	d0, [dst, #64 + 40]
+	vstr	d1, [dst, #64 + 48]
+	vstr	d2, [dst, #64 + 56]
+	add	dst, dst, #128
+	add	tmp2, tmp2, #prefetch_lines * 64
+	b	.Lcpy_body_medium
+#else
+	/* Long copy.  Use an SMS style loop to maximize the I/O
+	   bandwidth of the core.  We don't have enough spare registers
+	   to synthesise prefetching, so use PLD operations.  */
+	/* Pre-bias src and dst.  */
+	sub	src, src, #8
+	sub	dst, dst, #8
+	pld	[src, #8]
+	pld	[src, #72]
+	subs	tmp2, tmp2, #64
+	pld	[src, #136]
+	ldrd	A_l, A_h, [src, #8]
+	strd	B_l, B_h, [sp, #8]
+	ldrd	B_l, B_h, [src, #16]
+	strd	C_l, C_h, [sp, #16]
+	ldrd	C_l, C_h, [src, #24]
+	strd	D_l, D_h, [sp, #24]
+	pld	[src, #200]
+	ldrd	D_l, D_h, [src, #32]!
+	b	1f
+	.p2align	6
+2:
+	pld	[src, #232]
+	strd	A_l, A_h, [dst, #40]
+	ldrd	A_l, A_h, [src, #40]
+	strd	B_l, B_h, [dst, #48]
+	ldrd	B_l, B_h, [src, #48]
+	strd	C_l, C_h, [dst, #56]
+	ldrd	C_l, C_h, [src, #56]
+	strd	D_l, D_h, [dst, #64]!
+	ldrd	D_l, D_h, [src, #64]!
+	subs	tmp2, tmp2, #64
+1:
+	strd	A_l, A_h, [dst, #8]
+	ldrd	A_l, A_h, [src, #8]
+	strd	B_l, B_h, [dst, #16]
+	ldrd	B_l, B_h, [src, #16]
+	strd	C_l, C_h, [dst, #24]
+	ldrd	C_l, C_h, [src, #24]
+	strd	D_l, D_h, [dst, #32]
+	ldrd	D_l, D_h, [src, #32]
+	bcs	2b
+	/* Save the remaining bytes and restore the callee-saved regs.  */
+	strd	A_l, A_h, [dst, #40]
+	add	src, src, #40
+	strd	B_l, B_h, [dst, #48]
+	ldrd	B_l, B_h, [sp, #8]
+	strd	C_l, C_h, [dst, #56]
+	ldrd	C_l, C_h, [sp, #16]
+	strd	D_l, D_h, [dst, #64]
+	ldrd	D_l, D_h, [sp, #24]
+	add	dst, dst, #72
+	tst	tmp2, #0x3f
+	bne	.Ltail63aligned
+	ldr	tmp2, [sp], #FRAME_SIZE
+	bx	lr
+#endif
+
+.Lcpy_notaligned:
+	pld	[src]
+	pld	[src, #64]
+	/* There's at least 64 bytes to copy, but there is no mutual
+	   alignment.  */
+	/* Bring DST to 64-bit alignment.  */
+	lsls	tmp2, dst, #29
+	pld	[src, #(2 * 64)]
+	beq	1f
+	rsbs	tmp2, tmp2, #0
+	sub	count, count, tmp2, lsr #29
+	ldrmi	tmp1, [src], #4
+	strmi	tmp1, [dst], #4
+	lsls	tmp2, tmp2, #2
+	ldrbne	tmp1, [src], #1
+	ldrhcs	tmp2, [src], #2
+	strbne	tmp1, [dst], #1
+	strhcs	tmp2, [dst], #2
+1:
+	pld	[src, #(3 * 64)]
+	subs	count, count, #64
+	/* Unsigned: fewer than 64 bytes left, so drop the frame and let
+	   the short-copy tail finish the job.  */
+	ldrlo	tmp2, [sp], #FRAME_SIZE
+	blo	.Ltail63unaligned
+	pld	[src, #(4 * 64)]
+
+#ifdef USE_NEON
+	vld1.8	{d0-d3}, [src]!
+	vld1.8	{d4-d7}, [src]!
+	subs	count, count, #64
+	blo	2f			/* Unsigned: only one block to store.  */
+1:
+	pld	[src, #(4 * 64)]
+	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
+	vld1.8	{d0-d3}, [src]!
+	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
+	vld1.8	{d4-d7}, [src]!
+	subs	count, count, #64
+	bhs	1b			/* Unsigned: no borrow, go again.  */
+2:
+	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
+	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
+	ands	count, count, #0x3f
+#else
+	/* Use an SMS style loop to maximize the I/O bandwidth.  */
+	sub	src, src, #4
+	sub	dst, dst, #8
+	subs	tmp2, count, #64	/* Use tmp2 for count.  */
+	ldr	A_l, [src, #4]
+	ldr	A_h, [src, #8]
+	strd	B_l, B_h, [sp, #8]
+	ldr	B_l, [src, #12]
+	ldr	B_h, [src, #16]
+	strd	C_l, C_h, [sp, #16]
+	ldr	C_l, [src, #20]
+	ldr	C_h, [src, #24]
+	strd	D_l, D_h, [sp, #24]
+	ldr	D_l, [src, #28]
+	ldr	D_h, [src, #32]!
+	b	1f
+	.p2align	6
+2:
+	pld	[src, #(5 * 64) - (32 - 4)]
+	strd	A_l, A_h, [dst, #40]
+	ldr	A_l, [src, #36]
+	ldr	A_h, [src, #40]
+	strd	B_l, B_h, [dst, #48]
+	ldr	B_l, [src, #44]
+	ldr	B_h, [src, #48]
+	strd	C_l, C_h, [dst, #56]
+	ldr	C_l, [src, #52]
+	ldr	C_h, [src, #56]
+	strd	D_l, D_h, [dst, #64]!
+	ldr	D_l, [src, #60]
+	ldr	D_h, [src, #64]!
+	subs	tmp2, tmp2, #64
+1:
+	strd	A_l, A_h, [dst, #8]
+	ldr	A_l, [src, #4]
+	ldr	A_h, [src, #8]
+	strd	B_l, B_h, [dst, #16]
+	ldr	B_l, [src, #12]
+	ldr	B_h, [src, #16]
+	strd	C_l, C_h, [dst, #24]
+	ldr	C_l, [src, #20]
+	ldr	C_h, [src, #24]
+	strd	D_l, D_h, [dst, #32]
+	ldr	D_l, [src, #28]
+	ldr	D_h, [src, #32]
+	bcs	2b
+
+	/* Save the remaining bytes and restore the callee-saved regs.  */
+	strd	A_l, A_h, [dst, #40]
+	add	src, src, #36
+	strd	B_l, B_h, [dst, #48]
+	ldrd	B_l, B_h, [sp, #8]
+	strd	C_l, C_h, [dst, #56]
+	ldrd	C_l, C_h, [sp, #16]
+	strd	D_l, D_h, [dst, #64]
+	ldrd	D_l, D_h, [sp, #24]
+	add	dst, dst, #72
+	ands	count, tmp2, #0x3f
+#endif
+	ldr	tmp2, [sp], #FRAME_SIZE
+	bne	.Ltail63unaligned
+	bx	lr
+
+	.size	memcpy, . - memcpy
--- a/contrib/cortex-strings/src/arm/memset.S
+++ b/contrib/cortex-strings/src/arm/memset.S
@ -0,0 +1,122 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This memset routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.
+
+ */
+
+	.syntax unified
+	.arch armv7-a
+
+@ 2011-08-30 david.gilbert@linaro.org
+@    Extracted from local git 2f11b436
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+@ (defined here but not referenced by the code in this file)
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+@ void *memset(void *s, int c, size_t n)
+@
+@ Stores the low byte of c into n bytes starting at s and returns s.
+@ Strategy: single bytes until the pointer is 8-byte aligned, then
+@ 16 bytes per iteration with stmia, one optional 8-byte store, and
+@ single bytes for whatever is left.
+@ Scratch: r1, r2, r3 and the flags.  r4-r7 are pushed and popped
+@ around the aligned section.
+	.thumb_func
+	.align 2
+	.p2align 4,,15
+	.global memset
+	.type memset,%function
+memset:
+	@ r0 = address
+	@ r1 = character
+	@ r2 = count
+	@ returns original address in r0
+
+	mov	r3, r0		@ Leave r0 alone
+	cbz	r2, 10f		@ Exit if 0 length
+
+	tst	r0, #7
+	beq	2f		@ Already aligned
+
+	@ Ok, so we're misaligned here
+1:
+	strb	r1, [r3], #1
+	subs	r2,r2,#1
+	tst	r3, #7		@ flags for the bne below; cbz leaves them alone
+	cbz	r2, 10f		@ Exit if we hit the end
+	bne	1b		@ go round again if still misaligned
+
+2:
+	@ OK, so we're aligned
+	push	{r4,r5,r6,r7}
+	bics	r4, r2, #15	@ if less than 16 bytes then need to finish it off
+	beq	5f
+
+3:
+	@ POSIX says that ch is cast to an unsigned char.  A uxtb is one
+	@ byte and takes two cycles, where an AND is four bytes but one
+	@ cycle.
+	and	r1, #0xFF
+	orr	r1, r1, r1, lsl#8	@ Same character into all bytes
+	orr	r1, r1, r1, lsl#16
+	mov	r5,r1
+	mov	r6,r1
+	mov	r7,r1
+
+4:
+	@ r4 = byte count rounded down to a multiple of 16
+	subs	r4,r4,#16
+	stmia	r3!,{r1,r5,r6,r7}	@ does not touch the flags from subs
+	bne	4b
+	and	r2,r2,#15
+
+	@ At this point we're still aligned and we have up to align-1 bytes left to write
+	@ we can avoid some of the byte-at-a time now by testing for some big chunks
+	tst	r2,#8
+	itt	ne
+	subne	r2,r2,#8
+	stmiane	r3!,{r1,r5}
+
+5:
+	pop	{r4,r5,r6,r7}
+	cbz	r2, 10f
+
+	@ Got to do any last < alignment bytes
+6:
+	subs	r2,r2,#1
+	strb	r1,[r3],#1	@ strb only writes the low byte of r1
+	bne	6b
+
+10:
+	bx	lr		@ goodbye
+
+	@ Give the ELF symbol a size, as the memcpy and strcmp sources do
+	.size	memset, . - memset
--- a/contrib/cortex-strings/src/arm/strchr.S
+++ b/contrib/cortex-strings/src/arm/strchr.S
@ -0,0 +1,80 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   A very simple strchr routine, from benchmarks on A9 it's a bit faster than
+   the current version in eglibc (2.12.1-0ubuntu14 package)
+   I don't think doing a word at a time version is worth it since a lot
+   of strchr cases are very short anyway.
+
+ */
+
+@ 2011-02-07 david.gilbert@linaro.org
+@    Extracted from local git a5b438d861
+
+	.syntax unified
+	.arch armv7-a
+
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+@ char *strchr(const char *s, int c)
+@
+@ Byte-at-a-time scan.  Returns a pointer to the first byte equal to
+@ the low 8 bits of c, or NULL if the terminating nil is reached first.
+@ Searching for nil itself returns a pointer to the terminator.
+@ Scratch: r1, r2 and the flags.  No stack use.
+
+	.thumb_func
+	.align 2
+	.p2align 4,,15
+	.global strchr
+	.type strchr,%function
+strchr:
+	@ r0 = start of string
+	@ r1 = character to match
+	@ returns NULL for no match, or a pointer to the match
+	and	r1,r1, #255
+
+1:
+	ldrb	r2,[r0],#1	@ r0 now points one past the byte just read
+	cmp	r2,r1
+	cbz	r2,10f		@ end of string; cbz keeps the cmp flags
+	bne	1b		@ (Flags from the cmp above)
+
+	@ We're here if it matched
+5:
+	subs	r0,r0,#1	@ undo the post-increment
+	bx	lr
+
+10:
+	@ We're here if we ran off the end
+	cmp	r1, #0	@ Corner case - you're allowed to search for the nil and get a pointer to it
+	beq	5b	@ A bit messy, if it's common we should branch at the start to a special loop
+	mov	r0,#0
+	bx	lr
+
+	@ Give the ELF symbol a size, as the memcpy and strcmp sources do
+	.size	strchr, . - strchr
--- a/contrib/cortex-strings/src/arm/strcmp.S
+++ b/contrib/cortex-strings/src/arm/strcmp.S
@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Implementation of strcmp for ARMv7 when DSP instructions are
+   available.  Use ldrd to support wider loads, provided the data
+   is sufficiently aligned.  Use saturating arithmetic to optimize
+   the compares.  */
+
+/* Build Options:
+   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+   byte in the string.  If comparing completely random strings
+   the pre-check will save time, since there is a very high
+   probability of a mismatch in the first character: we save
+   significant overhead if this is the common case.  However,
+   if strings are likely to be identical (eg because we're
+   verifying a hit in a hash table), then this check is largely
+   redundant.  */
+
+/* 0 = keep the first-byte pre-check at the function entry, 1 = omit it
+   (see the Build Options note above).  */
+#define STRCMP_NO_PRECHECK	0
+
+	/* This version uses Thumb-2 code.  */
+	.thumb
+	.syntax unified
+
+/* Endianness helpers, all expressed in string (memory) order:
+     S2LO / S2LOEQ  shift towards the byte at the lowest address,
+     S2HI	    shift towards the byte at the highest address,
+     LSB / MSB	    mask of the first / last byte of a loaded word,
+     BYTEn_OFFSET   bit position of the n-th byte of a loaded word.  */
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
+#define MSB 0x000000ff
+#define LSB 0xff000000
+#define BYTE0_OFFSET 24
+#define BYTE1_OFFSET 16
+#define BYTE2_OFFSET 8
+#define BYTE3_OFFSET 0
+#else /* not  __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
+#define BYTE0_OFFSET 0
+#define BYTE1_OFFSET 8
+#define BYTE2_OFFSET 16
+#define BYTE3_OFFSET 24
+#define MSB 0xff000000
+#define LSB 0x000000ff
+#endif /* not  __ARM_BIG_ENDIAN */
+
+	/* def_fn f [p2align=N]
+	   Open a global function symbol F in .text, aligned to 2^N bytes
+	   (N defaults to 0), and place its label.  */
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+/* Register map.  Several names share a register: result is src1 (r0),
+   data1/data1a are r2, data2/data1b are r3, syndrome_a is tmp1 (r4)
+   and syndrome/syndrome_b are tmp2 (r5).  */
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define tmp1		r4
+#define tmp2		r5
+#define const_m1	r12
+
+/* Additional internal variables for 64-bit aligned data.  */
+#define data1a		r2
+#define data1b		r3
+#define data2a		r6
+#define data2b		r7
+#define syndrome_a	tmp1
+#define syndrome_b	tmp2
+
+/* Additional internal variables for 32-bit aligned data.  */
+#define data1		r2
+#define data2		r3
+#define syndrome	tmp2
+
+
+
+
+	/* Macro to compute and return the result value for word-aligned
+	   cases.
+	     synd	  register holding the non-zero syndrome,
+	     d1, d2	  the words from src1 and src2 that produced it,
+	     restore_r6	  non-zero to reload r6/r7 from [sp, #8].
+	   Clobbers r1 and tmp1, reloads r4/r5 while popping the 16-byte
+	   frame, leaves the byte difference in result and returns with
+	   bx lr, so nothing placed after an expansion is reached by
+	   falling through.  */
+	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+	/* If data1 contains a zero byte, then syndrome will contain a 1 in
+	   bit 7 of that byte.  Otherwise, the highest set bit in the
+	   syndrome will highlight the first different bit.  It is therefore
+	   sufficient to extract the eight bits starting with the syndrome
+	   bit.  */
+	clz	tmp1, \synd
+	lsl	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsl	\d1, \d1, tmp1
+	.cfi_remember_state
+	lsr	result, \d1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1, lsr #24
+	bx	lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words.
+	   that's slower than this approach.  */
+	rev	\synd, \synd
+	clz	tmp1, \synd
+	bic	tmp1, tmp1, #7		/* Round down to a whole byte.  */
+	lsr	r1, \d2, tmp1
+	.cfi_remember_state
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsr	\d1, \d1, tmp1
+	and	result, \d1, #255
+	and	r1, r1, #255
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1
+
+	bx	lr
+#endif
+	.endm
+
+/* int strcmp (const char *src1, const char *src2)
+
+   In:	 r0 = src1, r1 = src2.
+   Out:	 r0 = difference of the first pair of bytes that differ, or 0
+	 when both strings end together.
+   Uses: r1-r3, r12 and the flags as scratch.  r4-r7 are saved in a
+	 16-byte stack frame and reloaded on the exit paths that
+	 modified them.
+
+   The fast-path exit stub sits in front of the entry label, which is
+   why the .size directive at the end is measured from
+   .Lstrcmp_start_addr instead of from strcmp.  */
+	.text
+	.p2align	5
+.Lstrcmp_start_addr:
+#if STRCMP_NO_PRECHECK == 0
+.Lfastpath_exit:
+	sub	r0, r2, r3
+	bx	lr
+	nop
+#endif
+def_fn	strcmp
+#if STRCMP_NO_PRECHECK == 0
+	/* Leave at once if the first byte of src1 is nil or the first
+	   bytes differ; no frame has been set up yet.  */
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	bne	.Lfastpath_exit
+#endif
+	.cfi_startproc
+	strd	r4, r5, [sp, #-16]!
+	.cfi_def_cfa_offset 16
+	.cfi_offset 4, -16
+	.cfi_offset 5, -12
+	orr	tmp1, src1, src2
+	strd	r6, r7, [sp, #8]
+	.cfi_offset 6, -8
+	.cfi_offset 7, -4
+	mvn	const_m1, #0
+	lsl	r2, tmp1, #29		/* Zero iff both are 8-byte aligned.  */
+	cbz	r2, .Lloop_aligned8
+
+.Lnot_aligned:
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	bne	.Lmisaligned8
+
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	and	tmp1, src1, #7
+	bic	src1, src1, #7
+	and	tmp2, tmp1, #3
+	bic	src2, src2, #7
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	ldrd	data1a, data1b, [src1], #16
+	tst	tmp1, #4
+	ldrd	data2a, data2b, [src2], #16
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp2
+	orn	data1a, data1a, tmp1
+	orn	data2a, data2a, tmp1
+	beq	.Lstart_realigned8
+	orn	data1b, data1b, tmp1
+	mov	data1a, const_m1
+	orn	data2b, data2b, tmp1
+	mov	data2a, const_m1
+	b	.Lstart_realigned8
+
+	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
+	   pass.  */
+	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
+	.p2align 2	/* Always word aligned.  */
+.Lloop_aligned8:
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
+.Lstart_realigned8:
+	/* uadd8 with 0xff in every lane sets a GE flag for each non-zero
+	   byte of data1; sel then keeps the XOR difference for those
+	   bytes and substitutes 0xff where data1 holds a nil.  The
+	   syndrome is therefore non-zero on a mismatch or on the end of
+	   the string.  */
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	cbnz	syndrome_a, .Ldiff_in_a
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	cbnz	syndrome_b, .Ldiff_in_b
+
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	/* Can't use CBZ for backwards branch.  */
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	beq	.Lloop_aligned8
+
+.Ldiff_found:
+	cbnz	syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+	.cfi_restore_state
+	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+	.cfi_restore_state
+.Lmisaligned8:
+	/* tmp1 still holds src1 ^ src2 from .Lnot_aligned.  */
+	tst	tmp1, #3
+	bne	.Lmisaligned4
+	ands	tmp1, src1, #3
+	bne	.Lmutual_align4
+
+	/* Unrolled by a factor of 2, to reduce the number of post-increment
+	   operations.  */
+.Lloop_aligned4:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned4:
+	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cbnz	syndrome, .Laligned4_done
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
+	uadd8	syndrome, data1, const_m1
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	beq	.Lloop_aligned4
+
+.Laligned4_done:
+	/* r6/r7 were not modified on this path, hence restore_r6 = 0.  */
+	strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+	.cfi_restore_state
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
+	bic	src1, src1, #3
+	ldr	data1, [src1], #8
+	bic	src2, src2, #3
+	ldr	data2, [src2], #8
+
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp1
+	orn	data1, data1, tmp1
+	orn	data2, data2, tmp1
+	b	.Lstart_realigned4
+
+.Lmisaligned4:
+	/* No common word alignment.  Compare single bytes until src1 is
+	   word aligned, then continue at .Lsrc1_aligned.  */
+	ands	tmp1, src1, #3
+	beq	.Lsrc1_aligned
+	sub	src2, src2, tmp1
+	bic	src1, src1, #3
+	lsls	tmp1, tmp1, #31
+	ldr	data1, [src1], #4
+	beq	.Laligned_m2
+	bcs	.Laligned_m1
+
+#if STRCMP_NO_PRECHECK == 1
+	ldrb	data2, [src2, #1]
+	uxtb	tmp1, data1, ror #BYTE1_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m1:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	add	src2, src2, #4
+	cbnz	data2, .Lsrc1_aligned
+#else  /* STRCMP_NO_PRECHECK */
+	/* If we've done the pre-check, then we don't need to check the
+	   first byte again here.  */
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbnz	data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+	/* Only r4 (tmp1) has been written on the way here, so it is the
+	   only register reloaded while the frame is popped.  */
+	.cfi_remember_state
+	mov	result, tmp1
+	ldr	r4, [sp], #16
+	.cfi_restore 4
+	bx	lr
+
+#if STRCMP_NO_PRECHECK == 0
+.Laligned_m1:
+	add	src2, src2, #4
+#endif
+.Lsrc1_aligned:
+	.cfi_restore_state
+	/* src1 is word aligned, but src2 has no common alignment
+	   with it.  */
+	ldr	data1, [src1], #4
+	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+	/* In each .LoverlapN loop, N bytes of data1 are compared with
+	   the tail of the current data2 word and the remaining bytes
+	   with the head of the next data2 word.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 Not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	neg	result, result
+	bx	lr
+
+6:
+	.cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	bx	lr
+
+.Lstrcmp_tail:
+	/* Common exit for the overlap loops: data1 and data2 hold the
+	   bytes still to be compared, first string byte at the LSB end.
+	   Build a syndrome, shift the first differing or nil byte to
+	   the top of each word and return the difference.  */
+	.cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	sub	result, result, data2, lsr #24
+	bx	lr
+	.cfi_endproc
+	.size strcmp, . - .Lstrcmp_start_addr
--- a/contrib/cortex-strings/src/thumb-2/strcpy.c
+++ b/contrib/cortex-strings/src/thumb-2/strcpy.c
@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* For GLIBC:
+#include <string.h>
+#include <memcopy.h>
+
+#undef strcpy
+*/
+
+#ifdef __thumb2__	/* Thumb-2 build: both masks are spelled as immediates and REG is ignored.  */
+#define magic1(REG) "#0x01010101"	/* 0x01 in every byte lane.  */
+#define magic2(REG) "#0x80808080"	/* 0x80 in every byte lane.  */
+#else	/* Otherwise the masks are derived from register REG.  */
+#define magic1(REG) #REG	/* Stringified register name, strcpy loads r5 with 0x01010101.  */
+#define magic2(REG) #REG ", lsl #7"	/* Same register shifted left by 7, giving 0x80808080.  */
+#endif
+
+char* __attribute__((naked))	/* Naked, so the asm statement below is the whole function body.  */
+strcpy (char* dst, const char* src)
+{	/* Copies the string at src, terminating NUL included, to dst.  The asm
+	   takes dst from r0 and src from r1, stores through a copy of r0 and
+	   leaves r0 untouched, so r0 still holds dst at every return.  */
+  asm (
+#if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+      (defined (__thumb__) && !defined (__thumb2__)))	/* Word-at-a-time variant.  */
+       "pld	[r1, #0]\n\t"
+       "eor	r2, r0, r1\n\t"	/* Low two bits of r2 are zero when dst and src share a word offset.  */
+       "mov	ip, r0\n\t"	/* ip is the running store pointer.  */
+       "tst	r2, #3\n\t"
+       "bne	4f\n\t"	/* Offsets differ: plain byte loop at 4.  */
+       "tst	r1, #3\n\t"
+       "bne	3f\n"	/* Same offset but not word aligned: align at 3 first.  */
+  "5:\n\t"	/* Word copy entry, src and dst are both word aligned here.  */
+#ifndef __thumb2__
+       "str	r5, [sp, #-4]!\n\t"	/* Save r5, then build 0x01010101 in it for magic1 and magic2.  */
+       "mov	r5, #0x01\n\t"
+       "orr	r5, r5, r5, lsl #8\n\t"
+       "orr	r5, r5, r5, lsl #16\n\t"
+#endif
+
+       "str	r4, [sp, #-4]!\n\t"	/* Save r4, the second data register of the loop.  */
+       "tst	r1, #4\n\t"	/* Tests src before the post-increment of the load below.  */
+       "ldr	r3, [r1], #4\n\t"
+       "beq	2f\n\t"	/* src was already 8-byte aligned.  */
+       "sub	r2, r3, "magic1(r5)"\n\t"	/* (w - 0x01010101) & ~w & 0x80808080 is non-zero  */
+       "bics	r2, r2, r3\n\t"	/* exactly when word w contains a zero byte.  */
+       "tst	r2, "magic2(r5)"\n\t"
+       "itt	eq\n\t"
+       "streq	r3, [ip], #4\n\t"	/* No NUL: store the odd word and fetch the next one.  */
+       "ldreq	r3, [r1], #4\n"
+       "bne	1f\n\t"	/* NUL in the first word: finish byte by byte.  */
+       /* Inner loop.  We now know that r1 is 64-bit aligned, so we
+	  can safely fetch up to two words.  This allows us to avoid
+	  load stalls.  */
+       ".p2align 2\n"
+  "2:\n\t"
+       "pld	[r1, #8]\n\t"
+       "ldr	r4, [r1], #4\n\t"	/* Second word of the pair, r3 already holds the first.  */
+       "sub	r2, r3, "magic1(r5)"\n\t"
+       "bics	r2, r2, r3\n\t"
+       "tst	r2, "magic2(r5)"\n\t"
+       "sub	r2, r4, "magic1(r5)"\n\t"	/* Starts the test of r4, the flags from tst are not touched.  */
+       "bne	1f\n\t"	/* NUL somewhere in r3.  */
+       "str	r3, [ip], #4\n\t"
+       "bics	r2, r2, r4\n\t"
+       "tst	r2, "magic2(r5)"\n\t"
+       "itt	eq\n\t"
+       "ldreq	r3, [r1], #4\n\t"
+       "streq	r4, [ip], #4\n\t"
+       "beq	2b\n\t"
+       "mov	r3, r4\n"	/* NUL is in r4: hand that word to the byte tail.  */
+  "1:\n\t"	/* Byte tail: store r3 one byte at a time, stopping after the NUL.  */
+#ifdef __ARMEB__
+       "rors	r3, r3, #24\n\t"	/* Rotate the most significant byte down to bits 0-7.  */
+#endif
+       "strb	r3, [ip], #1\n\t"
+       "tst	r3, #0xff\n\t"
+#ifdef __ARMEL__
+       "ror	r3, r3, #8\n\t"	/* Next byte into bits 0-7, the flags from tst survive.  */
+#endif
+       "bne	1b\n\t"
+       "ldr	r4, [sp], #4\n\t"	/* Restore in reverse order of the saves at label 5.  */
+#ifndef __thumb2__
+       "ldr	r5, [sp], #4\n\t"
+#endif
+       "BX LR\n"
+
+       /* Strings have the same offset from word alignment, but it's
+	  not zero.  */
+  "3:\n\t"
+       "tst	r1, #1\n\t"
+       "beq	1f\n\t"
+       "ldrb	r2, [r1], #1\n\t"	/* Odd address: copy one byte.  */
+       "strb	r2, [ip], #1\n\t"
+       "cmp	r2, #0\n\t"
+       "it	eq\n"
+       "BXEQ LR\n"	/* That byte was the terminator.  */
+  "1:\n\t"
+       "tst	r1, #2\n\t"
+       "beq	5b\n\t"	/* Already word aligned.  */
+       "ldrh	r2, [r1], #2\n\t"	/* Copy a halfword, watching for NUL in either byte.  */
+#ifdef __ARMEB__
+       "tst	r2, #0xff00\n\t"	/* Big-endian: the first byte is bits 8-15.  */
+       "iteet	ne\n\t"
+       "strneh	r2, [ip], #2\n\t"
+       "lsreq	r2, r2, #8\n\t"
+       "streqb	r2, [ip]\n\t"
+       "tstne	r2, #0xff\n\t"
+#else
+       "tst	r2, #0xff\n\t"	/* The first byte is bits 0-7 here.  */
+       "itet	ne\n\t"
+       "strneh	r2, [ip], #2\n\t"	/* First byte non-zero: store both bytes.  */
+       "streqb	r2, [ip]\n\t"	/* First byte is NUL: store only the terminator.  */
+       "tstne	r2, #0xff00\n\t"	/* Then check the second byte.  */
+#endif
+       "bne	5b\n\t"	/* Neither byte was NUL: continue with the word copy.  */
+       "BX LR\n"
+
+       /* src and dst do not have a common word-alignment.  Fall back to
+	  byte copying.  */
+  "4:\n\t"
+       "ldrb	r2, [r1], #1\n\t"
+       "strb	r2, [ip], #1\n\t"
+       "cmp	r2, #0\n\t"
+       "bne	4b\n\t"	/* Loop until the NUL itself has been stored.  */
+       "BX LR"
+
+#elif !defined (__thumb__) || defined (__thumb2__)	/* Size-preferring byte loop with post-indexed accesses.  */
+       "mov	r3, r0\n\t"	/* r3 is the store pointer, r0 stays as the return value.  */
+  "1:\n\t"
+       "ldrb	r2, [r1], #1\n\t"
+       "strb	r2, [r3], #1\n\t"
+       "cmp	r2, #0\n\t"
+       "bne	1b\n\t"
+       "BX LR"
+#else	/* Remaining case: same byte loop with the pointer increments as separate adds.  */
+       "mov	r3, r0\n\t"
+  "1:\n\t"
+       "ldrb	r2, [r1]\n\t"
+       "add	r1, r1, #1\n\t"
+       "strb	r2, [r3]\n\t"
+       "add	r3, r3, #1\n\t"
+       "cmp	r2, #0\n\t"
+       "bne	1b\n\t"
+       "BX LR"
+#endif
+       );
+}
+/* For GLIBC: libc_hidden_builtin_def (strcpy) */
--- a/contrib/cortex-strings/src/thumb-2/strlen.S
+++ b/contrib/cortex-strings/src/thumb-2/strlen.S
@ -0,0 +1,150 @@
+/* Copyright (c) 2010-2011,2013 Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Assumes:
+   ARMv6T2, AArch32
+
+ */
+
+	.macro def_fn f p2align=0	/* Open global function f in .text.  The p2align argument defaults to 0.  */
+	.text
+	.p2align \p2align	/* Align the entry point to 2^p2align bytes.  */
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
+#ifdef __ARMEB__	/* Big-endian: the lower-addressed byte is the more significant one.  */
+#define S2LO		lsl	/* Shift towards the lower-addressed byte.  */
+#define S2HI		lsr	/* Shift towards the higher-addressed byte.  */
+#else	/* Little-endian: the two directions swap.  */
+#define S2LO		lsr
+#define S2HI		lsl
+#endif
+
+	/* This code requires Thumb.  */
+	.thumb
+	.syntax unified
+
+/* Parameters and result.  */
+#define srcin		r0	/* Incoming string pointer.  */
+#define result		r0	/* Running byte count, takes over r0 once srcin has been read.  */
+
+/* Internal variables.  */
+#define src		r1	/* Read pointer, srcin rounded down to 8 bytes.  */
+#define data1a		r2	/* First word of the current 8-byte pair, later its NUL mask.  */
+#define data1b		r3	/* Second word of the pair, later the combined NUL mask.  */
+#define const_m1	r12	/* Holds 0xffffffff.  */
+#define const_0		r4	/* Holds 0 for the sel instructions.  */
+#define tmp1		r4		/* Overlaps const_0  */
+#define tmp2		r5	/* Mask scratch in the misaligned entry path.  */
+
+def_fn	strlen p2align=6	/* strlen: string pointer arrives in r0, the length is returned in r0.  */
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!	/* Save r4 and r5, they back const_0, tmp1 and tmp2.  */
+	bic	src, srcin, #7	/* src = start address rounded down to 8 bytes.  */
+	mvn	const_m1, #0	/* const_m1 = 0xffffffff.  */
+	ands	tmp1, srcin, #7		/* Offset of the string within its 8-byte block.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8	/* Non-zero offset: neutralise the leading bytes first.  */
+	mov	const_0, #0
+	mov	result, #-8	/* Biased so that the first add of 8 yields 0.  */
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set for each non-zero byte.  */
+	sel	data1a, const_0, const_m1	/* 0x00 where GE is set, 0xff where the byte was NUL.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found	/* Non-zero when either word held a NUL.  */
+
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set for each non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set for each non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32	/* Advance to the next 32-byte group.  */
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set for each non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned
+
+.Lnull_found:
+	cmp	data1a, #0	/* Zero means the NUL sits in the second word of the pair.  */
+	itt	eq
+	addeq	result, result, #4
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a	/* Put the lowest-addressed byte on top for clz.  */
+#endif
+	clz	data1a, data1a	/* Leading zero bits = 8 * bytes in front of the NUL.  */
+	ldrd	r4, r5, [sp], #8
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
+	bx	lr
+
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3	/* Offset within the containing word.  */
+	rsb	result, tmp1, #0	/* result = -offset, cancels the bytes in front of the string.  */
+	lsl	tmp2, tmp2, #3			/* Bytes -> bits.  */
+	tst	tmp1, #4	/* Non-zero when the string starts in the second word.  */
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2	/* All-ones shifted so the bytes in front of the string are clear.  */
+	orn	data1a, data1a, tmp2	/* Force those leading bytes to 0xff so they cannot look like NUL.  */
+	itt	ne
+	ornne	data1b, data1b, tmp2	/* Start is in the second word: pad data1b instead...  */
+	movne	data1a, const_m1	/* ...and make the whole first word non-NUL.  */
+	mov	const_0, #0	/* r4 was tmp1 until here.  */
+	b	.Lstart_realigned
+	.size	strlen, . - strlen
+
--- a/contrib/cortex-strings/src/thumb/aeabi_idiv.S
+++ b/contrib/cortex-strings/src/thumb/aeabi_idiv.S
@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* An executable stack is *not* required for these functions.  */
+
+.section .note.GNU-stack,"",%progbits	/* Empty marker section with no flags.  */
+.previous	/* Switch back to the section that was active before.  */
+.eabi_attribute 25, 1	/* NOTE(review): tag 25 is presumably Tag_ABI_align_preserved, confirm against the ARM EABI addenda.  */
+
+/* ANSI concatenation macros.  */
+
+#define CONCAT1(a, b) CONCAT2(a, b)	/* Extra level so that a and b are macro-expanded before pasting.  */
+#define CONCAT2(a, b) a ## b
+
+/* Use the right prefix for global labels.  */
+
+#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
+
+#define TYPE(x) .type SYM(x),function	/* Mark SYM(x) as a function symbol.  */
+#define SIZE(x) .size SYM(x), . - SYM(x)	/* Size = current location minus the symbol start.  */
+#define LSYM(x) .x	/* Dot prefix, e.g. LSYM(Ldiv0) becomes .Ldiv0.  */
+
+.macro cfi_start	start_label, end_label	/* Emit a CIE and an FDE header for the range start_label to end_label.  */
+	.pushsection	.debug_frame
+LSYM(Lstart_frame):
+	.4byte	LSYM(Lend_cie) - LSYM(Lstart_cie)	/* CIE length.  */
+LSYM(Lstart_cie):
+        .4byte	0xffffffff	/* CIE id.  */
+        .byte	0x1	/* Version.  */
+        .ascii	"\0"	/* Empty augmentation string.  */
+        .uleb128 0x1	/* Code alignment factor.  */
+        .sleb128 -4	/* Data alignment factor.  */
+        .byte	0xe	/* Return address column, r14.  */
+        .byte	0xc	/* DW_CFA_def_cfa...  */
+        .uleb128 0xd	/* ...register r13...  */
+        .uleb128 0x0	/* ...offset 0.  */
+
+	.align 2
+LSYM(Lend_cie):
+	.4byte	LSYM(Lend_fde)-LSYM(Lstart_fde)	/* FDE length, the end label is defined by cfi_end.  */
+LSYM(Lstart_fde):
+	.4byte	LSYM(Lstart_frame)	/* Reference back to the CIE.  */
+	.4byte	\start_label	/* First covered address.  */
+	.4byte	\end_label-\start_label	/* Size of the covered range.  */
+	.popsection
+.endm
+
+.macro cfi_end	end_label	/* Close the FDE started by cfi_start and define end_label at the current code position.  */
+	.pushsection	.debug_frame
+	.align	2
+LSYM(Lend_fde):
+	.popsection
+\end_label:
+.endm
+
+.macro THUMB_LDIV0 name signed	/* Division-by-zero tail.  The name and signed arguments are not used in the body.  */
+	push	{r0, lr}
+	movs	r0, #0	/* r0 = 0 is what __aeabi_idiv0 receives.  */
+	bl	SYM(__aeabi_idiv0)
+	pop	{r1, pc}	/* The saved r0 comes back in r1, the saved lr goes to pc.  */
+.endm
+
+.macro FUNC_END name	/* Record the size of the function __name.  */
+	SIZE (__\name)
+.endm
+
+.macro DIV_FUNC_END name signed	/* Append the .Ldiv0 handler with its frame info, then close __name.  */
+	cfi_start	__\name, LSYM(Lend_div0)
+LSYM(Ldiv0):
+	THUMB_LDIV0 \name \signed
+	cfi_end	LSYM(Lend_div0)
+	FUNC_END \name
+.endm
+
+.macro THUMB_FUNC_START name	/* Define a global Thumb function label, name is used as given.  */
+	.globl	SYM (\name)
+	TYPE	(\name)
+	.thumb_func
+SYM (\name):
+.endm
+
+.macro FUNC_START name	/* Open the global Thumb function __name in .text.  */
+	.text
+	.globl SYM (__\name)
+	TYPE (__\name)
+	.align 0
+	.force_thumb
+	.thumb_func
+	.syntax unified	/* Everything after this point is parsed as unified syntax.  */
+SYM (__\name):
+.endm
+
+.macro	FUNC_ALIAS new old	/* Export __new as a Thumb alias of __old.  */
+	.globl	SYM (__\new)
+	.thumb_set	SYM (__\new), SYM (__\old)
+.endm
+
+/* Register aliases.  */
+work		.req	r4	/* Not referenced by the routines in this file.  */
+dividend	.req	r0	/* Numerator on entry, quotient on exit.  */
+divisor		.req	r1	/* Denominator on entry, the division cores leave the remainder here.  */
+overdone	.req	r2	/* Same register as result.  */
+result		.req	r2	/* Quotient accumulator.  */
+curbit		.req	r3	/* Scratch for shifted operands and sign bits.  */
+
+/* ------------------------------------------------------------------------ */
+/*		Bodies of the division and modulo routines.		    */
+/* ------------------------------------------------------------------------ */
+.macro BranchToDiv n, label	/* Branch to label when (dividend >> n) is below divisor, unsigned.  */
+	lsrs	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	\label
+.endm
+
+.macro DoDiv n	/* One quotient bit: subtract (divisor << n) when it fits and shift the outcome into result.  */
+	lsrs	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f	/* Does not fit, carry is clear.  */
+	lsls	curbit, divisor, \n
+	subs	dividend, dividend, curbit	/* Fits, so there is no borrow and carry ends up set.  */
+
+1:	adcs	result, result	/* result = 2 * result + carry.  */
+.endm
+
+.macro THUMB1_Div_Positive	/* Unsigned core: divides r0 by r1, quotient to r0 and remainder to r1.  */
+	movs	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)	/* Enter the DoDiv ladder as low as the operand sizes allow.  */
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	movs	result, #0xff	/* (dividend >> 16) is still not below divisor.  */
+	lsls	divisor, divisor, #8	/* Scale the divisor up by 8 bits.  */
+	rev	result, result	/* result = 0xff000000, marker bits consumed by the bcs below.  */
+	lsrs	curbit, dividend, #16
+	cmp	curbit, divisor
+	bcc	1f
+	asrs	result, #8	/* result = 0xffff0000, eight more marker bits.  */
+	lsls	divisor, divisor, #8	/* Scale up a second time.  */
+	beq	LSYM(Ldivbyzero_waypoint)	/* Divisor is zero after the shifts.  */
+
+1:	lsrs	curbit, dividend, #12
+	cmp	curbit, divisor
+	bcc	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsrs	divisor, divisor, #8	/* Undo one 8-bit scaling step and run the upper ladder again.  */
+LSYM(Lthumb1_div16):
+	DoDiv	#15
+	DoDiv	#14
+	DoDiv	#13
+	DoDiv	#12
+LSYM(Lthumb1_div12):
+	DoDiv	#11
+	DoDiv	#10
+	DoDiv	#9
+	DoDiv	#8
+	bcs	LSYM(Lthumb1_div_loop)	/* Carry is the bit adcs shifted out of result, a marker bit means go again.  */
+LSYM(Lthumb1_div8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+LSYM(Lthumb1_div5):
+	DoDiv	#4
+LSYM(Lthumb1_div4):
+	DoDiv	#3
+LSYM(Lthumb1_div3):
+	DoDiv	#2
+LSYM(Lthumb1_div2):
+	DoDiv	#1
+LSYM(Lthumb1_div1):
+	subs	divisor, dividend, divisor	/* Last bit: r1 = dividend - divisor when that does not borrow.  */
+	bcs	1f
+	mov	divisor, dividend	/* Borrowed, so the remainder is the dividend itself.  */
+
+1:	adcs	result, result
+	mov	dividend, result	/* Quotient goes back in r0.  */
+	bx	lr
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+.macro THUMB1_Div_Negative	/* Signed core: divides magnitudes, then applies the signs to r0 and r1.  */
+	lsrs	result, divisor, #31	/* result = sign bit of divisor.  */
+	beq	1f
+	rsbs	divisor, divisor, #0	/* divisor = -divisor.  */
+
+1:	asrs	curbit, dividend, #32	/* curbit = 0 or -1, carry = sign of dividend.  */
+	bcc	2f
+	rsbs	dividend, dividend, #0	/* dividend = -dividend.  */
+
+2:	eors	curbit, result	/* Bit 0 = quotient sign, the remaining bits = dividend sign.  */
+	movs	result, #0
+	mov	ip, curbit	/* Keep the sign word in ip for the fix-up at the end.  */
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	movs	result, #0xfc
+	lsls	divisor, divisor, #6	/* Scale the divisor up in steps of 6 bits.  */
+	rev	result, result	/* result = 0xfc000000, marker bits consumed by the bcs below.  */
+	lsrs	curbit, dividend, #8
+	cmp	curbit, divisor
+	bcc	LSYM(Lthumb1_div_negative8)
+
+	lsls	divisor, divisor, #6
+	asrs	result, result, #6	/* Six more marker bits per extra scaling step.  */
+	cmp	curbit, divisor
+	bcc	LSYM(Lthumb1_div_negative8)
+
+	lsls	divisor, divisor, #6
+	asrs	result, result, #6
+	cmp	curbit, divisor
+	bcc	LSYM(Lthumb1_div_negative8)
+
+	lsls	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)	/* Divisor is zero after the shifts.  */
+	asrs	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsrs	divisor, divisor, #6	/* Undo one 6-bit scaling step and run the ladder again.  */
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)	/* Carry is the bit adcs shifted out of result.  */
+	DoDiv	#1
+	subs	divisor, dividend, divisor	/* Last bit: r1 = dividend - divisor when that does not borrow.  */
+	bcs	1f
+	mov	divisor, dividend	/* Borrowed, so the remainder is the dividend itself.  */
+
+1:	mov	curbit, ip
+	adcs	result, result
+	asrs	curbit, curbit, #1	/* Carry = quotient sign, N = dividend sign.  */
+	mov	dividend, result
+	bcc	2f
+	rsbs	dividend, dividend, #0	/* Negative quotient.  */
+	cmp	curbit, #0	/* Re-establish N from the sign word after rsbs changed the flags.  */
+
+2:	bpl	3f
+	rsbs	divisor, divisor, #0	/* Remainder takes the sign of the original dividend.  */
+
+3:	bx	lr
+
+LSYM(Ldivbyzero_negative):
+	mov	curbit, ip
+	asrs	curbit, curbit, #1	/* Carry = bit 0 of the sign word.  */
+	bcc	LSYM(Ldiv0)
+	rsbs	dividend, dividend, #0	/* Control then runs on into whatever follows this macro expansion.  */
+.endm
+
+/* ------------------------------------------------------------------------ */
+/*		Start of the Real Functions				    */
+/* ------------------------------------------------------------------------ */
+
+	FUNC_START aeabi_idiv0	/* __aeabi_idiv0: division-by-zero hook, this version only returns.  */
+	bx	lr
+	FUNC_END aeabi_idiv0
+
+	FUNC_START divsi3	/* __divsi3: signed 32-bit division of r0 by r1.  */
+	FUNC_ALIAS aeabi_idiv divsi3	/* __aeabi_idiv is the same entry point.  */
+
+LSYM(divsi3_skip_div0_test):	/* Also the target of the bl in __aeabi_idivmod.  */
+	mov	curbit, dividend
+	orrs	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)	/* At least one operand has its sign bit set.  */
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+	DIV_FUNC_END divsi3 signed	/* Places .Ldiv0 right here, where the negative path can run into it.  */
+
+	FUNC_START aeabi_idivmod	/* __aeabi_idivmod: quotient returned in r0, remainder in r1.  */
+
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)	/* Zero divisor: use the handler emitted with __divsi3.  */
+	push	{r0, r1, lr}
+	bl	LSYM(divsi3_skip_div0_test)
+	POP	{r1, r2, r3}	/* r1 = dividend, r2 = divisor, r3 = return address.  */
+	muls	r2, r0	/* r2 = quotient * divisor.  Flag-setting form, the 16-bit encoding under unified syntax.  */
+	subs	r1, r1, r2	/* Remainder = dividend - quotient * divisor.  The flags are not used afterwards.  */
+	bx	r3
+
+	FUNC_END aeabi_idivmod
+/* ------------------------------------------------------------------------ */
--- a/contrib/cortex-strings/src/thumb/strcmp-armv6m.S
+++ b/contrib/cortex-strings/src/thumb/strcmp-armv6m.S
@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Implementation of strcmp for ARMv6-M.  This version is only used on
+   ARMv6-M when an efficient implementation is wanted.  Otherwise, if
+   code size is preferred, strcmp-armv4t.S will be used.  */
+
+	.thumb_func
+	.syntax unified
+	.arch	armv6-m
+
+	.macro DoSub n, label	/* r0 -= r1, then go to label if r0 is non-zero or the NUL mask in r4 marks a byte compared so far.  */
+	subs	r0, r0, r1
+#ifdef __ARM_BIG_ENDIAN
+	lsrs	r1, r4, \n
+#else
+	lsls	r1, r4, \n
+#endif
+	orrs	r1, r0	/* Non-zero when either the difference or the shifted mask is non-zero.  */
+	bne	\label
+	.endm
+
+	.macro Byte_Test n, label	/* Compare (r2 >> n) with (r3 >> n) through DoSub.  */
+	lsrs	r0, r2, \n
+	lsrs	r1, r3, \n
+	DoSub	\n, \label
+	.endm
+
+	.text
+	.p2align	0
+	.global	strcmp
+	.type	strcmp, %function
+strcmp:	/* strcmp: first string in r0, second in r1, result in r0.  */
+	.cfi_startproc
+	mov	r2, r0
+	push	{r4, r5, r6, lr}
+	orrs	r2, r1
+	lsls	r2, r2, #30	/* Non-zero when either pointer is not word aligned.  */
+	bne	6f
+	ldr	r5, =0x01010101
+	lsls	r6, r5, #7	/* r6 = 0x80808080.  */
+1:	/* Aligned loop: one word from each string per pass.  */
+	ldmia	r0!, {r2}
+	ldmia	r1!, {r3}
+	subs	r4, r2, r5	/* r4 = (r2 - 0x01010101) & ~r2 & 0x80808080,  */
+	bics	r4, r2	/* which is non-zero when the word in r2 holds a NUL.  */
+	ands	r4, r6
+	beq	3f
+	/* A NUL was flagged in r2: compare the word piece by piece in string order.  */
+#ifdef __ARM_BIG_ENDIAN
+	Byte_Test #24, 4f
+	Byte_Test #16, 4f
+	Byte_Test #8, 4f
+	/* NOTE(review): on big-endian the borrow of the zero-byte test runs
+	   towards the first string byte, so r4 may also flag a 0x01 byte that
+	   is directly followed by NUL.  Confirm the early exits above still
+	   return the right value in that case.  */
+	b       7f
+3:
+	cmp     r2, r3
+	beq     1b
+	cmp     r2, r3
+#else
+	uxtb    r0, r2	/* First byte of each word.  */
+	uxtb    r1, r3
+	DoSub   #24, 2f
+
+	uxth    r0, r2	/* First two bytes.  */
+	uxth    r1, r3
+	DoSub   #16, 2f
+
+	lsls    r0, r2, #8	/* First three bytes.  */
+	lsls    r1, r3, #8
+	lsrs    r0, r0, #8
+	lsrs    r1, r1, #8
+	DoSub   #8, 2f
+
+	lsrs    r0, r2, #24	/* The first three bytes matched, the result is the difference of the last bytes.  */
+	lsrs    r1, r3, #24
+	subs    r0, r0, r1
+2:
+	pop     {r4, r5, r6, pc}
+
+3:	/* No NUL flagged in r2.  */
+	cmp     r2, r3
+	beq     1b
+	rev     r0, r2	/* Byte-reverse so that an unsigned compare follows string order.  */
+	rev     r1, r3
+	cmp     r0, r1
+#endif
+
+	bls	5f	/* Lower: return -1 at label 5, otherwise return 1.  */
+	movs	r0, #1
+4:
+	pop	{r4, r5, r6, pc}
+5:
+	movs	r0, #0
+	mvns	r0, r0	/* r0 = -1.  */
+	pop	{r4, r5, r6, pc}
+6:	/* Unaligned input: byte loop, unrolled twice.  */
+	ldrb	r2, [r0, #0]
+	ldrb	r3, [r1, #0]
+	adds	r0, #1
+	adds	r1, #1
+	cmp	r2, #0
+	beq	7f
+	cmp	r2, r3
+	bne	7f
+	ldrb	r2, [r0, #0]
+	ldrb	r3, [r1, #0]
+	adds	r0, #1
+	adds	r1, #1
+	cmp	r2, #0
+	beq	7f
+	cmp	r2, r3
+	beq	6b
+7:
+	subs	r0, r2, r3	/* Return r2 - r3.  */
+	pop	{r4, r5, r6, pc}
+	.cfi_endproc
+	.size	strcmp, . - strcmp