Import bionic's x86_64 optimized string routines

Add 'contrib/bionic-x86_64-string/' from commit 'd77520609f5240f5fad18fa1fd2275ac1de7cbb5'

git-subtree-dir: contrib/bionic-x86_64-string
git-subtree-mainline: 715f82e4f5
git-subtree-split: d77520609f

Requested by:	mjg
Sponsored by:	The FreeBSD Foundation
Author: Ed Maste
Date:   2022-08-16 13:26:59 -04:00
Commit: 8ddb146abc

14 changed files with 7055 additions and 0 deletions

@@ -0,0 +1,160 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <private/bionic_asm.h>
#include "cache.h"
#ifndef L
# define L(label) .L##label
#endif
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
.section .text.avx2,"ax",@progbits
ENTRY(__memset_chk_avx2)
# %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
cmp %rcx, %rdx
ja __memset_chk_fail
// Fall through to memset...
END(__memset_chk_avx2)
ENTRY(memset_avx2)
movq %rdi, %rax
and $0xff, %rsi
mov $0x0101010101010101, %rcx
imul %rsi, %rcx
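# %rcx now holds the fill byte replicated into all 8 byte lanes
# (c * 0x0101010101010101).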
cmpq $16, %rdx
jae L(16bytesormore)
testb $8, %dl
jnz L(8_15bytes)
testb $4, %dl
jnz L(4_7bytes)
testb $2, %dl
jnz L(2_3bytes)
testb $1, %dl
jz L(return)
movb %cl, (%rdi)
L(return):
ret
L(8_15bytes):
movq %rcx, (%rdi)
movq %rcx, -8(%rdi, %rdx)
ret
L(4_7bytes):
movl %ecx, (%rdi)
movl %ecx, -4(%rdi, %rdx)
ret
L(2_3bytes):
movw %cx, (%rdi)
movw %cx, -2(%rdi, %rdx)
ret
ALIGN (4)
L(16bytesormore):
movd %rcx, %xmm0
pshufd $0, %xmm0, %xmm0
movdqu %xmm0, (%rdi)
movdqu %xmm0, -16(%rdi, %rdx)
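# For 16 <= n <= 32 the two stores above overlap in the middle, so every
# byte is covered without an exact-length dispatch.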
cmpq $32, %rdx
jbe L(32bytesless)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, -32(%rdi, %rdx)
cmpq $64, %rdx
jbe L(64bytesless)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
movdqu %xmm0, -64(%rdi, %rdx)
movdqu %xmm0, -48(%rdi, %rdx)
cmpq $128, %rdx
jbe L(128bytesless)
vpbroadcastb %xmm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, -128(%rdi, %rdx)
vmovdqu %ymm0, -96(%rdi, %rdx)
cmpq $256, %rdx
ja L(256bytesmore)
L(32bytesless):
L(64bytesless):
L(128bytesless):
ret
ALIGN (4)
L(256bytesmore):
leaq 128(%rdi), %rcx
andq $-128, %rcx
movq %rdx, %r8
addq %rdi, %rdx
andq $-128, %rdx
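# %rcx = dst rounded up to a 128-byte boundary, %rdx = dst+n rounded down;
# the aligned loop fills [%rcx, %rdx) and the unaligned stores above
# already covered the head and tail.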
cmpq %rcx, %rdx
je L(return)
#ifdef SHARED_CACHE_SIZE
cmp $SHARED_CACHE_SIZE, %r8
#else
cmp __x86_64_shared_cache_size(%rip), %r8
#endif
ja L(256bytesmore_nt)
ALIGN (4)
L(256bytesmore_normal):
vmovdqa %ymm0, (%rcx)
vmovdqa %ymm0, 32(%rcx)
vmovdqa %ymm0, 64(%rcx)
vmovdqa %ymm0, 96(%rcx)
addq $128, %rcx
cmpq %rcx, %rdx
jne L(256bytesmore_normal)
ret
ALIGN (4)
L(256bytesmore_nt):
movntdq %xmm0, (%rcx)
movntdq %xmm0, 16(%rcx)
movntdq %xmm0, 32(%rcx)
movntdq %xmm0, 48(%rcx)
movntdq %xmm0, 64(%rcx)
movntdq %xmm0, 80(%rcx)
movntdq %xmm0, 96(%rcx)
movntdq %xmm0, 112(%rcx)
leaq 128(%rcx), %rcx
cmpq %rcx, %rdx
jne L(256bytesmore_nt)
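# Make the non-temporal stores globally visible before returning.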
sfence
ret
END(memset_avx2)
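The 16-to-32-byte path above relies on overlapping unaligned stores instead
of exact-length dispatch. A rough C model of the idea, with the helper name
invented for illustration:

#include <stdint.h>
#include <string.h>

static void set_16_to_32(unsigned char *dst, int c, size_t n)
{
    /* Replicate the fill byte across a 64-bit pattern, as the imul does. */
    uint64_t pat = 0x0101010101010101ULL * (unsigned char)c;
    unsigned char block[16];
    memcpy(block, &pat, 8);
    memcpy(block + 8, &pat, 8);
    memcpy(dst, block, 16);           /* head store */
    memcpy(dst + n - 16, block, 16);  /* tail store, overlaps when n < 32 */
}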

@@ -0,0 +1,36 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Values are optimized for Core Architecture */
#define SHARED_CACHE_SIZE (4096*1024) /* Core Architecture L2 Cache */
#define DATA_CACHE_SIZE (24*1024) /* Core Architecture L1 Data Cache */
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
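These constants gate when the routines switch to non-temporal stores: the
memmove loops switch at half the shared cache size, the memset loops at the
full size. A minimal hedged demo of the memmove threshold check:

#include <stdio.h>

#define SHARED_CACHE_SIZE (4096*1024)
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)

int main(void)
{
    unsigned long n = 8ul * 1024 * 1024;  /* 8 MiB working set */
    printf("%lu bytes -> %s stores\n", n,
           n >= SHARED_CACHE_SIZE_HALF ? "non-temporal" : "cached");
    return 0;
}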

@@ -0,0 +1,518 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cache.h"
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef L
# define L(label) .L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif
#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
.globl alias; \
.equ alias, original
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
#define CFI_PUSH(REG) \
cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
#define CFI_POP(REG) \
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
#define PUSH(REG) push REG;
#define POP(REG) pop REG;
#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;
.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
ENTRANCE
mov %rdi, %rax
/* Check whether we should copy backward or forward. */
cmp %rsi, %rdi
je L(mm_return)
jg L(mm_len_0_or_more_backward)
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
separately. */
cmp $16, %rdx
jbe L(mm_len_0_16_bytes_forward)
cmp $32, %rdx
ja L(mm_len_32_or_more_forward)
/* Copy [0..32] and return. */
movdqu (%rsi), %xmm0
movdqu -16(%rsi, %rdx), %xmm1
movdqu %xmm0, (%rdi)
movdqu %xmm1, -16(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_32_or_more_forward):
cmp $64, %rdx
ja L(mm_len_64_or_more_forward)
/* Copy [0..64] and return. */
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu -16(%rsi, %rdx), %xmm2
movdqu -32(%rsi, %rdx), %xmm3
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, -16(%rdi, %rdx)
movdqu %xmm3, -32(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_64_or_more_forward):
cmp $128, %rdx
ja L(mm_len_128_or_more_forward)
/* Copy [0..128] and return. */
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
movdqu -64(%rsi, %rdx), %xmm4
movdqu -48(%rsi, %rdx), %xmm5
movdqu -32(%rsi, %rdx), %xmm6
movdqu -16(%rsi, %rdx), %xmm7
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqu %xmm4, -64(%rdi, %rdx)
movdqu %xmm5, -48(%rdi, %rdx)
movdqu %xmm6, -32(%rdi, %rdx)
movdqu %xmm7, -16(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_128_or_more_forward):
/* Aligning the address of destination. */
/* save first unaligned 64 bytes */
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
lea 64(%rdi), %r8
and $-64, %r8 /* r8 now aligned to next 64 byte boundary */
sub %rdi, %rsi /* rsi = src - dst = diff */
movdqu (%r8, %rsi), %xmm4
movdqu 16(%r8, %rsi), %xmm5
movdqu 32(%r8, %rsi), %xmm6
movdqu 48(%r8, %rsi), %xmm7
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqa %xmm4, (%r8)
movaps %xmm5, 16(%r8)
movaps %xmm6, 32(%r8)
movaps %xmm7, 48(%r8)
add $64, %r8
lea (%rdi, %rdx), %rbx
and $-64, %rbx
cmp %r8, %rbx
jbe L(mm_copy_remaining_forward)
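/* Copies of at least half the shared cache size take the non-temporal
   path to avoid evicting useful cache lines. */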
cmp $SHARED_CACHE_SIZE_HALF, %rdx
jae L(mm_large_page_loop_forward)
.p2align 4
L(mm_main_loop_forward):
prefetcht0 128(%r8, %rsi)
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movdqa %xmm0, (%r8)
movaps %xmm1, 16(%r8)
movaps %xmm2, 32(%r8)
movaps %xmm3, 48(%r8)
lea 64(%r8), %r8
cmp %r8, %rbx
ja L(mm_main_loop_forward)
L(mm_copy_remaining_forward):
add %rdi, %rdx
sub %r8, %rdx
/* Everything up to %r8 in the dst has now been copied; %rdx holds how
many bytes remain. %r9 below is the matching source position. */
lea (%r8, %rsi), %r9
L(mm_remaining_0_64_bytes_forward):
cmp $32, %rdx
ja L(mm_remaining_33_64_bytes_forward)
cmp $16, %rdx
ja L(mm_remaining_17_32_bytes_forward)
test %rdx, %rdx
.p2align 4,,2
je L(mm_return)
cmpb $8, %dl
ja L(mm_remaining_9_16_bytes_forward)
cmpb $4, %dl
.p2align 4,,5
ja L(mm_remaining_5_8_bytes_forward)
cmpb $2, %dl
.p2align 4,,1
ja L(mm_remaining_3_4_bytes_forward)
movzbl -1(%r9,%rdx), %esi
movzbl (%r9), %ebx
movb %sil, -1(%r8,%rdx)
movb %bl, (%r8)
jmp L(mm_return)
L(mm_remaining_33_64_bytes_forward):
movdqu (%r9), %xmm0
movdqu 16(%r9), %xmm1
movdqu -32(%r9, %rdx), %xmm2
movdqu -16(%r9, %rdx), %xmm3
movdqu %xmm0, (%r8)
movdqu %xmm1, 16(%r8)
movdqu %xmm2, -32(%r8, %rdx)
movdqu %xmm3, -16(%r8, %rdx)
jmp L(mm_return)
L(mm_remaining_17_32_bytes_forward):
movdqu (%r9), %xmm0
movdqu -16(%r9, %rdx), %xmm1
movdqu %xmm0, (%r8)
movdqu %xmm1, -16(%r8, %rdx)
jmp L(mm_return)
L(mm_remaining_5_8_bytes_forward):
movl (%r9), %esi
movl -4(%r9,%rdx), %ebx
movl %esi, (%r8)
movl %ebx, -4(%r8,%rdx)
jmp L(mm_return)
L(mm_remaining_9_16_bytes_forward):
mov (%r9), %rsi
mov -8(%r9, %rdx), %rbx
mov %rsi, (%r8)
mov %rbx, -8(%r8, %rdx)
jmp L(mm_return)
L(mm_remaining_3_4_bytes_forward):
movzwl -2(%r9,%rdx), %esi
movzwl (%r9), %ebx
movw %si, -2(%r8,%rdx)
movw %bx, (%r8)
jmp L(mm_return)
L(mm_len_0_16_bytes_forward):
testb $24, %dl
jne L(mm_len_9_16_bytes_forward)
testb $4, %dl
.p2align 4,,5
jne L(mm_len_5_8_bytes_forward)
test %rdx, %rdx
.p2align 4,,2
je L(mm_return)
testb $2, %dl
.p2align 4,,1
jne L(mm_len_2_4_bytes_forward)
movzbl -1(%rsi,%rdx), %ebx
movzbl (%rsi), %esi
movb %bl, -1(%rdi,%rdx)
movb %sil, (%rdi)
jmp L(mm_return)
L(mm_len_2_4_bytes_forward):
movzwl -2(%rsi,%rdx), %ebx
movzwl (%rsi), %esi
movw %bx, -2(%rdi,%rdx)
movw %si, (%rdi)
jmp L(mm_return)
L(mm_len_5_8_bytes_forward):
movl (%rsi), %ebx
movl -4(%rsi,%rdx), %esi
movl %ebx, (%rdi)
movl %esi, -4(%rdi,%rdx)
jmp L(mm_return)
L(mm_len_9_16_bytes_forward):
mov (%rsi), %rbx
mov -8(%rsi, %rdx), %rsi
mov %rbx, (%rdi)
mov %rsi, -8(%rdi, %rdx)
jmp L(mm_return)
L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
the main loop stops. */
mov %rbx, %rdx
sub %rdi, %rdx
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
separately. */
cmp $16, %rdx
jbe L(mm_len_0_16_bytes_backward)
cmp $32, %rdx
ja L(mm_len_32_or_more_backward)
/* Copy [0..32] and return. */
movdqu (%rsi), %xmm0
movdqu -16(%rsi, %rdx), %xmm1
movdqu %xmm0, (%rdi)
movdqu %xmm1, -16(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_32_or_more_backward):
cmp $64, %rdx
ja L(mm_len_64_or_more_backward)
/* Copy [0..64] and return. */
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu -16(%rsi, %rdx), %xmm2
movdqu -32(%rsi, %rdx), %xmm3
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, -16(%rdi, %rdx)
movdqu %xmm3, -32(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_64_or_more_backward):
cmp $128, %rdx
ja L(mm_len_128_or_more_backward)
/* Copy [0..128] and return. */
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm1
movdqu 32(%rsi), %xmm2
movdqu 48(%rsi), %xmm3
movdqu -64(%rsi, %rdx), %xmm4
movdqu -48(%rsi, %rdx), %xmm5
movdqu -32(%rsi, %rdx), %xmm6
movdqu -16(%rsi, %rdx), %xmm7
movdqu %xmm0, (%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqu %xmm4, -64(%rdi, %rdx)
movdqu %xmm5, -48(%rdi, %rdx)
movdqu %xmm6, -32(%rdi, %rdx)
movdqu %xmm7, -16(%rdi, %rdx)
jmp L(mm_return)
L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save the last
64 bytes of the source, since the aligned stores below may overwrite
them when the regions overlap. */
movdqu -16(%rsi, %rdx), %xmm0
movdqu -32(%rsi, %rdx), %xmm1
movdqu -48(%rsi, %rdx), %xmm2
movdqu -64(%rsi, %rdx), %xmm3
lea (%rdi, %rdx), %r9
and $-64, %r9 /* r9 = aligned dst */
mov %rsi, %r8
sub %rdi, %r8 /* r8 = src - dst, diff */
movdqu -16(%r9, %r8), %xmm4
movdqu -32(%r9, %r8), %xmm5
movdqu -48(%r9, %r8), %xmm6
movdqu -64(%r9, %r8), %xmm7
movdqu %xmm0, -16(%rdi, %rdx)
movdqu %xmm1, -32(%rdi, %rdx)
movdqu %xmm2, -48(%rdi, %rdx)
movdqu %xmm3, -64(%rdi, %rdx)
movdqa %xmm4, -16(%r9)
movaps %xmm5, -32(%r9)
movaps %xmm6, -48(%r9)
movaps %xmm7, -64(%r9)
lea -64(%r9), %r9
lea 64(%rdi), %rbx
and $-64, %rbx
cmp %r9, %rbx
jae L(mm_recalc_len)
cmp $SHARED_CACHE_SIZE_HALF, %rdx
jae L(mm_large_page_loop_backward)
.p2align 4
L(mm_main_loop_backward):
prefetcht0 -128(%r9, %r8)
movdqu -64(%r9, %r8), %xmm0
movdqu -48(%r9, %r8), %xmm1
movdqu -32(%r9, %r8), %xmm2
movdqu -16(%r9, %r8), %xmm3
movdqa %xmm0, -64(%r9)
movaps %xmm1, -48(%r9)
movaps %xmm2, -32(%r9)
movaps %xmm3, -16(%r9)
lea -64(%r9), %r9
cmp %r9, %rbx
jb L(mm_main_loop_backward)
jmp L(mm_recalc_len)
/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
testb $24, %dl
jnz L(mm_len_9_16_bytes_backward)
testb $4, %dl
.p2align 4,,5
jnz L(mm_len_5_8_bytes_backward)
test %rdx, %rdx
.p2align 4,,2
je L(mm_return)
testb $2, %dl
.p2align 4,,1
jne L(mm_len_3_4_bytes_backward)
movzbl -1(%rsi,%rdx), %ebx
movzbl (%rsi), %ecx
movb %bl, -1(%rdi,%rdx)
movb %cl, (%rdi)
jmp L(mm_return)
L(mm_len_3_4_bytes_backward):
movzwl -2(%rsi,%rdx), %ebx
movzwl (%rsi), %ecx
movw %bx, -2(%rdi,%rdx)
movw %cx, (%rdi)
jmp L(mm_return)
L(mm_len_9_16_bytes_backward):
movl -4(%rsi,%rdx), %ebx
movl -8(%rsi,%rdx), %ecx
movl %ebx, -4(%rdi,%rdx)
movl %ecx, -8(%rdi,%rdx)
sub $8, %rdx
jmp L(mm_len_0_16_bytes_backward)
L(mm_len_5_8_bytes_backward):
movl (%rsi), %ebx
movl -4(%rsi,%rdx), %ecx
movl %ebx, (%rdi)
movl %ecx, -4(%rdi,%rdx)
L(mm_return):
RETURN
/* Big length copy forward part. */
.p2align 4
L(mm_large_page_loop_forward):
movdqu (%r8, %rsi), %xmm0
movdqu 16(%r8, %rsi), %xmm1
movdqu 32(%r8, %rsi), %xmm2
movdqu 48(%r8, %rsi), %xmm3
movntdq %xmm0, (%r8)
movntdq %xmm1, 16(%r8)
movntdq %xmm2, 32(%r8)
movntdq %xmm3, 48(%r8)
lea 64(%r8), %r8
cmp %r8, %rbx
ja L(mm_large_page_loop_forward)
sfence
jmp L(mm_copy_remaining_forward)
/* Big length copy backward part. */
.p2align 4
L(mm_large_page_loop_backward):
movdqu -64(%r9, %r8), %xmm0
movdqu -48(%r9, %r8), %xmm1
movdqu -32(%r9, %r8), %xmm2
movdqu -16(%r9, %r8), %xmm3
movntdq %xmm0, -64(%r9)
movntdq %xmm1, -48(%r9)
movntdq %xmm2, -32(%r9)
movntdq %xmm3, -16(%r9)
lea -64(%r9), %r9
cmp %r9, %rbx
jb L(mm_large_page_loop_backward)
sfence
jmp L(mm_recalc_len)
END (MEMMOVE)
ALIAS_SYMBOL(memcpy, MEMMOVE)
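memcpy is aliased to MEMMOVE above, so both entry points handle overlap. A
minimal C sketch of the direction choice made at the top of MEMMOVE
(byte-at-a-time only for clarity):

void *memmove_sketch(void *dst, const void *src, unsigned long n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    if (d == s)
        return dst;
    if (d < s)
        for (unsigned long i = 0; i < n; i++)
            d[i] = s[i];                  /* forward copy */
    else
        while (n--)
            d[n] = s[n];                  /* backward copy */
    return dst;
}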

@@ -0,0 +1,149 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <private/bionic_asm.h>
#include "cache.h"
#ifndef L
# define L(label) .L##label
#endif
#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif
ENTRY(__memset_chk_generic)
# %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
cmp %rcx, %rdx
ja __memset_chk_fail
// Fall through to memset...
END(__memset_chk_generic)
.section .text.sse2,"ax",@progbits
ENTRY(memset_generic)
movq %rdi, %rax
and $0xff, %rsi
mov $0x0101010101010101, %rcx
imul %rsi, %rcx
cmpq $16, %rdx
jae L(16bytesormore)
testb $8, %dl
jnz L(8_15bytes)
testb $4, %dl
jnz L(4_7bytes)
testb $2, %dl
jnz L(2_3bytes)
testb $1, %dl
jz L(return)
movb %cl, (%rdi)
L(return):
ret
L(8_15bytes):
movq %rcx, (%rdi)
movq %rcx, -8(%rdi, %rdx)
ret
L(4_7bytes):
movl %ecx, (%rdi)
movl %ecx, -4(%rdi, %rdx)
ret
L(2_3bytes):
movw %cx, (%rdi)
movw %cx, -2(%rdi, %rdx)
ret
ALIGN (4)
L(16bytesormore):
movd %rcx, %xmm0
pshufd $0, %xmm0, %xmm0
movdqu %xmm0, (%rdi)
movdqu %xmm0, -16(%rdi, %rdx)
cmpq $32, %rdx
jbe L(32bytesless)
movdqu %xmm0, 16(%rdi)
movdqu %xmm0, -32(%rdi, %rdx)
cmpq $64, %rdx
jbe L(64bytesless)
movdqu %xmm0, 32(%rdi)
movdqu %xmm0, 48(%rdi)
movdqu %xmm0, -64(%rdi, %rdx)
movdqu %xmm0, -48(%rdi, %rdx)
cmpq $128, %rdx
ja L(128bytesmore)
L(32bytesless):
L(64bytesless):
ret
ALIGN (4)
L(128bytesmore):
leaq 64(%rdi), %rcx
andq $-64, %rcx
movq %rdx, %r8
addq %rdi, %rdx
andq $-64, %rdx
cmpq %rcx, %rdx
je L(return)
#ifdef SHARED_CACHE_SIZE
cmp $SHARED_CACHE_SIZE, %r8
#else
cmp __x86_64_shared_cache_size(%rip), %r8
#endif
ja L(128bytesmore_nt)
ALIGN (4)
L(128bytesmore_normal):
movdqa %xmm0, (%rcx)
movaps %xmm0, 0x10(%rcx)
movaps %xmm0, 0x20(%rcx)
movaps %xmm0, 0x30(%rcx)
addq $64, %rcx
cmpq %rcx, %rdx
jne L(128bytesmore_normal)
ret
ALIGN (4)
L(128bytesmore_nt):
movntdq %xmm0, (%rcx)
movntdq %xmm0, 0x10(%rcx)
movntdq %xmm0, 0x20(%rcx)
movntdq %xmm0, 0x30(%rcx)
leaq 64(%rcx), %rcx
cmpq %rcx, %rdx
jne L(128bytesmore_nt)
sfence
ret
END(memset_generic)
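Both __memset_chk_* entry points implement the same _FORTIFY_SOURCE-style
check before falling through to the unchecked memset. A hedged C sketch,
with abort() standing in for bionic's __memset_chk_fail handler:

#include <stdlib.h>
#include <string.h>

void *memset_chk_sketch(void *dst, int c, size_t n, size_t dst_len)
{
    if (n > dst_len)
        abort();  /* the assembly tail-jumps to __memset_chk_fail instead */
    return memset(dst, c, n);
}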

@@ -0,0 +1,33 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_AS_STPCPY
#define STRCPY stpcpy
#include "sse2-strcpy-slm.S"

@@ -0,0 +1,34 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_AS_STRNCPY
#define USE_AS_STPCPY
#define STRCPY stpncpy
#include "sse2-strcpy-slm.S"

@@ -0,0 +1,87 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef STRCAT
# define STRCAT strcat
#endif
#ifndef L
# define L(label) .L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
#define USE_AS_STRCAT
.text
ENTRY (STRCAT)
mov %rdi, %r9
#ifdef USE_AS_STRNCAT
mov %rdx, %r8
#endif
#define RETURN jmp L(Strcpy)
#include "sse2-strlen-slm.S"
#undef RETURN
#define RETURN ret
L(Strcpy):
lea (%r9, %rax), %rdi
mov %rsi, %rcx
mov %r9, %rax /* save result */
#ifdef USE_AS_STRNCAT
test %r8, %r8
jz L(ExitZero)
# define USE_AS_STRNCPY
#endif
#include "sse2-strcpy-slm.S"

(File diff suppressed because it is too large.)

@@ -0,0 +1,294 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef USE_AS_STRCAT
#ifndef STRLEN
# define STRLEN strlen
#endif
#ifndef L
# define L(label) .L##label
#endif
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif
#ifndef ENTRY
# define ENTRY(name) \
.type name, @function; \
.globl name; \
.p2align 4; \
name: \
cfi_startproc
#endif
#ifndef END
# define END(name) \
cfi_endproc; \
.size name, .-name
#endif
#define RETURN ret
.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
/* end ifndef USE_AS_STRCAT */
#endif
xor %rax, %rax
mov %edi, %ecx
and $0x3f, %ecx
pxor %xmm0, %xmm0
cmp $0x30, %ecx
ja L(next)
movdqu (%rdi), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit_less16)
mov %rdi, %rax
and $-16, %rax
jmp L(align16_start)
L(next):
mov %rdi, %rax
and $-16, %rax
pcmpeqb (%rax), %xmm0
mov $-1, %r10d
sub %rax, %rcx
shl %cl, %r10d
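/* The effective shift count reduces to %rdi & 15; shifting the all-ones
   mask discards compare matches from bytes below the string start. */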
pmovmskb %xmm0, %edx
and %r10d, %edx
jnz L(exit)
L(align16_start):
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pcmpeqb 16(%rax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 80(%rax), %xmm0
add $80, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 16(%rax), %xmm1
add $16, %rax
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 16(%rax), %xmm2
add $16, %rax
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit)
test $0x3f, %rax
jz L(align64_loop)
pcmpeqb 16(%rax), %xmm3
add $16, %rax
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit)
add $16, %rax
.p2align 4
L(align64_loop):
movaps (%rax), %xmm4
pminub 16(%rax), %xmm4
movaps 32(%rax), %xmm5
pminub 48(%rax), %xmm5
add $64, %rax
pminub %xmm4, %xmm5
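/* %xmm5 = byte-wise minimum of the whole 64-byte block; it contains a
   zero byte iff the block contains a NUL terminator. */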
pcmpeqb %xmm0, %xmm5
pmovmskb %xmm5, %edx
test %edx, %edx
jz L(align64_loop)
pcmpeqb -64(%rax), %xmm0
sub $80, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
RETURN
.p2align 4
L(exit):
sub %rdi, %rax
L(exit_less16):
bsf %rdx, %rdx
add %rdx, %rax
RETURN
.p2align 4
L(exit16):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $16, %rax
RETURN
.p2align 4
L(exit32):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $32, %rax
RETURN
.p2align 4
L(exit48):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $48, %rax
RETURN
.p2align 4
L(exit64):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
#ifndef USE_AS_STRCAT
RETURN
END (STRLEN)
#endif
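The core of the scan above is pcmpeqb/pmovmskb/bsf: compare 16 bytes
against zero at once, turn the result into a bitmask, and find its lowest
set bit. A scalar model of one such step (__builtin_ctz assumes GCC/Clang):

static int first_nul_in_block(const unsigned char block[16])
{
    unsigned mask = 0;
    for (int i = 0; i < 16; i++)
        if (block[i] == 0)
            mask |= 1u << i;                      /* pcmpeqb + pmovmskb */
    return mask ? (int)__builtin_ctz(mask) : -1;  /* bsf */
}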

@@ -0,0 +1,33 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_AS_STRNCAT
#define STRCAT strncat
#include "sse2-strcat-slm.S"

@@ -0,0 +1,33 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_AS_STRNCPY
#define STRCPY strncpy
#include "sse2-strcpy-slm.S"

(File diff suppressed because it is too large.)

(File diff suppressed because it is too large.)

@@ -0,0 +1,33 @@
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define USE_AS_STRNCMP
#define STRCMP strncmp
#include "ssse3-strcmp-slm.S"