Mateusz Guzik 088ac3ef4b amd64: handle small memset buffers with overlapping stores
Instead of jumping to locations which store the exact number of bytes,
use the requested length as a displacement to position overlapping stores.

In particular, the following clears an area of 8 to 16 bytes (inclusive)
branch-free:

movq    %r10,(%rdi)
movq    %r10,-8(%rdi,%rcx)

For instance, for %rcx of 10 the second store targets rdi + 10 - 8 = rdi + 2.
Writing 8 bytes starting at that offset overlaps with 6 bytes written
previously and adds 2 new ones, giving 10 in total.
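
As an illustration only (not part of the patch), the same overlapping-store
idea can be sketched in C for lengths between 8 and 16, with memcpy()
standing in for the unaligned 8-byte stores; set8to16() is a hypothetical
helper name:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch: fill 8 to 16 bytes with two overlapping 8-byte
 * stores, mirroring the two movq instructions above.
 */
static void
set8to16(unsigned char *dst, unsigned char c, size_t len)
{
	uint64_t pattern = 0x0101010101010101ULL * c;	/* replicate the byte */

	assert(len >= 8 && len <= 16);
	memcpy(dst, &pattern, 8);		/* movq %r10,(%rdi) */
	memcpy(dst + len - 8, &pattern, 8);	/* movq %r10,-8(%rdi,%rcx) */
}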

This provides a nice win for smaller stores. Results for other sizes are
erratic and depend on the microarchitecture.

The general idea is taken from NetBSD (which uses the trick in a restricted
way) and the bionic string functions (which apply it to various ranges, as
done in this patch).

Reviewed by:	kib (previous version)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D17660
2018-11-16 00:44:22 +00:00


/*-
 * Copyright (c) 2018 The FreeBSD Foundation
 *
 * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define	ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
.macro MEMSET erms
	movq	%rdi,%rax		/* memset returns the destination */
	movq	%rdx,%rcx
	movzbq	%sil,%r8
	movabs	$0x0101010101010101,%r10
	imulq	%r8,%r10		/* replicate the fill byte into all 8 bytes of %r10 */

	cmpq	$32,%rcx
	jbe	101632f			/* 32 bytes or fewer: overlapping stores */

	cmpq	$256,%rcx
	ja	1256f			/* more than 256 bytes: rep stos */
103200:
	/* 33 to 256 bytes: store 32 bytes at a time while more than 32 remain. */
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r10,24(%rdi)
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	ja	103200b
	/* Finish the 1..32 byte tail with possibly overlapping stores. */
	cmpb	$16,%cl
	ja	201632f
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	ret
	ALIGN_TEXT
101632:
	/* At most 32 bytes; sizes below 16 are dispatched further down. */
	cmpb	$16,%cl
	jl	100816f
201632:
	/* 16 to 32 bytes: store the first and the last 16, overlapping in the middle. */
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	ret
	ALIGN_TEXT
100816:
	/* 8 to 15 bytes: two overlapping 8-byte stores. */
	cmpb	$8,%cl
	jl	100408f
	movq	%r10,(%rdi)
	movq	%r10,-8(%rdi,%rcx)
	ret
	ALIGN_TEXT
100408:
	/* 4 to 7 bytes: two overlapping 4-byte stores. */
	cmpb	$4,%cl
	jl	100204f
	movl	%r10d,(%rdi)
	movl	%r10d,-4(%rdi,%rcx)
	ret
	ALIGN_TEXT
100204:
	/* 2 to 3 bytes: two overlapping 2-byte stores. */
	cmpb	$2,%cl
	jl	100001f
	movw	%r10w,(%rdi)
	movw	%r10w,-2(%rdi,%rcx)
	ret
	ALIGN_TEXT
100001:
	/* 0 or 1 byte. */
	cmpb	$0,%cl
	je	100000f
	movb	%r10b,(%rdi)
100000:
	ret
	ALIGN_TEXT
1256:
	/* More than 256 bytes: rep stos, aligning the destination first. */
	movq	%rdi,%r9		/* rep stos clobbers %rax; keep the return value in %r9 */
	movq	%r10,%rax
	testl	$15,%edi
	jnz	3f			/* unaligned destination: fix up first */
1:
.if \erms == 1
	rep
	stosb
	movq	%r9,%rax
.else
	movq	%rcx,%rdx
	shrq	$3,%rcx
	rep
	stosq
	movq	%r9,%rax
	andl	$7,%edx			/* up to 7 trailing bytes */
	jnz	2f
	ret
2:
	movq	%r10,-8(%rdi,%rdx)	/* overlapping store for the tail */
.endif
	ret
	ALIGN_TEXT
3:
	/*
	 * Store 16 bytes at the unaligned start, then advance %rdi to the
	 * next 16-byte boundary and reduce %rcx accordingly.
	 */
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%rdi,%r8
	andq	$15,%r8
	leaq	-16(%rcx,%r8),%rcx
	neg	%r8
	leaq	16(%rdi,%r8),%rdi
	jmp	1b
.endm
ENTRY(memset)
	MEMSET erms=0
END(memset)

	.section .note.GNU-stack,"",%progbits
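
For reference, a sanity check of my own (not part of the commit) that
exercises memset() across the size classes handled above (0-1, 2-4, 4-8,
8-16, 16-32, 32-256 and the rep stos path) could look like this:

#include <assert.h>
#include <string.h>

int
main(void)
{
	static const size_t sizes[] = { 0, 1, 2, 3, 4, 7, 8, 15, 16, 31,
	    32, 33, 255, 256, 257, 300 };
	unsigned char buf[512];

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		memset(buf, 0xa5, sizeof(buf));		/* poison */
		memset(buf, 0x5a, sizes[i]);		/* size class under test */
		for (size_t j = 0; j < sizeof(buf); j++)
			assert(buf[j] == (j < sizes[i] ? 0x5a : 0xa5));
	}
	return (0);
}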