amd64: handle small memset buffers with overlapping stores

Instead of jumping to locations which each store an exact number of bytes,
use displacement addressing to slide the destination, so that one pair of
stores covers an entire size range.

In particular, the following fills an area of 8 to 16 bytes (inclusive)
branch-free:

movq    %r10,(%rdi)
movq    %r10,-8(%rdi,%rcx)

For instance, for %rcx of 10 the second store targets rdi + 10 - 8 = rdi + 2.
Writing 8 bytes starting at that offset overlaps 6 of the bytes written
previously and writes 2 new ones, giving 10 in total.
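The same trick rendered as a hypothetical C helper (illustration only, not
part of the patch), assuming the byte-replicated fill word built by the
imulq visible in the diff below:

#include <stdint.h>
#include <string.h>

/*
 * Illustration only: fill len bytes (8 <= len <= 16) with the replicated
 * byte in `fill', using two possibly overlapping 8-byte stores.  memcpy()
 * of a constant 8 is the portable spelling of an unaligned movq; compilers
 * lower it to a single store on amd64.
 */
static void
memset_8_16(char *p, uint64_t fill, size_t len)
{
	memcpy(p, &fill, 8);           /* bytes [0, 8) */
	memcpy(p + len - 8, &fill, 8); /* bytes [len - 8, len), may overlap */
}

For len = 10 the second store begins at p + 2, rewriting six bytes and
adding the final two.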

This provides a nice win for smaller stores. Results for larger ones are
erratic, depending on the microarchitecture.

The general idea is taken from NetBSD (which makes restricted use of the
trick) and the bionic string functions (which, like this patch, use it for
various size ranges).

Reviewed by:	kib (previous version)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D17660
Author:	Mateusz Guzik
Date:	2018-11-16 00:44:22 +00:00
Commit:	088ac3ef4b (parent 47c64f9e3e)
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=340472

2 changed files with 76 additions and 58 deletions

lib/libc/amd64/string/memset.S

@@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$");
 imulq %r8,%r10

 cmpq $32,%rcx
-jb 1016f
+jbe 101632f

 cmpq $256,%rcx
 ja 1256f

-1032:
+103200:
 movq %r10,(%rdi)
 movq %r10,8(%rdi)
 movq %r10,16(%rdi)
@@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$");
 leaq 32(%rdi),%rdi
 subq $32,%rcx
 cmpq $32,%rcx
-jae 1032b
-cmpb $0,%cl
-je 1000f
-1016:
+ja 103200b
 cmpb $16,%cl
-jl 1008f
+ja 201632f
+movq %r10,-16(%rdi,%rcx)
+movq %r10,-8(%rdi,%rcx)
+ret
+ALIGN_TEXT
+101632:
+cmpb $16,%cl
+jl 100816f
+201632:
 movq %r10,(%rdi)
 movq %r10,8(%rdi)
-subb $16,%cl
-jz 1000f
-leaq 16(%rdi),%rdi
-1008:
+movq %r10,-16(%rdi,%rcx)
+movq %r10,-8(%rdi,%rcx)
+ret
+ALIGN_TEXT
+100816:
 cmpb $8,%cl
-jl 1004f
+jl 100408f
 movq %r10,(%rdi)
-subb $8,%cl
-jz 1000f
-leaq 8(%rdi),%rdi
-1004:
+movq %r10,-8(%rdi,%rcx)
+ret
+ALIGN_TEXT
+100408:
 cmpb $4,%cl
-jl 1002f
+jl 100204f
 movl %r10d,(%rdi)
-subb $4,%cl
-jz 1000f
-leaq 4(%rdi),%rdi
-1002:
+movl %r10d,-4(%rdi,%rcx)
+ret
+ALIGN_TEXT
+100204:
 cmpb $2,%cl
-jl 1001f
+jl 100001f
 movw %r10w,(%rdi)
-subb $2,%cl
-jz 1000f
-leaq 2(%rdi),%rdi
-1001:
-cmpb $1,%cl
-jl 1000f
+movw %r10w,-2(%rdi,%rcx)
+ret
+ALIGN_TEXT
+100001:
+cmpb $0,%cl
+je 100000f
 movb %r10b,(%rdi)
-1000:
+100000:
 ret
 ALIGN_TEXT
 1256:
@@ -128,6 +134,7 @@ __FBSDID("$FreeBSD$");
 jmp 1b
 .endm

 ENTRY(memset)
 MEMSET erms=0
 END(memset)
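The new numeric labels appear to encode the byte range they finish: 101632
covers 16-32 bytes, 100816 covers 8-16, 100408 4-8, 100204 2-4 and 100001
0-1, each with one comparison and at most a pair of overlapping stores; the
32-byte loop for 33-256 byte buffers falls through into the same tail. A
hypothetical C sketch of that dispatch (illustration only, same replicated
fill word as above):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Hypothetical rendering of the patched 0-32 byte tail: every size class
 * ends in at most two overlapping stores per end, so no size needs the
 * old subtract-and-advance chain.
 */
static void
memset_small(char *p, uint64_t fill, size_t len)
{
	if (len >= 16) {        /* 16-32 bytes, cf. labels 101632/201632 */
		memcpy(p, &fill, 8);
		memcpy(p + 8, &fill, 8);
		memcpy(p + len - 16, &fill, 8);
		memcpy(p + len - 8, &fill, 8);
	} else if (len >= 8) {  /* 8-16, cf. 100816 */
		memcpy(p, &fill, 8);
		memcpy(p + len - 8, &fill, 8);
	} else if (len >= 4) {  /* 4-8, cf. 100408 */
		memcpy(p, &fill, 4);
		memcpy(p + len - 4, &fill, 4);
	} else if (len >= 2) {  /* 2-4, cf. 100204 */
		memcpy(p, &fill, 2);
		memcpy(p + len - 2, &fill, 2);
	} else if (len == 1) {  /* 0-1, cf. 100001 */
		*p = (char)fill;
	}
}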

sys/amd64/amd64/support.S

@@ -459,12 +459,12 @@ END(memcpy_erms)
 imulq %r8,%r10

 cmpq $32,%rcx
-jb 1016f
+jbe 101632f

 cmpq $256,%rcx
 ja 1256f

-1032:
+103200:
 movq %r10,(%rdi)
 movq %r10,8(%rdi)
 movq %r10,16(%rdi)
@@ -472,43 +472,54 @@ END(memcpy_erms)
 leaq 32(%rdi),%rdi
 subq $32,%rcx
 cmpq $32,%rcx
-jae 1032b
-cmpb $0,%cl
-je 1000f
-1016:
+ja 103200b
 cmpb $16,%cl
-jl 1008f
+ja 201632f
+movq %r10,-16(%rdi,%rcx)
+movq %r10,-8(%rdi,%rcx)
+POP_FRAME_POINTER
+ret
+ALIGN_TEXT
+101632:
+cmpb $16,%cl
+jl 100816f
+201632:
 movq %r10,(%rdi)
 movq %r10,8(%rdi)
-subb $16,%cl
-jz 1000f
-leaq 16(%rdi),%rdi
-1008:
+movq %r10,-16(%rdi,%rcx)
+movq %r10,-8(%rdi,%rcx)
+POP_FRAME_POINTER
+ret
+ALIGN_TEXT
+100816:
 cmpb $8,%cl
-jl 1004f
+jl 100408f
 movq %r10,(%rdi)
-subb $8,%cl
-jz 1000f
-leaq 8(%rdi),%rdi
-1004:
+movq %r10,-8(%rdi,%rcx)
+POP_FRAME_POINTER
+ret
+ALIGN_TEXT
+100408:
 cmpb $4,%cl
-jl 1002f
+jl 100204f
 movl %r10d,(%rdi)
-subb $4,%cl
-jz 1000f
-leaq 4(%rdi),%rdi
-1002:
+movl %r10d,-4(%rdi,%rcx)
+POP_FRAME_POINTER
+ret
+ALIGN_TEXT
+100204:
 cmpb $2,%cl
-jl 1001f
+jl 100001f
 movw %r10w,(%rdi)
-subb $2,%cl
-jz 1000f
-leaq 2(%rdi),%rdi
-1001:
-cmpb $1,%cl
-jl 1000f
+movw %r10w,-2(%rdi,%rcx)
+POP_FRAME_POINTER
+ret
+ALIGN_TEXT
+100001:
+cmpb $0,%cl
+je 100000f
 movb %r10b,(%rdi)
-1000:
+100000:
 POP_FRAME_POINTER
 ret
 ALIGN_TEXT