amd64: handle small memset buffers with overlapping stores
Instead of jumping to locations which store the exact number of bytes, use displacement to move the destination.

In particular, the following clears an area of 8 to 16 bytes (inclusive) branch-free:

    movq %r10,(%rdi)
    movq %r10,-8(%rdi,%rcx)

For instance, for an rcx of 10 the second line stores at rdi + 10 - 8 = rdi + 2. Writing 8 bytes starting at that offset overlaps with 6 bytes written previously and writes 2 new ones, giving 10 in total.

Provides a nice win for smaller stores. Results for other sizes are erratic, depending on the microarchitecture.

General idea taken from NetBSD (restricted use of the trick) and bionic string functions (use for various ranges like in this patch).

Reviewed by: kib (previous version)
Sponsored by: The FreeBSD Foundation
Differential Revision: https://reviews.freebsd.org/D17660
This commit is contained in:
parent
47c64f9e3e
commit
088ac3ef4b
@ -41,12 +41,12 @@ __FBSDID("$FreeBSD$");
|
||||
imulq %r8,%r10
|
||||
|
||||
cmpq $32,%rcx
|
||||
jb 1016f
|
||||
jbe 101632f
|
||||
|
||||
cmpq $256,%rcx
|
||||
ja 1256f
|
||||
|
||||
1032:
|
||||
103200:
|
||||
movq %r10,(%rdi)
|
||||
movq %r10,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
@ -54,43 +54,49 @@ __FBSDID("$FreeBSD$");
|
||||
leaq 32(%rdi),%rdi
|
||||
subq $32,%rcx
|
||||
cmpq $32,%rcx
|
||||
jae 1032b
|
||||
cmpb $0,%cl
|
||||
je 1000f
|
||||
1016:
|
||||
ja 103200b
|
||||
cmpb $16,%cl
|
||||
jl 1008f
|
||||
ja 201632f
|
||||
movq %r10,-16(%rdi,%rcx)
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
101632:
|
||||
cmpb $16,%cl
|
||||
jl 100816f
|
||||
201632:
|
||||
movq %r10,(%rdi)
|
||||
movq %r10,8(%rdi)
|
||||
subb $16,%cl
|
||||
jz 1000f
|
||||
leaq 16(%rdi),%rdi
|
||||
1008:
|
||||
movq %r10,-16(%rdi,%rcx)
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100816:
|
||||
cmpb $8,%cl
|
||||
jl 1004f
|
||||
jl 100408f
|
||||
movq %r10,(%rdi)
|
||||
subb $8,%cl
|
||||
jz 1000f
|
||||
leaq 8(%rdi),%rdi
|
||||
1004:
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100408:
|
||||
cmpb $4,%cl
|
||||
jl 1002f
|
||||
jl 100204f
|
||||
movl %r10d,(%rdi)
|
||||
subb $4,%cl
|
||||
jz 1000f
|
||||
leaq 4(%rdi),%rdi
|
||||
1002:
|
||||
movl %r10d,-4(%rdi,%rcx)
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100204:
|
||||
cmpb $2,%cl
|
||||
jl 1001f
|
||||
jl 100001f
|
||||
movw %r10w,(%rdi)
|
||||
subb $2,%cl
|
||||
jz 1000f
|
||||
leaq 2(%rdi),%rdi
|
||||
1001:
|
||||
cmpb $1,%cl
|
||||
jl 1000f
|
||||
movw %r10w,-2(%rdi,%rcx)
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100001:
|
||||
cmpb $0,%cl
|
||||
je 100000f
|
||||
movb %r10b,(%rdi)
|
||||
1000:
|
||||
100000:
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
1256:
|
||||
@ -128,6 +134,7 @@ __FBSDID("$FreeBSD$");
|
||||
jmp 1b
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * memset entry point: the body is produced by invoking MEMSET with erms=0.
 * NOTE(review): erms=0 presumably selects the variant that does not rely on
 * Enhanced REP MOVSB/STOSB; confirm against the macro definition, which
 * lies outside this excerpt.
 */
ENTRY(memset)
|
||||
MEMSET erms=0
|
||||
END(memset)
|
||||
|
@ -459,12 +459,12 @@ END(memcpy_erms)
|
||||
imulq %r8,%r10
|
||||
|
||||
cmpq $32,%rcx
|
||||
jb 1016f
|
||||
jbe 101632f
|
||||
|
||||
cmpq $256,%rcx
|
||||
ja 1256f
|
||||
|
||||
1032:
|
||||
103200:
|
||||
movq %r10,(%rdi)
|
||||
movq %r10,8(%rdi)
|
||||
movq %r10,16(%rdi)
|
||||
@ -472,43 +472,54 @@ END(memcpy_erms)
|
||||
leaq 32(%rdi),%rdi
|
||||
subq $32,%rcx
|
||||
cmpq $32,%rcx
|
||||
jae 1032b
|
||||
cmpb $0,%cl
|
||||
je 1000f
|
||||
1016:
|
||||
ja 103200b
|
||||
cmpb $16,%cl
|
||||
jl 1008f
|
||||
ja 201632f
|
||||
movq %r10,-16(%rdi,%rcx)
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
101632:
|
||||
cmpb $16,%cl
|
||||
jl 100816f
|
||||
201632:
|
||||
movq %r10,(%rdi)
|
||||
movq %r10,8(%rdi)
|
||||
subb $16,%cl
|
||||
jz 1000f
|
||||
leaq 16(%rdi),%rdi
|
||||
1008:
|
||||
movq %r10,-16(%rdi,%rcx)
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100816:
|
||||
cmpb $8,%cl
|
||||
jl 1004f
|
||||
jl 100408f
|
||||
movq %r10,(%rdi)
|
||||
subb $8,%cl
|
||||
jz 1000f
|
||||
leaq 8(%rdi),%rdi
|
||||
1004:
|
||||
movq %r10,-8(%rdi,%rcx)
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100408:
|
||||
cmpb $4,%cl
|
||||
jl 1002f
|
||||
jl 100204f
|
||||
movl %r10d,(%rdi)
|
||||
subb $4,%cl
|
||||
jz 1000f
|
||||
leaq 4(%rdi),%rdi
|
||||
1002:
|
||||
movl %r10d,-4(%rdi,%rcx)
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100204:
|
||||
cmpb $2,%cl
|
||||
jl 1001f
|
||||
jl 100001f
|
||||
movw %r10w,(%rdi)
|
||||
subb $2,%cl
|
||||
jz 1000f
|
||||
leaq 2(%rdi),%rdi
|
||||
1001:
|
||||
cmpb $1,%cl
|
||||
jl 1000f
|
||||
movw %r10w,-2(%rdi,%rcx)
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
100001:
|
||||
cmpb $0,%cl
|
||||
je 100000f
|
||||
movb %r10b,(%rdi)
|
||||
1000:
|
||||
100000:
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
ALIGN_TEXT
|
||||
|
Loading…
x
Reference in New Issue
Block a user