amd64: make memmove and memcpy less slow with mov

The reasoning is the same as with the memset change, see r339205
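The gist, as with memset: rep movs has a sizable startup cost, so copies up to 256 bytes are done with plain mov (an unrolled 32-byte loop for mid sizes, a power-of-two tail ladder below 32 bytes), and only larger copies use rep. A minimal C sketch of that dispatch; the names are hypothetical and the committed version is the assembly below:

#include <stddef.h>

/*
 * Hypothetical sketch of the size dispatch implemented in assembly
 * below; illustration only, not the committed interface.
 */
static void *
sketch_memcpy(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len > 256) {
		/* Large: rep movs amortizes its startup cost.  (The
		 * non-ERMS variant uses rep movsq plus the tail ladder.) */
		__asm__ volatile("rep movsb"
		    : "+D"(d), "+S"(s), "+c"(len) :: "memory");
		return (dst);
	}
	while (len >= 32) {
		/* Mid-size: unrolled 32-byte plain-mov loop. */
		__builtin_memcpy(d, s, 32);
		d += 32; s += 32; len -= 32;
	}
	/* Tail below 32 bytes: descending power-of-two ladder. */
	if (len >= 16) { __builtin_memcpy(d, s, 16); d += 16; s += 16; len -= 16; }
	if (len >= 8)  { __builtin_memcpy(d, s, 8);  d += 8;  s += 8;  len -= 8; }
	if (len >= 4)  { __builtin_memcpy(d, s, 4);  d += 4;  s += 4;  len -= 4; }
	if (len >= 2)  { __builtin_memcpy(d, s, 2);  d += 2;  s += 2;  len -= 2; }
	if (len == 1)  { *d = *s; }
	return (dst);
}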

Reviewed by:	kib (previous version)
Approved by:	re (gjb)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D17441
Author:	mjg
Date:	2018-10-11 23:37:57 +00:00
Commit:	99479972f8 (parent: 7346592324)

@@ -200,82 +200,236 @@ END(memcmp)
* Adapted from bcopy written by:
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
/*
* Register state at entry is supposed to be as follows:
* rdi - destination
* rsi - source
* rdx - count
*
* The macro possibly clobbers the above and: rcx, r8.
* It does not clobber rax, r10 nor r11.
*/
.macro MEMMOVE erms overlap begin end
\begin
.if \overlap == 1
movq %rdi,%r8
subq %rsi,%r8
cmpq %rcx,%r8 /* overlapping && src < dst? */
jb 2f
.endif
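The overlap test above folds two range checks into one unsigned compare: dst - src wraps to a huge value when src is above dst, so comparing the difference against the count is true exactly when a backward copy is required. An illustrative C equivalent of the predicate:

#include <stddef.h>
#include <stdint.h>

/* Backward copy is needed iff src < dst && dst < src + len; with
 * unsigned wraparound both checks collapse into one compare. */
static int
need_backward_copy(const void *dst, const void *src, size_t len)
{
	return ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len);
}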
cmpq $32,%rcx
jb 1016f
cmpq $256,%rcx
ja 1256f
1032:
movq (%rsi),%rdx
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
movq %rdx,8(%rdi)
movq 16(%rsi),%rdx
movq %rdx,16(%rdi)
movq 24(%rsi),%rdx
movq %rdx,24(%rdi)
leaq 32(%rsi),%rsi
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
jae 1032b
cmpb $0,%cl
jne 1016f
\end
ret
ALIGN_TEXT
1016:
cmpb $16,%cl
jl 1008f
movq (%rsi),%rdx
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
movq %rdx,8(%rdi)
subb $16,%cl
jz 1000f
leaq 16(%rsi),%rsi
leaq 16(%rdi),%rdi
1008:
cmpb $8,%cl
jl 1004f
movq (%rsi),%rdx
movq %rdx,(%rdi)
subb $8,%cl
jz 1000f
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
1004:
cmpb $4,%cl
jl 1002f
movl (%rsi),%edx
movl %edx,(%rdi)
subb $4,%cl
jz 1000f
leaq 4(%rsi),%rsi
leaq 4(%rdi),%rdi
1002:
cmpb $2,%cl
jl 1001f
movw (%rsi),%dx
movw %dx,(%rdi)
subb $2,%cl
jz 1000f
leaq 2(%rsi),%rsi
leaq 2(%rdi),%rdi
1001:
cmpb $1,%cl
jl 1000f
movb (%rsi),%dl
movb %dl,(%rdi)
1000:
\end
ret
ALIGN_TEXT
1256:
.if \erms == 1
rep
movsb
.else
shrq $3,%rcx /* copy by 64-bit words */
rep
movsq
movq %rdx,%rcx
andb $7,%cl /* any bytes left? */
jne 1004b
.endif
\end
ret
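Because \erms is evaluated when the macro expands, memmove_std and memmove_erms end up as two complete specializations of this body with no runtime flag. A hedged C analog of the same compile-time specialization (names hypothetical, tail handling elided as noted):

#include <stddef.h>

/* One body, two compile-time specializations; the inline-asm blocks
 * stand in for the rep movsb / rep movsq paths above. */
static inline __attribute__((always_inline)) void
copy_large(unsigned char *d, const unsigned char *s, size_t len, int erms)
{
	if (erms) {
		__asm__ volatile("rep movsb"
		    : "+D"(d), "+S"(s), "+c"(len) :: "memory");
	} else {
		size_t words = len >> 3;

		__asm__ volatile("rep movsq"
		    : "+D"(d), "+S"(s), "+c"(words) :: "memory");
		/* The asm instead jumps back into the tail ladder
		 * for the remaining len & 7 bytes. */
	}
}

void
copy_large_std(unsigned char *d, const unsigned char *s, size_t len)
{
	copy_large(d, s, len, 0);
}

void
copy_large_erms(unsigned char *d, const unsigned char *s, size_t len)
{
	copy_large(d, s, len, 1);
}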
/* ALIGN_TEXT */
.if \overlap == 1
/*
* Copy backwards.
*/
ALIGN_TEXT
2:
addq %rcx,%rdi
addq %rcx,%rsi
cmpq $32,%rcx
jb 2016f
cmpq $256,%rcx
ja 2256f
2032:
movq -8(%rsi),%rdx
movq %rdx,-8(%rdi)
movq -16(%rsi),%rdx
movq %rdx,-16(%rdi)
movq -24(%rsi),%rdx
movq %rdx,-24(%rdi)
movq -32(%rsi),%rdx
movq %rdx,-32(%rdi)
leaq -32(%rsi),%rsi
leaq -32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
jae 2032b
cmpb $0,%cl
jne 2016f
\end
ret
ALIGN_TEXT
2016:
cmpb $16,%cl
jl 2008f
movq -8(%rsi),%rdx
movq %rdx,-8(%rdi)
movq -16(%rsi),%rdx
movq %rdx,-16(%rdi)
subb $16,%cl
jz 2000f
leaq -16(%rsi),%rsi
leaq -16(%rdi),%rdi
2008:
cmpb $8,%cl
jl 2004f
movq -8(%rsi),%rdx
movq %rdx,-8(%rdi)
subb $8,%cl
jz 2000f
leaq -8(%rsi),%rsi
leaq -8(%rdi),%rdi
2004:
cmpb $4,%cl
jl 2002f
movl -4(%rsi),%edx
movl %edx,-4(%rdi)
subb $4,%cl
jz 2000f
leaq -4(%rsi),%rsi
leaq -4(%rdi),%rdi
2002:
cmpb $2,%cl
jl 2001f
movw -2(%rsi),%dx
movw %dx,-2(%rdi)
subb $2,%cl
jz 2000f
leaq -2(%rsi),%rsi
leaq -2(%rdi),%rdi
2001:
cmpb $1,%cl
jl 2000f
movb -1(%rsi),%dl
movb %dl,-1(%rdi)
2000:
\end
ret
ALIGN_TEXT
2256:
decq %rdi
decq %rsi
std
.if \erms == 1
rep
movsb
.else
andq $7,%rcx /* any fractional bytes? */
je 3f
rep
movsb
3:
movq %rdx,%rcx /* copy remainder by 64-bit words */
shrq $3,%rcx
subq $7,%rsi
subq $7,%rdi
rep
movsq
.endif
cld
\end
ret
.endif
.endm
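For the overlapping large-copy case above, the code points rsi/rdi at the last byte and runs the string ops with the direction flag set, clearing it again before return since the ABI assumes DF is clear. A user-space sketch of that backward rep movsb, for illustration only:

#include <stddef.h>

/* Sketch: backward byte copy via std/rep movsb/cld, safe when the
 * buffers overlap with src below dst. */
static void
copy_backward_rep(unsigned char *dst, const unsigned char *src, size_t len)
{
	unsigned char *d;
	const unsigned char *s;

	if (len == 0)
		return;
	d = dst + len - 1;	/* last destination byte */
	s = src + len - 1;	/* last source byte */
	__asm__ volatile(
	    "std\n\t"
	    "rep movsb\n\t"
	    "cld"		/* callers expect DF clear on return */
	    : "+D"(d), "+S"(s), "+c"(len) :: "memory");
}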
.macro MEMMOVE_BEGIN
PUSH_FRAME_POINTER
movq %rdi,%rax
movq %rdx,%rcx
.endm
.macro MEMMOVE_END
POP_FRAME_POINTER
ret
.endm
ENTRY(memmove_std)
MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove_std)
ENTRY(memmove_erms)
MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memmove_erms)
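Nothing in this file chooses between the _std and _erms variants at run time; the kernel binds one of them once at boot based on the ERMS CPUID feature (leaf 7, EBX bit 9). A hedged user-space analog, using a constructor-initialized function pointer in place of the kernel's ifunc machinery and assuming the two variants above are linked in:

#include <cpuid.h>
#include <stddef.h>

void	*memmove_std(void *, const void *, size_t);
void	*memmove_erms(void *, const void *, size_t);

static void *(*memmove_impl)(void *, const void *, size_t) = memmove_std;

/* ERMS: CPUID.(EAX=7,ECX=0):EBX bit 9, "enhanced rep movsb/stosb". */
static int
have_erms(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
		return (0);
	return ((ebx & (1u << 9)) != 0);
}

__attribute__((constructor)) static void
memmove_pick(void)
{
	if (have_erms())
		memmove_impl = memmove_erms;
}

void *
my_memmove(void *dst, const void *src, size_t len)
{
	return (memmove_impl(dst, src, len));
}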
@@ -285,35 +439,11 @@ END(memmove_erms)
/*
 * memcpy(dst, src, len)
 *        rdi, rsi, rdx
 * Note: memcpy does not support overlapping copies
 */
ENTRY(memcpy_std)
MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_std)
ENTRY(memcpy_erms)
MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
END(memcpy_erms)
/*