amd64: tidy up copying backwards in memmove

For the non-ERMS case the code used to handle possible trailing bytes
with movsb first and then followed it up with movsq. This also
happened to alter how offset calculations were done for the other
cases.
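
In rough C terms, the old non-ERMS rep path (taken for copies larger
than 256 bytes) amounted to the following; this is an illustrative
sketch with an invented helper name, not the committed code:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Model of the old backward rep copy: with the direction flag set,
 * the len % 8 trailing bytes are copied first (rep movsb), then the
 * rest is moved as whole 8-byte words (rep movsq).
 */
static void
old_backward_copy(unsigned char *dst, const unsigned char *src, size_t len)
{
	uint64_t q;
	size_t i = len;

	while (i & 7) {				/* std; rep movsb */
		i--;
		dst[i] = src[i];
	}
	while (i > 0) {				/* rep movsq */
		i -= 8;
		memcpy(&q, src + i, 8);		/* load qword into a register */
		memcpy(dst + i, &q, 8);		/* store it */
	}
}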

Handle the tail with regular movs, just like when copying forward.
Use leaq to calculate the right offset from the get-go, instead of
doing a separate add and sub.

This adjusts the offsets for the non-rep cases so that they can also
be used to handle the tail.
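
For illustration, here is a C model of the resulting scheme (again a
sketch with an invented name, not the committed code): the leaq bias
points both registers at the last qword, and the tail cases then peel
off the final 16/8/4/2/1 bytes with plain loads and stores, mirroring
labels 2016 through 2001 in the diff below.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Model of the reworked tail for a backward copy with len < 32 (the
 * 32-byte loop is elided).  Invariant: the uncopied bytes are the
 * "len" bytes ending at p + 8, which is what leaq -8(%reg,%rcx),%reg
 * establishes.  For len < 8 the biased pointers sit below the start
 * of the buffer; that is harmless in the assembly and appears here
 * purely as a model.
 */
static void
new_backward_tail(unsigned char *dst, const unsigned char *src, size_t len)
{
	const unsigned char *s = src + len - 8;	/* leaq -8(%rsi,%rcx),%rsi */
	unsigned char *d = dst + len - 8;	/* leaq -8(%rdi,%rcx),%rdi */
	uint64_t q;
	uint32_t l;
	uint16_t w;

	if (len & 16) {		/* 2016: last two qwords */
		memcpy(&q, s, 8); memcpy(d, &q, 8);
		memcpy(&q, s - 8, 8); memcpy(d - 8, &q, 8);
		d -= 16; s -= 16;
	}
	if (len & 8) {		/* 2008: one qword */
		memcpy(&q, s, 8); memcpy(d, &q, 8);
		d -= 8; s -= 8;
	}
	if (len & 4) {		/* 2004: movl 4(%rsi),%edx */
		memcpy(&l, s + 4, 4); memcpy(d + 4, &l, 4);
		d -= 4; s -= 4;
	}
	if (len & 2) {		/* 2002: movw 6(%rsi),%dx */
		memcpy(&w, s + 6, 2); memcpy(d + 6, &w, 2);
		d -= 2; s -= 2;
	}
	if (len & 1)		/* 2001: movb 7(%rsi),%dl */
		d[7] = s[7];
}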

The routine is still a work in progress.

Sponsored by:	The FreeBSD Foundation
Author:	Mateusz Guzik
Date:	2018-11-30 00:45:10 +00:00
Parent:	31562c4440
Commit:	dd219e5ea5
Notes:	svn2git 2020-12-20 02:59:44 +00:00
	svn path=/head/; revision=341272
2 changed files with 52 additions and 56 deletions

@@ -150,24 +150,24 @@ __FBSDID("$FreeBSD$");
 	 */
 	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi
-	addq	%rcx,%rsi
+	cmpq	$256,%rcx
+	ja	2256f
+
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
-
-	cmpq	$256,%rcx
-	ja	2256f
 
 2032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
-	movq	-32(%rsi),%rdx
-	movq	%rdx,-32(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
@@ -181,10 +181,10 @@ __FBSDID("$FreeBSD$");
 2016:
 	cmpb	$16,%cl
 	jl	2008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
-	movq	-16(%rsi),%rdx
-	movq	%rdx,-16(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
@@ -192,8 +192,8 @@ __FBSDID("$FreeBSD$");
 2008:
 	cmpb	$8,%cl
 	jl	2004f
-	movq	-8(%rsi),%rdx
-	movq	%rdx,-8(%rdi)
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
@@ -201,8 +201,8 @@ __FBSDID("$FreeBSD$");
 2004:
 	cmpb	$4,%cl
 	jl	2002f
-	movl	-4(%rsi),%edx
-	movl	%edx,-4(%rdi)
+	movl	4(%rsi),%edx
+	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
@@ -210,8 +210,8 @@ __FBSDID("$FreeBSD$");
 2002:
 	cmpb	$2,%cl
 	jl	2001f
-	movw	-2(%rsi),%dx
-	movw	%dx,-2(%rdi)
+	movw	6(%rsi),%dx
+	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
@@ -219,33 +219,31 @@ __FBSDID("$FreeBSD$");
 2001:
 	cmpb	$1,%cl
 	jl	2000f
-	movb	-1(%rsi),%dl
-	movb	%dl,-1(%rdi)
+	movb	7(%rsi),%dl
+	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 2256:
-	decq	%rdi
-	decq	%rsi
 	std
 	.if \erms == 1
+	leaq	-1(%rdi,%rcx),%rdi
+	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
+	cld
 	.else
-	andq	$7,%rcx			/* any fractional bytes? */
-	je	3f
-	rep
-	movsb
-3:
-	movq	%rdx,%rcx		/* copy remainder by 32-bit words */
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
-	subq	$7,%rsi
-	subq	$7,%rdi
 	rep
 	movsq
-	.endif
 	cld
+	movq	%rdx,%rcx
+	andb	$7,%cl
+	jne	2004b
+	.endif
 	\end
 	ret
 	.endif

@@ -313,24 +313,24 @@ END(memcmp)
 	 */
 	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi
-	addq	%rcx,%rsi
+	cmpq	$256,%rcx
+	ja	2256f
+
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
-
-	cmpq	$256,%rcx
-	ja	2256f
 
 2032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
-	movq	-32(%rsi),%rdx
-	movq	%rdx,-32(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
@@ -344,10 +344,10 @@ END(memcmp)
 2016:
 	cmpb	$16,%cl
 	jl	2008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
-	movq	-16(%rsi),%rdx
-	movq	%rdx,-16(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
@@ -355,8 +355,8 @@ END(memcmp)
 2008:
 	cmpb	$8,%cl
 	jl	2004f
-	movq	-8(%rsi),%rdx
-	movq	%rdx,-8(%rdi)
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
@@ -364,8 +364,8 @@ END(memcmp)
 2004:
 	cmpb	$4,%cl
 	jl	2002f
-	movl	-4(%rsi),%edx
-	movl	%edx,-4(%rdi)
+	movl	4(%rsi),%edx
+	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
@@ -373,8 +373,8 @@ END(memcmp)
 2002:
 	cmpb	$2,%cl
 	jl	2001f
-	movw	-2(%rsi),%dx
-	movw	%dx,-2(%rdi)
+	movw	6(%rsi),%dx
+	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
@@ -382,33 +382,31 @@ END(memcmp)
 2001:
 	cmpb	$1,%cl
 	jl	2000f
-	movb	-1(%rsi),%dl
-	movb	%dl,-1(%rdi)
+	movb	7(%rsi),%dl
+	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 2256:
-	decq	%rdi
-	decq	%rsi
 	std
 	.if \erms == 1
+	leaq	-1(%rdi,%rcx),%rdi
+	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
+	cld
 	.else
-	andq	$7,%rcx			/* any fractional bytes? */
-	je	3f
-	rep
-	movsb
-3:
-	movq	%rdx,%rcx		/* copy remainder by 32-bit words */
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
-	subq	$7,%rsi
-	subq	$7,%rdi
 	rep
 	movsq
-	.endif
 	cld
+	movq	%rdx,%rcx
+	andb	$7,%cl
+	jne	2004b
+	.endif
 	\end
 	ret
 	.endif