From ffc9789a6e496d2a824aee95d7b485eb2674ad03 Mon Sep 17 00:00:00 2001
From: mjg
Date: Fri, 30 Nov 2018 00:45:10 +0000
Subject: [PATCH] amd64: tidy up copying backwards in memmove

For the non-ERMS case the code used to handle possible trailing bytes
with movsb first and then followed it up with movsq. This also happened
to alter how calculations were done for other cases.

Handle the tail with regular movs, just like when copying forward.

Use leaq to calculate the right offset from the get-go, instead of
doing a separate add and sub. This adjusts the offset for non-rep cases
so that they can be used to handle the tail.

The routine is still a work in progress.

Sponsored by:	The FreeBSD Foundation
---
 lib/libc/amd64/string/memmove.S | 54 ++++++++++++++++-----------------
 sys/amd64/amd64/support.S       | 54 ++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 56 deletions(-)

diff --git a/lib/libc/amd64/string/memmove.S b/lib/libc/amd64/string/memmove.S
index 9d6fa8a7e296..630c034c85c4 100644
--- a/lib/libc/amd64/string/memmove.S
+++ b/lib/libc/amd64/string/memmove.S
@@ -150,24 +150,24 @@ __FBSDID("$FreeBSD$");
 	 */
 	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi
-	addq	%rcx,%rsi
+	cmpq	$256,%rcx
+	ja	2256f
+
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
 
-	cmpq	$256,%rcx
-	ja	2256f
-
 2032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
-	movq	-32(%rsi),%rdx
-	movq	%rdx,-32(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
@@ -181,10 +181,10 @@ __FBSDID("$FreeBSD$");
 2016:
 	cmpb	$16,%cl
 	jl	2008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
-	movq	-16(%rsi),%rdx
-	movq	%rdx,-16(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
@@ -192,8 +192,8 @@ __FBSDID("$FreeBSD$");
 2008:
 	cmpb	$8,%cl
 	jl	2004f
-	movq	-8(%rsi),%rdx
-	movq	%rdx,-8(%rdi)
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
@@ -201,8 +201,8 @@ __FBSDID("$FreeBSD$");
 2004:
 	cmpb	$4,%cl
 	jl	2002f
-	movl	-4(%rsi),%edx
-	movl	%edx,-4(%rdi)
+	movl	4(%rsi),%edx
+	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
@@ -210,8 +210,8 @@ __FBSDID("$FreeBSD$");
 2002:
 	cmpb	$2,%cl
 	jl	2001f
-	movw	-2(%rsi),%dx
-	movw	%dx,-2(%rdi)
+	movw	6(%rsi),%dx
+	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
@@ -219,33 +219,31 @@ __FBSDID("$FreeBSD$");
 2001:
 	cmpb	$1,%cl
 	jl	2000f
-	movb	-1(%rsi),%dl
-	movb	%dl,-1(%rdi)
+	movb	7(%rsi),%dl
+	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 2256:
-	decq	%rdi
-	decq	%rsi
 	std
 .if \erms == 1
+	leaq	-1(%rdi,%rcx),%rdi
+	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
+	cld
 .else
-	andq	$7,%rcx				/* any fractional bytes? */
-	je	3f
-	rep
-	movsb
-3:
-	movq	%rdx,%rcx			/* copy remainder by 32-bit words */
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
-	subq	$7,%rsi
-	subq	$7,%rdi
 	rep
 	movsq
-.endif
 	cld
+	movq	%rdx,%rcx
+	andb	$7,%cl
+	jne	2004b
+.endif
 	\end
 	ret
 .endif
diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index 1bb82ef43b97..af4e9c313b88 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -313,24 +313,24 @@ END(memcmp)
 	 */
 	ALIGN_TEXT
 2:
-	addq	%rcx,%rdi
-	addq	%rcx,%rsi
+	cmpq	$256,%rcx
+	ja	2256f
+
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
 
-	cmpq	$256,%rcx
-	ja	2256f
-
 2032:
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
-	movq	-32(%rsi),%rdx
-	movq	%rdx,-32(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
@@ -344,10 +344,10 @@ END(memcmp)
 2016:
 	cmpb	$16,%cl
 	jl	2008f
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
-	movq	-16(%rsi),%rdx
-	movq	%rdx,-16(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
@@ -355,8 +355,8 @@ END(memcmp)
 2008:
 	cmpb	$8,%cl
 	jl	2004f
-	movq	-8(%rsi),%rdx
-	movq	%rdx,-8(%rdi)
+	movq	(%rsi),%rdx
+	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
@@ -364,8 +364,8 @@ END(memcmp)
 2004:
 	cmpb	$4,%cl
 	jl	2002f
-	movl	-4(%rsi),%edx
-	movl	%edx,-4(%rdi)
+	movl	4(%rsi),%edx
+	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
@@ -373,8 +373,8 @@ END(memcmp)
 2002:
 	cmpb	$2,%cl
 	jl	2001f
-	movw	-2(%rsi),%dx
-	movw	%dx,-2(%rdi)
+	movw	6(%rsi),%dx
+	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
@@ -382,33 +382,31 @@ END(memcmp)
 2001:
 	cmpb	$1,%cl
 	jl	2000f
-	movb	-1(%rsi),%dl
-	movb	%dl,-1(%rdi)
+	movb	7(%rsi),%dl
+	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 2256:
-	decq	%rdi
-	decq	%rsi
 	std
 .if \erms == 1
+	leaq	-1(%rdi,%rcx),%rdi
+	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
+	cld
 .else
-	andq	$7,%rcx				/* any fractional bytes? */
-	je	3f
-	rep
-	movsb
-3:
-	movq	%rdx,%rcx			/* copy remainder by 32-bit words */
+	leaq	-8(%rdi,%rcx),%rdi
+	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
-	subq	$7,%rsi
-	subq	$7,%rdi
 	rep
 	movsq
-.endif
 	cld
+	movq	%rdx,%rcx
+	andb	$7,%cl
+	jne	2004b
+.endif
 	\end
 	ret
 .endif
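
The change to the non-ERMS branch can be read as: anchor %rsi and %rdi on the
last qword with a single leaq each, let rep movsq (with the direction flag set
by std) copy whole qwords from the highest address down, then jump back to
2004b so the remaining low "len & 7" bytes are finished with the same regular
8/4/2/1-byte moves the short-copy path already uses. Below is a rough C model
of that strategy, for illustration only: the function name is made up, the
byte-loop tail stands in for the 8/4/2/1-byte moves, and none of this is code
from the patch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative model of the backwards-copy strategy: copy whole qwords
 * from the highest address down, then finish the low "len & 7" tail
 * bytes with ordinary moves, so an overlapping dst > src stays safe.
 * The assembly handles the tail with 8/4/2/1-byte moves; a byte loop
 * keeps this sketch short.
 */
static void
copy_backwards_sketch(unsigned char *dst, const unsigned char *src,
    size_t len)
{
	size_t qwords = len >> 3;	/* count for the movsq-style loop */
	size_t tail = len & 7;		/* low bytes left for the tail */
	uint64_t tmp;

	/* Qword copies, descending, covering offsets [tail, len). */
	while (qwords-- != 0) {
		memcpy(&tmp, src + tail + qwords * 8, sizeof(tmp));
		memcpy(dst + tail + qwords * 8, &tmp, sizeof(tmp));
	}

	/* Tail handled with regular moves, still high to low. */
	while (tail-- != 0)
		dst[tail] = src[tail];
}

int
main(void)
{
	char buf[32] = "0123456789abcdef";

	/* Overlapping shift to the right forces a backwards copy. */
	copy_backwards_sketch((unsigned char *)buf + 3,
	    (unsigned char *)buf, 16);
	buf[19] = '\0';
	printf("%s\n", buf + 3);	/* prints 0123456789abcdef */
	return (0);
}

Copying the high qwords before the low tail is what keeps an overlapping
dst > src move correct, which is the reason this backwards path exists in the
first place.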