amd64: sync up libc memset with the kernel version

- tidy up memset to have rax set earlier for small sizes
- finish the tail in memset with an overlapping store
- align memset buffers to 16 bytes before using rep stos

Sponsored by:	The FreeBSD Foundation
This commit is contained in:
Mateusz Guzik 2018-11-15 20:28:35 +00:00
parent 6fff634455
commit ad2ff705a4

View File

@ -31,12 +31,14 @@
#include <machine/asm.h>
__FBSDID("$FreeBSD$");
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
/*
 * MEMSET erms
 *
 * Macro body for a memset-style fill.  As read from the code below:
 *   %rdi = destination, %sil = fill byte, %rdx = length in bytes.
 * The fill byte is replicated across all 8 bytes of a register by
 * multiplying it by 0x0101010101010101, the original destination is
 * kept in %r9 and handed back in %rax before each ret.
 * The "erms" argument selects, at assembly time, between a
 * "rep stosb" bulk path (erms == 1) and a "rep stosq" bulk path.
 *
 * NOTE(review): this listing looks like a diff view whose +/- markers
 * were stripped.  Adjacent lines that differ only in using %rax versus
 * %r10 as the pattern register are presumably the before/after forms of
 * one instruction, and the "@ -a,b +c,d @" lines mark places where
 * source lines are not shown.  Confirm against the real file before
 * relying on any statement order here.
 */
.macro MEMSET erms
/* Remember the original destination (in %r9 / %rax) and copy the length to %rcx. */
movq %rdi,%r9
movq %rdi,%rax
movq %rdx,%rcx
/* %r8 = fill byte, zero-extended to 64 bits. */
movzbq %sil,%r8
/* Replicate the fill byte into every byte of the pattern register. */
movabs $0x0101010101010101,%rax
imulq %r8,%rax
movabs $0x0101010101010101,%r10
imulq %r8,%r10
/* Fewer than 32 bytes (unsigned compare): go to the small-size ladder. */
cmpq $32,%rcx
jb 1016f
@ -45,10 +47,10 @@ __FBSDID("$FreeBSD$");
/* NOTE(review): the compare that feeds this branch is not visible here. */
ja 1256f
/* 1032: store 32 bytes of pattern per pass, then advance and recount. */
1032:
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %r10,16(%rdi)
movq %r10,24(%rdi)
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
@ -58,54 +60,72 @@ __FBSDID("$FreeBSD$");
/*
 * Small-size ladder: each step handles one power-of-two chunk
 * (16, 8, 4, 2, 1) of the count held in %cl, and jumps to 1000
 * as soon as the count reaches zero.
 */
1016:
cmpb $16,%cl
jl 1008f
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %r10,(%rdi)
movq %r10,8(%rdi)
subb $16,%cl
jz 1000f
leaq 16(%rdi),%rdi
1008:
cmpb $8,%cl
jl 1004f
movq %rax,(%rdi)
movq %r10,(%rdi)
subb $8,%cl
jz 1000f
leaq 8(%rdi),%rdi
1004:
cmpb $4,%cl
jl 1002f
movl %eax,(%rdi)
movl %r10d,(%rdi)
subb $4,%cl
jz 1000f
leaq 4(%rdi),%rdi
1002:
cmpb $2,%cl
jl 1001f
movw %ax,(%rdi)
movw %r10w,(%rdi)
subb $2,%cl
jz 1000f
leaq 2(%rdi),%rdi
1001:
cmpb $1,%cl
jl 1000f
movb %al,(%rdi)
movb %r10b,(%rdi)
/* 1000: common exit -- return the original destination saved in %r9. */
1000:
movq %r9,%rax
ret
ALIGN_TEXT
/*
 * 1256: bulk path using rep stos.  The pattern is moved into %rax for
 * the string store; if the low four bits of the destination are not
 * zero, branch to 3 to bring it to a 16-byte boundary first.
 */
1256:
movq %rdi,%r9
movq %r10,%rax
testl $15,%edi
jnz 3f
1:
/*
 * NOTE(review): two ".endif" lines follow a single ".if"/".else";
 * this is presumably old and new text interleaved by the diff view.
 */
.if \erms == 1
/* erms variant: byte-granular rep stos for the whole remaining count. */
rep
stosb
movq %r9,%rax
.else
/* Non-erms variant: keep the byte count in %rdx, store %rcx/8 quadwords. */
movq %rcx,%rdx
shrq $3,%rcx
rep
stosq
movq %rdx,%rcx
andb $7,%cl
jne 1004b
.endif
movq %r9,%rax
/* Leftover count is the low three bits of the saved length; done if zero. */
andl $7,%edx
jnz 2f
ret
2:
/* Finish the tail with one 8-byte store that ends at %rdi + %rdx (overlaps bytes already written). */
movq %r10,-8(%rdi,%rdx)
.endif
ret
ALIGN_TEXT
/*
 * 3: destination is not 16-byte aligned.  Write 16 pattern bytes at the
 * current address, then move %rdi up to the next 16-byte boundary and
 * reduce %rcx by the number of bytes skipped (16 - (%rdi & 15)) before
 * rejoining the bulk path at 1.
 */
3:
movq %r10,(%rdi)
movq %r10,8(%rdi)
movq %rdi,%r8
andq $15,%r8
leaq -16(%rcx,%r8),%rcx
neg %r8
leaq 16(%rdi,%r8),%rdi
jmp 1b
.endm
ENTRY(memset)