amd64: align memset buffers to 16 bytes before using rep stos
Both the Intel manual and Agner Fog's docs suggest aligning to 16 bytes. See the review for benchmark results.

Reviewed by:	kib (previous version)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D17661
commit 4f16b562cf
parent 35c107709e
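As a rough C rendering of what the patched non-ERMS large-buffer path (local label 1256 in the diff) ends up doing: if the destination is not 16-byte aligned, write one unconditional 16-byte chunk at the start, round the pointer up to the next 16-byte boundary, shrink the remaining length by the bytes consumed, and only then fall into the 8-byte bulk fill and the overlapping tail store. This is only an illustrative sketch; the function and variable names below are made up and do not appear in the kernel source, where the real code is the assembly shown in the diff.

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static void *
	memset_large_sketch(void *dst, int c, size_t len)
	{
		unsigned char *d = dst;
		/* Expand the fill byte into a 64-bit pattern (%r10 in the asm). */
		uint64_t pat = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

		/* The kernel only takes this path for large buffers. */
		assert(len >= 16);

		if ((uintptr_t)d & 15) {
			/*
			 * Label 3 in the asm: store 16 bytes at the possibly
			 * unaligned start, then advance the pointer to the next
			 * 16-byte boundary and drop the bytes already covered.
			 * The overlap with the aligned region is harmless since
			 * the same pattern is written again.
			 */
			size_t mis = (uintptr_t)d & 15;	/* %r8 */
			memcpy(d, &pat, 8);		/* movq %r10,(%rdi) */
			memcpy(d + 8, &pat, 8);		/* movq %r10,8(%rdi) */
			len -= 16 - mis;		/* leaq -16(%rcx,%r8),%rcx */
			d += 16 - mis;			/* leaq 16(%rdi,%r8),%rdi */
		}

		/* Label 1: bulk fill 8 bytes at a time (rep stosq). */
		size_t qwords = len >> 3;
		for (size_t i = 0; i < qwords; i++)
			memcpy(d + i * 8, &pat, 8);
		d += qwords * 8;
		len &= 7;

		/* Label 2: one overlapping 8-byte store finishes the tail. */
		if (len != 0)
			memcpy(d + len - 8, &pat, 8);

		return (dst);
	}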
@@ -515,24 +515,38 @@ END(memcpy_erms)
 1256:
+	movq	%rdi,%r9
 	movq	%r10,%rax
+	testl	$15,%edi
+	jnz	3f
+1:
 .if \erms == 1
 	rep
 	stosb
 	movq	%r9,%rax
 .else
 	movq	%rcx,%rdx
 	shrq	$3,%rcx
 	rep
 	stosq
 	movq	%r9,%rax
 	andl	$7,%edx
-	jnz	1f
+	jnz	2f
 	POP_FRAME_POINTER
 	ret
-1:
+2:
 	movq	%r10,-8(%rdi,%rdx)
 .endif
 	POP_FRAME_POINTER
 	ret
+	ALIGN_TEXT
+3:
+	movq	%r10,(%rdi)
+	movq	%r10,8(%rdi)
+	movq	%rdi,%r8
+	andq	$15,%r8
+	leaq	-16(%rcx,%r8),%rcx
+	neg	%r8
+	leaq	16(%rdi,%r8),%rdi
+	jmp	1b
 .endm
 
 ENTRY(memset_std)
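The design choice worth noting is that alignment is handled with two overlapping 8-byte stores and a little pointer arithmetic rather than a byte-at-a-time peel loop, so the fixup cost is the same regardless of how misaligned the destination is; rewriting a few bytes twice is cheaper than a branchy loop in front of rep stos. A quick self-check of the sketch above (again illustrative only, assuming memset_large_sketch from the previous block is in the same file):

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		/* 16-byte aligned backing store, so buf + 1 is guaranteed misaligned. */
		_Alignas(16) unsigned char buf[513], ref[513];

		memset_large_sketch(buf + 1, 0xab, 512);
		memset(ref + 1, 0xab, 512);
		if (memcmp(buf + 1, ref + 1, 512) != 0) {
			fprintf(stderr, "mismatch\n");
			return (1);
		}
		printf("ok\n");
		return (0);
	}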