amd64: finish the tail in memset with an overlapping store
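Instead of finding the exact store size that fits the tail, we can just shift the target of the final 8-byte store by -8 + tail, so that it ends exactly at the last byte of the buffer and overlaps bytes that were already written.

Doing a blind write to an area that was just filled by rep stosq comes with a penalty, so the overlapping store is performed conditionally, only when the tail is non-zero.

Sample win on EPYC when zeroing a 257-byte buffer (tail = 1) aligned to 16 bytes:
before: 44782846 ops/s
after: 46118614 ops/s

Idea stolen from NetBSD.

Sponsored by: The FreeBSD Foundation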
This commit is contained in:
parent
9ebffec59f
commit
ac1eb54956
@@ -524,9 +524,12 @@ END(memcpy_erms)
|
||||
rep
|
||||
stosq
|
||||
movq %r9,%rax
|
||||
movq %rdx,%rcx
|
||||
andb $7,%cl
|
||||
jne 1004b
|
||||
andl $7,%edx
|
||||
jnz 1f
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
1:
|
||||
movq %r10,-8(%rdi,%rdx)
|
||||
.endif
|
||||
POP_FRAME_POINTER
|
||||
ret
|
||||
|
Loading…
x
Reference in New Issue
Block a user