amd64: switch pagecopy from non-temporal stores to rep movsq

The copied data is accessed in part soon after the copy, so the non-temporal
stores result in additional cache misses during a -j 1 buildkernel
WITHOUT_CTF=yes KERNFAST=1, as measured with pmc stat.

before:
       256165411  cache-references	#	0.003 refs/inst
        15105408  cache-misses		#	5.897%
           20.70  real			#	99.67% cpu
           13.24  user			#	63.94% cpu
            7.40  sys			#	35.73% cpu

after:
       256764469  cache-references	#	0.003 refs/inst
        11913551  cache-misses		#	4.640%
           20.70  real			#	99.67% cpu
           13.19  user			#	63.73% cpu
            7.44  sys			#	35.95% cpu

Note that the real time did not change, but traffic to RAM was reduced
(multiple measurements were performed, switching the implementation at runtime).
Since nobody else is using non-temporal stores for this and there is no apparent
benefit, at least these days, don't use them here either.

As a side note, the pagecopy arguments should probably be reversed so that the
primitive does not have to swap them.

Discussed with:		jeff
This commit is contained in:
Mateusz Guzik 2018-05-31 09:56:02 +00:00
parent 5fd1ea0810
commit 64415b8b22

View File

@@ -281,26 +281,12 @@ END(memset)
*/
/*
 * pagecopy: copy PAGE_SIZE bytes from the page at %rdi to the page at %rsi.
 *
 * NOTE(review): this listing looks like a diff hunk rendered without its
 * +/- markers.  The hunk header (26 old lines, 12 new lines) is consistent
 * with the prefetchnta/movnti sequence being the REMOVED implementation and
 * the "rep movsq" sequence being its REPLACEMENT -- confirm against the
 * actual file.  The two sequences are alternatives: run back to back, the
 * second would start from pointers the first had already advanced by
 * PAGE_SIZE.
 */
ENTRY(pagecopy)
PUSH_FRAME_POINTER
/*
 * Non-temporal implementation.
 *
 * %rax and %rdx both start at -PAGE_SIZE and count up to zero, while
 * %rdi/%rsi are moved PAGE_SIZE forward (subtracting a negative value),
 * so (%rdi,%rax) and (%rsi,%rdx) initially address the start of each page
 * and the loops terminate when the index reaches zero.
 */
movq $-PAGE_SIZE,%rax
movq %rax,%rdx
subq %rax,%rdi
subq %rax,%rsi
/* Pass 1: prefetch the source page with a non-temporal hint, 64 bytes per step. */
1:
prefetchnta (%rdi,%rax)
addq $64,%rax
jne 1b
/*
 * Pass 2: copy 32 bytes per iteration, loading through %rdi (source) and
 * storing through %rsi (destination) with non-temporal movnti stores.
 * %rax is reused as the data scratch register here.
 */
2:
movq (%rdi,%rdx),%rax
movnti %rax,(%rsi,%rdx)
movq 8(%rdi,%rdx),%rax
movnti %rax,8(%rsi,%rdx)
movq 16(%rdi,%rdx),%rax
movnti %rax,16(%rsi,%rdx)
movq 24(%rdi,%rdx),%rax
movnti %rax,24(%rsi,%rdx)
addq $32,%rdx
jne 2b
/* Store fence issued after the movnti stores, before returning. */
sfence
/*
 * "rep movsq" implementation.
 *
 * movsq copies from (%rsi) to (%rdi), which is the opposite of the
 * argument order used above (source in %rdi, destination in %rsi), so the
 * two pointers are swapped through %r9 first.  %rcx holds the quadword
 * count for one page.
 */
movq $PAGE_SIZE/8,%rcx
movq %rdi,%r9
movq %rsi,%rdi
movq %r9,%rsi
rep
movsq
POP_FRAME_POINTER
ret
END(pagecopy)