amd64: switch pagecopy from non-temporal stores to rep movsq

The copied data is accessed in part soon after the copy, so the non-temporal stores result in additional cache misses during a -j 1 buildkernel WITHOUT_CTF=yes KERNFAST=1, as measured with pmc stat.

before:
256165411 cache-references # 0.003 refs/inst
15105408 cache-misses # 5.897%
20.70 real # 99.67% cpu
13.24 user # 63.94% cpu
7.40 sys # 35.73% cpu

after:
256764469 cache-references # 0.003 refs/inst
11913551 cache-misses # 4.640%
20.70 real # 99.67% cpu
13.19 user # 63.73% cpu
7.44 sys # 35.95% cpu

Note that the real time did not change, but traffic to RAM was reduced (multiple measurements were performed, switching the implementation at runtime).

Since nobody else is using non-temporal stores for this and there is no apparent benefit, at least these days, don't use them either.

As a side note, the pagecopy arguments should probably be reversed so that the primitive does not have to flip them around.

Discussed with: jeff
2018-05-31 09:56:02 +00:00 · 2018-05-31 09:56:02 +00:00 · 64415b8b22
commit 64415b8b22
parent 5fd1ea0810
1 changed files with 6 additions and 20 deletions
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -281,26 +281,12 @@ END(memset)
 */
 ENTRY(pagecopy)
 	PUSH_FRAME_POINTER
-	movq	$-PAGE_SIZE,%rax
-	movq	%rax,%rdx
-	subq	%rax,%rdi
-	subq	%rax,%rsi
-1:
-	prefetchnta (%rdi,%rax)
-	addq	$64,%rax
-	jne	1b
-2:
-	movq	(%rdi,%rdx),%rax
-	movnti	%rax,(%rsi,%rdx)
-	movq	8(%rdi,%rdx),%rax
-	movnti	%rax,8(%rsi,%rdx)
-	movq	16(%rdi,%rdx),%rax
-	movnti	%rax,16(%rsi,%rdx)
-	movq	24(%rdi,%rdx),%rax
-	movnti	%rax,24(%rsi,%rdx)
-	addq	$32,%rdx
-	jne	2b
-	sfence
+	movq	$PAGE_SIZE/8,%rcx
+	movq	%rdi,%r9
+	movq	%rsi,%rdi
+	movq	%r9,%rsi
+	rep
+	movsq
 	POP_FRAME_POINTER
 	ret
 END(pagecopy)