amd64: make memset less slow with mov

rep stos has a high startup time even on modern microarchitectures like
Skylake. Intel optimization manuals discuss how for small sizes it is
beneficial to go for streaming stores. Since those cannot be used without
extra penalty in the kernel I investigated performance impact of just
regular movs.

The patch below implements a very simple scheme: a 32-byte loop followed
by filling in the remainder of at most 31 bytes. It has a 256-byte breaking
point, above which it falls back to rep stos. It provides a significant win
over the current primitive on several machines I tested (both Intel and
AMD). A 64-byte loop did not provide any benefit, even for multiple-of-64
sizes.

See the review for benchmark data.

Reviewed by:	kib
Approved by:	re (gjb)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D17398
This commit is contained in:
mjg 2018-10-05 19:25:09 +00:00
parent 95afcc1f4d
commit de8f0ce648

View File

@ -320,43 +320,92 @@ END(memcpy_erms)
* memset(dst, c, len)
* rdi, rsi, rdx
*/
/*
 * NOTE(review): this region is a rendered diff with the +/- markers
 * stripped, so removed (old) and added (new) lines are interleaved —
 * e.g. the ENTRY(memset_std) line just below vs. the one after .endm,
 * and the pre-patch fast path flagged "old code".  Do not assemble
 * as-is; confirm against the upstream revision (D17398).
 *
 * New implementation: one macro body, parameterized on whether the
 * CPU-selected variant may use rep stosb (ERMSB) for large sizes.
 * memset(dst=%rdi, c=%sil, len=%rdx); returns dst in %rax.
 */
ENTRY(memset_std)	/* old entry line (replaced by expansions below) */
.macro MEMSET erms
PUSH_FRAME_POINTER
movq %rdi,%r9	/* save dst; memset returns the original pointer */
movq %rdx,%rcx	/* rcx = working byte count */
movzbq %sil,%r8	/* zero-extend the fill byte */
movabs $0x0101010101010101,%rax
imulq %r8,%rax	/* broadcast fill byte into all 8 bytes of rax */
cmpq $15,%rcx	/* old code (pre-patch rep stosq fast path) */
jbe 1f	/* old code */
shrq $3,%rcx	/* old code */
rep	/* old code */
stosq	/* old code */
movq %rdx,%rcx	/* old code */
andq $7,%rcx	/* old code */
jne 1f	/* old code */
cmpq $32,%rcx
jb 1016f	/* < 32 bytes: go straight to the remainder ladder */
cmpq $256,%rcx
ja 1256f	/* > 256 bytes: rep stos wins past this breakpoint */
1032:
/* 32-byte store loop; runs while at least 32 bytes remain. */
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
jae 1032b
cmpb $0,%cl
je 1000f	/* length was a multiple of 32: done */
/*
 * Remainder ladder: at most 31 bytes left (cl).  Each rung stores the
 * largest power-of-two chunk that fits, then falls through to the next
 * smaller rung; the pointer advance (leaq) is skipped once the count
 * reaches zero.
 */
1016:
cmpb $16,%cl
jl 1008f
movq %rax,(%rdi)
movq %rax,8(%rdi)
subb $16,%cl
jz 1000f
leaq 16(%rdi),%rdi
1008:
cmpb $8,%cl
jl 1004f
movq %rax,(%rdi)
subb $8,%cl
jz 1000f
leaq 8(%rdi),%rdi
1004:
cmpb $4,%cl
jl 1002f
movl %eax,(%rdi)
subb $4,%cl
jz 1000f
leaq 4(%rdi),%rdi
1002:
cmpb $2,%cl
jl 1001f
movw %ax,(%rdi)
subb $2,%cl
jz 1000f
leaq 2(%rdi),%rdi
1001:
cmpb $1,%cl
jl 1000f
movb %al,(%rdi)
1000:
movq %r9,%rax	/* return original dst */
POP_FRAME_POINTER
ret
ALIGN_TEXT
1:	/* old label (pre-patch fallback target; removed in diff) */
1256:
/* Large-size (>256 B) fallback. */
.if \erms == 1
rep	/* ERMSB variant: rep stosb outright */
stosb
.else
/* Non-ERMSB: qword rep stos, then mop up the 0-7 byte tail. */
shrq $3,%rcx	/* rcx = number of 8-byte words */
rep
stosq
movq %rdx,%rcx
andb $7,%cl	/* cl = leftover bytes (0-7) */
jne 1004b	/* reuse the remainder ladder from the 4-byte rung down */
.endif
movq %r9,%rax	/* return original dst */
POP_FRAME_POINTER
ret
.endm
/* Standard variant: expand MEMSET without ERMSB (rep stosq large path). */
ENTRY(memset_std)
MEMSET erms=0
END(memset_std)
ENTRY(memset_erms)
/*
 * NOTE(review): the lines from PUSH_FRAME_POINTER down to "ret" appear
 * to be the removed pre-patch memset_erms body (a plain rep stosb);
 * the new body is the single "MEMSET erms=1" expansion below.  The
 * diff rendering stripped the +/- markers — confirm against upstream.
 */
PUSH_FRAME_POINTER
movq %rdi,%r9	/* save dst for return */
movq %rdx,%rcx	/* rcx = byte count */
movb %sil,%al	/* al = fill byte for stosb */
rep
stosb
movq %r9,%rax
POP_FRAME_POINTER
ret
MEMSET erms=1	/* new body: ERMSB variant of the shared macro */
END(memset_erms)
/* fillw(pat, base, cnt) */