amd64: handle small memmove buffers with overlapping stores

Handling of sizes > 32 in the backward (overlapping) case will be updated later.

Reviewed by:	kib (kernel part)
Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D18387
Author:	Mateusz Guzik
Date:	2018-11-30 20:58:08 +00:00
Parent:	1489776d43
Commit:	94243af2da

2 changed files with 103 additions and 80 deletions
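The scheme for sizes that do not fill a whole 32-byte chunk is the usual overlapping-store trick: within a size class, load the head and the tail of the region first, then issue both stores, letting them overlap when the length is not an exact multiple of the access size. Because nothing is written until everything is read, copy direction is irrelevant for these sizes. A minimal C sketch of the pattern for the 8..15 byte class (the 100816 path below); the helper name and the memcpy()-based unaligned accesses are illustrative only, not part of the commit:

	#include <stdint.h>
	#include <string.h>

	/* Copy n bytes, 8 <= n <= 16, with two possibly overlapping 8-byte moves. */
	static void
	copy_8_to_16(char *dst, const char *src, size_t n)
	{
		uint64_t head, tail;

		memcpy(&head, src, 8);			/* movq (%rsi),%rdx */
		memcpy(&tail, src + n - 8, 8);		/* movq -8(%rsi,%rcx),%r8 */
		memcpy(dst, &head, 8);			/* movq %rdx,(%rdi) */
		memcpy(dst + n - 8, &tail, 8);		/* movq %r8,-8(%rdi,%rcx) */
	}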


@@ -42,11 +42,19 @@ __FBSDID("$FreeBSD$");
* rsi - source
* rdx - count
*
* The macro possibly clobbers the above and: rcx, r8.
* It does not clobber rax, r10 nor r11.
* The macro possibly clobbers the above and: rcx, r8, r9, r10
* It does not clobber rax nor r11.
*/
.macro MEMMOVE erms overlap begin end
\begin
/*
* For sizes 0..32 all data is read before it is written, so there
* is no correctness issue with direction of copying.
*/
cmpq $32,%rcx
jbe 101632f
.if \overlap == 1
movq %rdi,%r8
subq %rsi,%r8
@@ -54,13 +62,10 @@ __FBSDID("$FreeBSD$");
jb 2f
.endif
cmpq $32,%rcx
jb 1016f
cmpq $256,%rcx
ja 1256f
1032:
103200:
movq (%rsi),%rdx
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
@@ -73,56 +78,62 @@ __FBSDID("$FreeBSD$");
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
jae 1032b
jae 103200b
cmpb $0,%cl
jne 1016f
jne 101632f
\end
ret
ALIGN_TEXT
1016:
101632:
cmpb $16,%cl
jl 1008f
jl 100816f
movq (%rsi),%rdx
movq 8(%rsi),%r8
movq -16(%rsi,%rcx),%r9
movq -8(%rsi,%rcx),%r10
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
movq %rdx,8(%rdi)
subb $16,%cl
jz 1000f
leaq 16(%rsi),%rsi
leaq 16(%rdi),%rdi
1008:
movq %r8,8(%rdi)
movq %r9,-16(%rdi,%rcx)
movq %r10,-8(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100816:
cmpb $8,%cl
jl 1004f
jl 100408f
movq (%rsi),%rdx
movq -8(%rsi,%rcx),%r8
movq %rdx,(%rdi)
subb $8,%cl
jz 1000f
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
1004:
movq %r8,-8(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100408:
cmpb $4,%cl
jl 1002f
jl 100204f
movl (%rsi),%edx
movl -4(%rsi,%rcx),%r8d
movl %edx,(%rdi)
subb $4,%cl
jz 1000f
leaq 4(%rsi),%rsi
leaq 4(%rdi),%rdi
1002:
movl %r8d,-4(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100204:
cmpb $2,%cl
jl 1001f
movw (%rsi),%dx
jl 100001f
movzwl (%rsi),%edx
movzwl -2(%rsi,%rcx),%r8d
movw %dx,(%rdi)
subb $2,%cl
jz 1000f
leaq 2(%rsi),%rsi
leaq 2(%rdi),%rdi
1001:
movw %r8w,-2(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100001:
cmpb $1,%cl
jl 1000f
jl 100000f
movb (%rsi),%dl
movb %dl,(%rdi)
1000:
100000:
\end
ret
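The renamed labels spell out a branch ladder over size classes, each of which finishes the copy with one round of possibly overlapping stores instead of walking the remainder chunk by chunk the way the old 1016/1008/1004 chain did. A rough C equivalent of the whole 0..32 path, with an illustrative helper name and memcpy() standing in for the unaligned loads and stores:

	#include <stdint.h>
	#include <string.h>

	/*
	 * Rough C equivalent of the 0..32 path (labels 101632 through 100000).
	 * Every branch loads the head and the tail of the region before storing,
	 * so overlapping src/dst is safe in either direction.
	 */
	static void
	small_copy(char *d, const char *s, size_t n)
	{
		uint64_t q0, q1, q2, q3;
		uint32_t l0, l1;
		uint16_t w0, w1;

		if (n >= 16) {				/* 101632: 16..32 bytes */
			memcpy(&q0, s, 8);
			memcpy(&q1, s + 8, 8);
			memcpy(&q2, s + n - 16, 8);
			memcpy(&q3, s + n - 8, 8);
			memcpy(d, &q0, 8);
			memcpy(d + 8, &q1, 8);
			memcpy(d + n - 16, &q2, 8);
			memcpy(d + n - 8, &q3, 8);
		} else if (n >= 8) {			/* 100816: 8..15 bytes */
			memcpy(&q0, s, 8);
			memcpy(&q1, s + n - 8, 8);
			memcpy(d, &q0, 8);
			memcpy(d + n - 8, &q1, 8);
		} else if (n >= 4) {			/* 100408: 4..7 bytes */
			memcpy(&l0, s, 4);
			memcpy(&l1, s + n - 4, 4);
			memcpy(d, &l0, 4);
			memcpy(d + n - 4, &l1, 4);
		} else if (n >= 2) {			/* 100204: 2..3 bytes */
			memcpy(&w0, s, 2);
			memcpy(&w1, s + n - 2, 2);
			memcpy(d, &w0, 2);
			memcpy(d + n - 2, &w1, 2);
		} else if (n == 1) {			/* 100001: 1 byte */
			*d = *s;
		}					/* 100000: 0 bytes, nothing */
	}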
@@ -136,8 +147,8 @@ __FBSDID("$FreeBSD$");
rep
movsq
movq %rdx,%rcx
andb $7,%cl /* any bytes left? */
jne 1004b
andl $7,%ecx /* any bytes left? */
jne 100408b
.endif
\end
ret
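In the rep movsq large-copy path the leftover 0..7 bytes are now finished by jumping back into the ladder at 100408, and the remainder is computed with andl on the full %ecx rather than andb on %cl. Roughly, in terms of the illustrative small_copy() helper sketched above (forward copies only; the backward-overlap case takes a separate path):

	#include <stddef.h>
	#include <string.h>

	/*
	 * Sketch of the qword bulk copy plus 0..7 byte tail, assuming the
	 * illustrative small_copy() helper from above is in the same file.
	 */
	static void
	bulk_then_tail(char *d, const char *s, size_t n)
	{
		size_t i, qwords = n / 8;

		for (i = 0; i < qwords; i++)		/* rep; movsq over whole qwords */
			memcpy(d + i * 8, s + i * 8, 8);
		if (n & 7)				/* andl $7,%ecx; jne 100408b */
			small_copy(d + qwords * 8, s + qwords * 8, n & 7);
	}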
@@ -247,6 +258,7 @@ __FBSDID("$FreeBSD$");
.endif
.endm
.macro MEMMOVE_BEGIN
movq %rdi,%rax
movq %rdx,%rcx


@@ -205,11 +205,19 @@ END(memcmp)
* rsi - source
* rdx - count
*
* The macro possibly clobbers the above and: rcx, r8.
* It does not clobber rax, r10 nor r11.
* The macro possibly clobbers the above and: rcx, r8, r9, r10
* It does not clobber rax nor r11.
*/
.macro MEMMOVE erms overlap begin end
\begin
/*
* For sizes 0..32 all data is read before it is written, so there
* is no correctness issue with direction of copying.
*/
cmpq $32,%rcx
jbe 101632f
.if \overlap == 1
movq %rdi,%r8
subq %rsi,%r8
@@ -217,13 +225,10 @@ END(memcmp)
jb 2f
.endif
cmpq $32,%rcx
jb 1016f
cmpq $256,%rcx
ja 1256f
1032:
103200:
movq (%rsi),%rdx
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
@@ -236,56 +241,62 @@ END(memcmp)
leaq 32(%rdi),%rdi
subq $32,%rcx
cmpq $32,%rcx
jae 1032b
jae 103200b
cmpb $0,%cl
jne 1016f
jne 101632f
\end
ret
ALIGN_TEXT
1016:
101632:
cmpb $16,%cl
jl 1008f
jl 100816f
movq (%rsi),%rdx
movq 8(%rsi),%r8
movq -16(%rsi,%rcx),%r9
movq -8(%rsi,%rcx),%r10
movq %rdx,(%rdi)
movq 8(%rsi),%rdx
movq %rdx,8(%rdi)
subb $16,%cl
jz 1000f
leaq 16(%rsi),%rsi
leaq 16(%rdi),%rdi
1008:
movq %r8,8(%rdi)
movq %r9,-16(%rdi,%rcx)
movq %r10,-8(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100816:
cmpb $8,%cl
jl 1004f
jl 100408f
movq (%rsi),%rdx
movq -8(%rsi,%rcx),%r8
movq %rdx,(%rdi)
subb $8,%cl
jz 1000f
leaq 8(%rsi),%rsi
leaq 8(%rdi),%rdi
1004:
movq %r8,-8(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100408:
cmpb $4,%cl
jl 1002f
jl 100204f
movl (%rsi),%edx
movl -4(%rsi,%rcx),%r8d
movl %edx,(%rdi)
subb $4,%cl
jz 1000f
leaq 4(%rsi),%rsi
leaq 4(%rdi),%rdi
1002:
movl %r8d,-4(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100204:
cmpb $2,%cl
jl 1001f
movw (%rsi),%dx
jl 100001f
movzwl (%rsi),%edx
movzwl -2(%rsi,%rcx),%r8d
movw %dx,(%rdi)
subb $2,%cl
jz 1000f
leaq 2(%rsi),%rsi
leaq 2(%rdi),%rdi
1001:
movw %r8w,-2(%rdi,%rcx)
\end
ret
ALIGN_TEXT
100001:
cmpb $1,%cl
jl 1000f
jl 100000f
movb (%rsi),%dl
movb %dl,(%rdi)
1000:
100000:
\end
ret
@@ -299,8 +310,8 @@ END(memcmp)
rep
movsq
movq %rdx,%rcx
andb $7,%cl /* any bytes left? */
jne 1004b
andl $7,%ecx /* any bytes left? */
jne 100408b
.endif
\end
ret
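To convince oneself that the read-everything-then-write-everything approach really is direction-agnostic, the C sketches above can be checked against libc memmove() for every size up to 32 and every overlapping offset. A throwaway harness, assuming the illustrative small_copy() from above is compiled into the same file:

	#include <assert.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		char ref[160], buf[160];
		size_t i, n;
		int off;

		for (n = 0; n <= 32; n++) {
			for (off = -32; off <= 32; off++) {
				/* Fresh, identical contents for both buffers. */
				for (i = 0; i < sizeof(buf); i++)
					ref[i] = buf[i] = (char)i;
				memmove(ref + 64 + off, ref + 64, n);
				small_copy(buf + 64 + off, buf + 64, n);
				assert(memcmp(ref, buf, sizeof(buf)) == 0);
			}
		}
		printf("ok\n");
		return (0);
	}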