Added i586-optimized bcopy() and bzero().

These are based on using the FPU to do 64-bit stores.  They also
use i586-optimized instruction ordering, i586-optimized cache
management and a couple of other tricks.  They should work on any
i*86 with a h/w FPU, but are slower than the generic routines on at
least i386's and i486's.
They come close to saturating the memory bus on i586's.  bzero()
can maintain a 3-3-3-3 burst cycle to 66 MHz non-EDO main memory
on a P133 (but is too slow to keep up with a 2-2-2-2 burst cycle
for EDO - someone with EDO should fix this).  bcopy() is several
cycles short of keeping up with a 3-3-3-3 cycle for writing.  For
a P133 writing to 66 MHz main memory, it just manages an N-3-3-3,
3-3-3-3 pair of burst cycles, where N is typically 6.
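
For the curious, here is a minimal C sketch of the 64-bit-store idea.  It is
illustrative only: the committed routines do this directly with fldz/fstl and
fildq/fistpq and add the cache and instruction-ordering tricks mentioned
above, and the helper name fpu_style_bzero is invented for the example.

    #include <stddef.h>
    #include <string.h>

    /* Zero memory 8 bytes at a time, the way the FPU path does with fstl:
     * one 8-byte store per iteration instead of two 4-byte integer stores. */
    static void
    fpu_style_bzero(void *dst, size_t len)
    {
        unsigned char *p = dst;
        const double zero = 0.0;        /* +0.0 is 8 bytes of zero bits */

        while (((unsigned long)p & 7) != 0 && len > 0) {
            *p++ = 0;                   /* reach an 8-byte boundary first */
            len--;
        }
        while (len >= 8) {
            memcpy(p, &zero, 8);        /* one 8-byte store per iteration */
            p += 8;
            len -= 8;
        }
        while (len-- > 0)
            *p++ = 0;                   /* trailing bytes */
    }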

The new routines are not used by default.  They are always configured
and can be enabled at runtime using a debugger or an lkm to change
their function pointer, or at compile time using new options (see
another log message).
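
In C terms the dispatch is just a function pointer store.  This is a hedged
sketch, not the kernel source: the real code keeps _bcopy_vector, _bzero and
_ovbcopy_vector in the .data section and ENTRY(bcopy) simply does
jmp *_bcopy_vector.  The enable_i586_copy() helper and the memmove() bodies
are invented stand-ins for the example.

    #include <stddef.h>
    #include <string.h>

    static void
    generic_bcopy(const void *src, void *dst, size_t len)
    {
        memmove(dst, src, len);         /* stand-in for the rep movsl loop */
    }

    static void
    i586_bcopy(const void *src, void *dst, size_t len)
    {
        memmove(dst, src, len);         /* stand-in for the FPU copy loop */
    }

    /* bcopy() itself is only an indirect jump through this vector. */
    static void (*bcopy_vector)(const void *, void *, size_t) = generic_bcopy;

    void
    bcopy(const void *src, void *dst, size_t len)
    {
        (*bcopy_vector)(src, dst, len);
    }

    /* Enabling the new routine is a single pointer store, which is why a
     * debugger or an lkm can flip it at runtime. */
    void
    enable_i586_copy(void)
    {
        bcopy_vector = i586_bcopy;
    }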

Removed old, dead i586_bzero() and i686_bzero().  Read-before-write is
usually bad for i586's: it doubles the memory traffic unless the data
is already cached, data is (or should be) very rarely cached for large
bzero()s (the system should prefer uncached pages for cleaning), and
the amount of data handled by small bzero()s in the kernel is
relatively small.
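
To make the traffic argument concrete, here is a rough C rendering of the two
patterns (a sketch only; the removed loop did the read with a single
"movl 28(%edi),%ecx" to allocate the destination cache line).  For an uncached
target, plain stores move len bytes out to memory, while touching each line
first also reads len bytes in, so the bus traffic roughly doubles.

    #include <stddef.h>
    #include <stdint.h>

    /* Plain stores: write traffic only. */
    void
    zero_plain(uint32_t *dst, size_t nwords)
    {
        size_t i;

        for (i = 0; i < nwords; i++)
            dst[i] = 0;
    }

    /* Read-before-write: touch each 32-byte line before storing to it.
     * For uncached data this adds a full line fill per line, i.e. read
     * traffic equal to the write traffic.  (Tail words omitted for
     * brevity.) */
    void
    zero_read_before_write(volatile uint32_t *dst, size_t nwords)
    {
        size_t i, j;

        for (i = 0; i + 8 <= nwords; i += 8) {
            (void)dst[i + 7];           /* pull the line into the cache */
            for (j = 0; j < 8; j++)
                dst[i + j] = 0;
        }
    }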

Improved comments about overlapping copies.

Removed unused #include.
Bruce Evans 1996-10-09 18:16:17 +00:00
parent 318a4f9fc3
commit 18860b410d
3 changed files with 879 additions and 186 deletions


@@ -30,10 +30,10 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.38 1996/09/10 08:31:57 bde Exp $
* $Id: support.s,v 1.39 1996/09/20 16:52:09 bde Exp $
*/
#include <sys/errno.h>
#include "opt_temporary.h" /* for I586_*_B* */
#include <machine/asmacros.h>
#include <machine/cputypes.h>
@@ -44,10 +44,19 @@
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
.data
.globl _bcopy_vector
_bcopy_vector:
.long _generic_bcopy
.globl _bzero
_bzero: .long _generic_bzero
_bzero:
.long _generic_bzero
.globl _ovbcopy_vector
_ovbcopy_vector:
.long _generic_bcopy
kernel_fpu_lock:
.byte 0xfe
.space 3
.text
@@ -174,66 +183,147 @@ do0:
ret
#endif
#if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
ALTENTRY(i586_bzero)
ENTRY(i686_bzero)
ENTRY(i586_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
/*
* The FPU register method is twice as fast as the integer register
* method unless the target is in the L1 cache and we pre-allocate a
* cache line for it (then the integer register method is 4-5 times
* faster). However, we never pre-allocate cache lines, since that
* would make the integer method 25% or more slower for the common
* case when the target isn't in either the L1 cache or the L2 cache.
* Thus we normally use the FPU register method unless the overhead
* would be too large.
*/
cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
jb intreg_i586_bzero
/*
* The FPU registers may belong to an application or to fastmove()
* or to another invocation of bcopy() or ourself in a higher level
* interrupt or trap handler. Preserving the registers is
* complicated since we avoid it if possible at all levels. We
* want to localize the complications even when that increases them.
* Here the extra work involves preserving CR0_TS in TS.
* `npxproc != NULL' is supposed to be the condition that all the
* FPU resources belong to an application, but npxproc and CR0_TS
* aren't set atomically enough for this condition to work in
* interrupt handlers.
*
* Case 1: FPU registers belong to the application: we must preserve
* the registers if we use them, so we only use the FPU register
* method if the target size is large enough to amortize the extra
* overhead for preserving them. CR0_TS must be preserved although
* it is very likely to end up as set.
*
* Case 2: FPU registers belong to fastmove(): fastmove() currently
* makes the registers look like they belong to an application so
* that cpu_switch() and savectx() don't have to know about it, so
* this case reduces to case 1.
*
* Case 3: FPU registers belong to the kernel: don't use the FPU
* register method. This case is unlikely, and supporting it would
* be more complicated and might take too much stack.
*
* Case 4: FPU registers don't belong to anyone: the FPU registers
* don't need to be preserved, so we always use the FPU register
* method. CR0_TS must be preserved although it is very likely to
* always end up as clear.
*/
cmpl $0,_npxproc
je i586_bz1
cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
jb intreg_i586_bzero
sarb $1,kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
clts
subl $108,%esp
fnsave 0(%esp)
jmp i586_bz2
i586_bz1:
sarb $1,kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
clts
fninit /* XXX should avoid needing this */
i586_bz2:
fldz
/*
* Align to an 8 byte boundary (misalignment in the main loop would
* cost a factor of >= 2). Avoid jumps (at little cost if it is
* already aligned) by always zeroing 8 bytes and using the part up
* to the _next_ alignment position.
*/
fstl 0(%edx)
addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
addl $8,%edx
andl $~7,%edx
subl %edx,%ecx
/*
* Similarly align `len' to a multiple of 8.
*/
fstl -8(%edx,%ecx)
decl %ecx
andl $~7,%ecx
/*
* This wouldn't be any faster if it were unrolled, since the loop
* control instructions are much faster than the fstl and/or done
* in parallel with it so their overhead is insignificant.
*/
fpureg_i586_bzero_loop:
fstl 0(%edx)
addl $8,%edx
subl $8,%ecx
cmpl $8,%ecx
jae fpureg_i586_bzero_loop
cmpl $0,_npxproc
je i586_bz3
frstor 0(%esp)
addl $108,%esp
lmsw %ax
movb $0xfe,kernel_fpu_lock
ret
i586_bz3:
fstpl %st(0)
lmsw %ax
movb $0xfe,kernel_fpu_lock
ret
intreg_i586_bzero:
/*
* `rep stos' seems to be the best method in practice for small
* counts. Fancy methods usually take too long to start up due
* to cache and BTB misses.
*/
pushl %edi
movl 8(%esp),%edi /* destination pointer */
movl 12(%esp),%edx /* size (in 8-bit words) */
xorl %eax,%eax /* store data */
cld
/* If less than 100 bytes to write, skip tricky code. */
cmpl $100,%edx
movl %edx,%ecx /* needed when branch is taken! */
jl 2f
/* First write 0-3 bytes to make the pointer 32-bit aligned. */
movl %edi,%ecx /* Copy ptr to ecx... */
negl %ecx /* ...and negate that and... */
andl $3,%ecx /* ...mask to get byte count. */
subl %ecx,%edx /* adjust global byte count */
rep
stosb
subl $32,%edx /* offset count for unrolled loop */
movl (%edi),%ecx /* Fetch destination cache line */
.align 2,0x90 /* supply 0x90 for broken assemblers */
1:
movl 28(%edi),%ecx /* allocate cache line for destination */
subl $32,%edx /* decr loop count */
movl %eax,0(%edi) /* store words pairwise */
movl %eax,4(%edi)
movl %eax,8(%edi)
movl %eax,12(%edi)
movl %eax,16(%edi)
movl %eax,20(%edi)
movl %eax,24(%edi)
movl %eax,28(%edi)
leal 32(%edi),%edi /* update destination pointer */
jge 1b
leal 32(%edx),%ecx
/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
2:
movl %edx,%edi
xorl %eax,%eax
shrl $2,%ecx
cld
rep
stosl
/* Finally write the last 0-3 bytes. */
movl %edx,%ecx
movl 12(%esp),%ecx
andl $3,%ecx
rep
stosb
jne 1f
popl %edi
ret
#endif
#endif
1:
rep
stosb
popl %edi
ret
#endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
@@ -256,7 +346,7 @@ bcopyb:
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping? */
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
@@ -279,13 +369,19 @@ bcopyb:
cld
ret
ENTRY(bcopy)
MEXITCOUNT
jmp *_bcopy_vector
ENTRY(ovbcopy)
MEXITCOUNT
jmp *_ovbcopy_vector
/*
* (ov)bcopy(src, dst, cnt)
* generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
ALTENTRY(ovbcopy)
ENTRY(bcopy)
bcopy:
ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
@@ -294,7 +390,7 @@ bcopy:
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping? */
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
@@ -330,6 +426,141 @@ bcopy:
cld
ret
ENTRY(i586_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
movl 16(%esp),%edi
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cmpl $1024,%ecx
jb small_i586_bcopy
sarb $1,kernel_fpu_lock
jc small_i586_bcopy
cmpl $0,_npxproc
je i586_bc1
smsw %dx
clts
subl $108,%esp
fnsave 0(%esp)
jmp 4f
i586_bc1:
smsw %dx
clts
fninit /* XXX should avoid needing this */
ALIGN_TEXT
4:
pushl %ecx
#define DCACHE_SIZE 8192
cmpl $(DCACHE_SIZE-512)/2,%ecx
jbe 2f
movl $(DCACHE_SIZE-512)/2,%ecx
2:
subl %ecx,0(%esp)
cmpl $256,%ecx
jb 5f /* XXX should prefetch if %ecx >= 32 */
pushl %esi
pushl %ecx
ALIGN_TEXT
3:
movl 0(%esi),%eax
movl 32(%esi),%eax
movl 64(%esi),%eax
movl 96(%esi),%eax
movl 128(%esi),%eax
movl 160(%esi),%eax
movl 192(%esi),%eax
movl 224(%esi),%eax
addl $256,%esi
subl $256,%ecx
cmpl $256,%ecx
jae 3b
popl %ecx
popl %esi
5:
ALIGN_TEXT
large_i586_bcopy_loop:
fildq 0(%esi)
fildq 8(%esi)
fildq 16(%esi)
fildq 24(%esi)
fildq 32(%esi)
fildq 40(%esi)
fildq 48(%esi)
fildq 56(%esi)
fistpq 56(%edi)
fistpq 48(%edi)
fistpq 40(%edi)
fistpq 32(%edi)
fistpq 24(%edi)
fistpq 16(%edi)
fistpq 8(%edi)
fistpq 0(%edi)
addl $64,%esi
addl $64,%edi
subl $64,%ecx
cmpl $64,%ecx
jae large_i586_bcopy_loop
popl %eax
addl %eax,%ecx
cmpl $64,%ecx
jae 4b
cmpl $0,_npxproc
je i586_bc2
frstor 0(%esp)
addl $108,%esp
i586_bc2:
lmsw %dx
movb $0xfe,kernel_fpu_lock
/*
* This is a duplicate of the main part of generic_bcopy. See the comments
* there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
* would mess up high resolution profiling.
*/
ALIGN_TEXT
small_i586_bcopy:
shrl $2,%ecx
cld
rep
movsl
movl 20(%esp),%ecx
andl $3,%ecx
rep
movsb
popl %edi
popl %esi
ret
ALIGN_TEXT
1:
addl %ecx,%edi
addl %ecx,%esi
decl %edi
decl %esi
andl $3,%ecx
std
rep
movsb
movl 20(%esp),%ecx
shrl $2,%ecx
subl $3,%esi
subl $3,%edi
rep
movsl
popl %edi
popl %esi
cld
ret
/*
* Note: memcpy does not support overlapping copies
