1) enhanced in_cksum from Bruce Evans.

2) minor comment change in machdep.c
3) enhanced bzero from John Dyson (twice as fast on a 486DX/33)
This commit is contained in:
David Greenman 1994-03-07 11:47:32 +00:00
parent 04f1835605
commit 5c09563e1e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=1247
6 changed files with 378 additions and 28 deletions

View File

@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
*/
#include "npx.h"
@ -709,7 +709,7 @@ boot(arghowto)
DELAY (100000); /* wait 100ms for printf's to complete */
cpu_reset();
for(;;) ;
/*NOTREACHED*/
/* NOTREACHED */
}
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
@ -1133,7 +1133,7 @@ init386(first)
#ifndef LARGEMEM
if (biosextmem > 65536) {
panic("extended memory beyond limit of 64MB");
/* NOT REACHED */
/* NOTREACHED */
}
#endif

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)

View File

@ -32,7 +32,7 @@
*
* from tahoe: in_cksum.c 1.2 86/01/05
* from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
* $Id: in_cksum.c,v 1.3 1993/11/25 01:30:53 wollman Exp $
* $Id: in_cksum.c,v 1.4 1993/12/19 00:50:02 wollman Exp $
*/
#include "param.h"
@ -56,9 +56,10 @@
* Thanks to gcc we don't have to guess
* which registers contain sum & w.
*/
#define CLC asm("clc")
#define ADD(n) asm("adcl " #n "(%2), %0": "=r"(sum): "0"(sum), "r"(w))
#define MOP asm("adcl $0, %0": "=r"(sum): "0"(sum))
#define ADD(n) asm("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
#define ADDC(n) asm("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
#define LOAD(n) asm volatile("movb " #n "(%1), %0" : "=r" (junk) : "r" (w))
#define MOP asm("adcl $0, %0" : "=r" (sum) : "0" (sum))
int
in_cksum(m, len)
@ -113,29 +114,91 @@ in_cksum(m, len)
mlen -= 2;
}
}
/*
* Advance to a 486 cache line boundary.
*/
if (4 & (int) w && mlen >= 4) {
ADD(0);
MOP;
w += 2;
mlen -= 4;
}
if (8 & (int) w && mlen >= 8) {
ADD(0);
ADDC(4);
MOP;
w += 4;
mlen -= 8;
}
/*
* Do as much of the checksum as possible 32 bits at a time.
* In fact, this loop is unrolled to make overhead from
* branches &c small.
*/
mlen -= 1;
while ((mlen -= 32) >= 0) {
u_char junk;
/*
* Clear the carry flag, add with carry 16 words
* and fold-in last carry by adding a 0 with carry.
* Add with carry 16 words and fold in the last
* carry by adding a 0 with carry.
*
* The early ADD(16) and the LOAD(32) are to load
* the next 2 cache lines in advance on 486's. The
* 486 has a penalty of 2 clock cycles for loading
* a cache line, plus whatever time the external
* memory takes to load the first word(s) addressed.
* These penalties are unavoidable. Subsequent
* accesses to a cache line being loaded (and to
* other external memory?) are delayed until the
* whole load finishes. These penalties are mostly
* avoided by not accessing external memory for
* 8 cycles after the ADD(16) and 12 cycles after
* the LOAD(32). The loop terminates when mlen
* is initially 33 (not 32) to guarantee that
* the LOAD(32) is within bounds.
*/
CLC;
ADD(0); ADD(4); ADD(8); ADD(12);
ADD(16); ADD(20); ADD(24); ADD(28);
MOP; w += 16;
ADD(16);
ADDC(0);
ADDC(4);
ADDC(8);
ADDC(12);
LOAD(32);
ADDC(20);
ADDC(24);
ADDC(28);
MOP;
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
CLC;
ADD(0); ADD(4);
mlen += 32 + 1;
if (mlen >= 32) {
ADD(16);
ADDC(0);
ADDC(4);
ADDC(8);
ADDC(12);
ADDC(20);
ADDC(24);
ADDC(28);
MOP;
w += 16;
mlen -= 32;
}
if (mlen >= 16) {
ADD(0);
ADDC(4);
ADDC(8);
ADDC(12);
MOP;
w += 8;
mlen -= 16;
}
if (mlen >= 8) {
ADD(0);
ADDC(4);
MOP;
w += 4;
mlen -= 8;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
continue; /* worth 1% maybe ?? */
REDUCE;
@ -172,4 +235,3 @@ in_cksum(m, len)
REDUCE;
return (~sum & 0xffff);
}

View File

@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
*/
#include "npx.h"
@ -709,7 +709,7 @@ boot(arghowto)
DELAY (100000); /* wait 100ms for printf's to complete */
cpu_reset();
for(;;) ;
/*NOTREACHED*/
/* NOTREACHED */
}
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
@ -1133,7 +1133,7 @@ init386(first)
#ifndef LARGEMEM
if (biosextmem > 65536) {
panic("extended memory beyond limit of 64MB");
/* NOT REACHED */
/* NOTREACHED */
}
#endif

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)