From 5c09563e1e3d5f616ed75581f7be089cb801f719 Mon Sep 17 00:00:00 2001 From: David Greenman Date: Mon, 7 Mar 1994 11:47:32 +0000 Subject: [PATCH] 1) enhanced in_cksum from Bruce Evans. 2) minor comment change in machdep.c 3) enhanced bzero from John Dyson (twice as fast on a 486DX/33) --- sys/amd64/amd64/machdep.c | 6 +-- sys/amd64/amd64/support.S | 100 +++++++++++++++++++++++++++++++++++++- sys/amd64/amd64/support.s | 100 +++++++++++++++++++++++++++++++++++++- sys/i386/i386/in_cksum.c | 94 +++++++++++++++++++++++++++++------ sys/i386/i386/machdep.c | 6 +-- sys/i386/i386/support.s | 100 +++++++++++++++++++++++++++++++++++++- 6 files changed, 378 insertions(+), 28 deletions(-) diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 29ec72e26633..cf872b0877d1 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $ + * $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $ */ #include "npx.h" @@ -709,7 +709,7 @@ boot(arghowto) DELAY (100000); /* wait 100ms for printf's to complete */ cpu_reset(); for(;;) ; - /*NOTREACHED*/ + /* NOTREACHED */ } unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */ @@ -1133,7 +1133,7 @@ init386(first) #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); - /* NOT REACHED */ + /* NOTREACHED */ } #endif diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 2a0bd34de406..190b835ff40a 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $ + * $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */ /* * bcopy family */ -/* void bzero(void *base, u_int cnt) */ +/* + * void bzero(void *base, u_int cnt) + * Special code for I486 because stosl uses lots + * of clocks. Makes little or no difference on DX2 type + * machines, but stosl is about 1/2 as fast as + * memory moves on standard DX !!!!! + */ + ENTRY(bzero) +#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU)) + cmpl $CPUCLASS_486,_cpu_class + jz 1f +#endif +#if defined(I386_CPU) || defined(I586_CPU) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -167,6 +179,90 @@ ENTRY(bzero) stosb popl %edi ret + .align 4 +#endif +#if defined(I486_CPU) +1: + movl 4(%esp),%edx + movl 8(%esp),%ecx + xorl %eax,%eax +/ +/ do 64 byte chunks first +/ +2: + cmpl $64,%ecx + jb 3f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + movl %eax,16(%edx) + movl %eax,20(%edx) + movl %eax,24(%edx) + movl %eax,28(%edx) + movl %eax,32(%edx) + movl %eax,36(%edx) + movl %eax,40(%edx) + movl %eax,44(%edx) + movl %eax,48(%edx) + movl %eax,52(%edx) + movl %eax,56(%edx) + movl %eax,60(%edx) + addl $64,%edx + subl $64,%ecx + jnz 2b + ret + .align 4 +/ +/ do 16 byte chunks +/ +3: + cmpl $16,%ecx + jb 4f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + addl $16,%edx + subl $16,%ecx + jnz 3b + ret + .align 4 +/ +/ do 4 byte chunks +/ +4: cmpl $4,%ecx + jb 5f + movl %eax,(%edx) + addl $4,%edx + subl $4,%ecx + jnz 4b + ret +/ +/ do 1 byte chunks -- this appears to be faster than a loop +/ + .align 4 +jtab: .long do0 + .long do1 + .long do2 + .long do3 + + .align 4 +5: jmp jtab(,%ecx,4) + + .align 2 +do3: movb $0,(%edx) + incl %edx + movw $0,(%edx) + ret + .align 2 +do2: movw $0,(%edx) + ret + .align 2 +do1: movb $0,(%edx) +do0: ret + +#endif 
/* fillw(pat, base, cnt) */ ENTRY(fillw) diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s index 2a0bd34de406..190b835ff40a 100644 --- a/sys/amd64/amd64/support.s +++ b/sys/amd64/amd64/support.s @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $ + * $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */ /* * bcopy family */ -/* void bzero(void *base, u_int cnt) */ +/* + * void bzero(void *base, u_int cnt) + * Special code for I486 because stosl uses lots + * of clocks. Makes little or no difference on DX2 type + * machines, but stosl is about 1/2 as fast as + * memory moves on standard DX !!!!! + */ + ENTRY(bzero) +#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU)) + cmpl $CPUCLASS_486,_cpu_class + jz 1f +#endif +#if defined(I386_CPU) || defined(I586_CPU) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -167,6 +179,90 @@ ENTRY(bzero) stosb popl %edi ret + .align 4 +#endif +#if defined(I486_CPU) +1: + movl 4(%esp),%edx + movl 8(%esp),%ecx + xorl %eax,%eax +/ +/ do 64 byte chunks first +/ +2: + cmpl $64,%ecx + jb 3f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + movl %eax,16(%edx) + movl %eax,20(%edx) + movl %eax,24(%edx) + movl %eax,28(%edx) + movl %eax,32(%edx) + movl %eax,36(%edx) + movl %eax,40(%edx) + movl %eax,44(%edx) + movl %eax,48(%edx) + movl %eax,52(%edx) + movl %eax,56(%edx) + movl %eax,60(%edx) + addl $64,%edx + subl $64,%ecx + jnz 2b + ret + .align 4 +/ +/ do 16 byte chunks +/ +3: + cmpl $16,%ecx + jb 4f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + addl $16,%edx + subl $16,%ecx + jnz 3b + ret + .align 4 +/ +/ do 4 byte chunks +/ +4: cmpl $4,%ecx + jb 5f + movl %eax,(%edx) + addl $4,%edx + subl $4,%ecx + jnz 4b + ret +/ +/ 
do 1 byte chunks -- this appears to be faster than a loop +/ + .align 4 +jtab: .long do0 + .long do1 + .long do2 + .long do3 + + .align 4 +5: jmp jtab(,%ecx,4) + + .align 2 +do3: movb $0,(%edx) + incl %edx + movw $0,(%edx) + ret + .align 2 +do2: movw $0,(%edx) + ret + .align 2 +do1: movb $0,(%edx) +do0: ret + +#endif /* fillw(pat, base, cnt) */ ENTRY(fillw) diff --git a/sys/i386/i386/in_cksum.c b/sys/i386/i386/in_cksum.c index c6a2d7e739d9..b11f0ffe7884 100644 --- a/sys/i386/i386/in_cksum.c +++ b/sys/i386/i386/in_cksum.c @@ -32,7 +32,7 @@ * * from tahoe: in_cksum.c 1.2 86/01/05 * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91 - * $Id: in_cksum.c,v 1.3 1993/11/25 01:30:53 wollman Exp $ + * $Id: in_cksum.c,v 1.4 1993/12/19 00:50:02 wollman Exp $ */ #include "param.h" @@ -56,9 +56,10 @@ * Thanks to gcc we don't have to guess * which registers contain sum & w. */ -#define CLC asm("clc") -#define ADD(n) asm("adcl " #n "(%2), %0": "=r"(sum): "0"(sum), "r"(w)) -#define MOP asm("adcl $0, %0": "=r"(sum): "0"(sum)) +#define ADD(n) asm("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) +#define ADDC(n) asm("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) +#define LOAD(n) asm volatile("movb " #n "(%1), %0" : "=r" (junk) : "r" (w)) +#define MOP asm("adcl $0, %0" : "=r" (sum) : "0" (sum)) int in_cksum(m, len) @@ -113,29 +114,91 @@ in_cksum(m, len) mlen -= 2; } } + /* + * Advance to a 486 cache line boundary. + */ + if (4 & (int) w && mlen >= 4) { + ADD(0); + MOP; + w += 2; + mlen -= 4; + } + if (8 & (int) w && mlen >= 8) { + ADD(0); + ADDC(4); + MOP; + w += 4; + mlen -= 8; + } /* * Do as much of the checksum as possible 32 bits at at time. * In fact, this loop is unrolled to make overhead from * branches &c small. */ + mlen -= 1; while ((mlen -= 32) >= 0) { + u_char junk; /* - * Clear the carry flag, add with carry 16 words - * and fold-in last carry by adding a 0 with carry. + * Add with carry 16 words and fold in the last + * carry by adding a 0 with carry. 
+ * + * The early ADD(16) and the LOAD(32) are to load + * the next 2 cache lines in advance on 486's. The + * 486 has a penalty of 2 clock cycles for loading + * a cache line, plus whatever time the external + * memory takes to load the first word(s) addressed. + * These penalties are unavoidable. Subsequent + * accesses to a cache line being loaded (and to + * other external memory?) are delayed until the + * whole load finishes. These penalties are mostly + * avoided by not accessing external memory for + * 8 cycles after the ADD(16) and 12 cycles after + * the LOAD(32). The loop terminates when mlen + * is initially 33 (not 32) to guarantee that + * the LOAD(32) is within bounds. */ - CLC; - ADD(0); ADD(4); ADD(8); ADD(12); - ADD(16); ADD(20); ADD(24); ADD(28); - MOP; w += 16; + ADD(16); + ADDC(0); + ADDC(4); + ADDC(8); + ADDC(12); + LOAD(32); + ADDC(20); + ADDC(24); + ADDC(28); + MOP; + w += 16; } - mlen += 32; - while ((mlen -= 8) >= 0) { - CLC; - ADD(0); ADD(4); + mlen += 32 + 1; + if (mlen >= 32) { + ADD(16); + ADDC(0); + ADDC(4); + ADDC(8); + ADDC(12); + ADDC(20); + ADDC(24); + ADDC(28); + MOP; + w += 16; + mlen -= 32; + } + if (mlen >= 16) { + ADD(0); + ADDC(4); + ADDC(8); + ADDC(12); + MOP; + w += 8; + mlen -= 16; + } + if (mlen >= 8) { + ADD(0); + ADDC(4); MOP; w += 4; + mlen -= 8; } - mlen += 8; if (mlen == 0 && byte_swapped == 0) continue; /* worth 1% maybe ?? */ REDUCE; @@ -172,4 +235,3 @@ in_cksum(m, len) REDUCE; return (~sum & 0xffff); } - diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 29ec72e26633..cf872b0877d1 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $ + * $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $ */ #include "npx.h" @@ -709,7 +709,7 @@ boot(arghowto) DELAY (100000); /* wait 100ms for printf's to complete */ cpu_reset(); for(;;) ; - /*NOTREACHED*/ + /* NOTREACHED */ } unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */ @@ -1133,7 +1133,7 @@ init386(first) #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); - /* NOT REACHED */ + /* NOTREACHED */ } #endif diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index 2a0bd34de406..190b835ff40a 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $ + * $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $ */ #include "assym.s" /* system definitions */ @@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */ /* * bcopy family */ -/* void bzero(void *base, u_int cnt) */ +/* + * void bzero(void *base, u_int cnt) + * Special code for I486 because stosl uses lots + * of clocks. Makes little or no difference on DX2 type + * machines, but stosl is about 1/2 as fast as + * memory moves on standard DX !!!!! 
+ */ + ENTRY(bzero) +#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU)) + cmpl $CPUCLASS_486,_cpu_class + jz 1f +#endif +#if defined(I386_CPU) || defined(I586_CPU) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx @@ -167,6 +179,90 @@ ENTRY(bzero) stosb popl %edi ret + .align 4 +#endif +#if defined(I486_CPU) +1: + movl 4(%esp),%edx + movl 8(%esp),%ecx + xorl %eax,%eax +/ +/ do 64 byte chunks first +/ +2: + cmpl $64,%ecx + jb 3f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + movl %eax,16(%edx) + movl %eax,20(%edx) + movl %eax,24(%edx) + movl %eax,28(%edx) + movl %eax,32(%edx) + movl %eax,36(%edx) + movl %eax,40(%edx) + movl %eax,44(%edx) + movl %eax,48(%edx) + movl %eax,52(%edx) + movl %eax,56(%edx) + movl %eax,60(%edx) + addl $64,%edx + subl $64,%ecx + jnz 2b + ret + .align 4 +/ +/ do 16 byte chunks +/ +3: + cmpl $16,%ecx + jb 4f + movl %eax,(%edx) + movl %eax,4(%edx) + movl %eax,8(%edx) + movl %eax,12(%edx) + addl $16,%edx + subl $16,%ecx + jnz 3b + ret + .align 4 +/ +/ do 4 byte chunks +/ +4: cmpl $4,%ecx + jb 5f + movl %eax,(%edx) + addl $4,%edx + subl $4,%ecx + jnz 4b + ret +/ +/ do 1 byte chunks -- this appears to be faster than a loop +/ + .align 4 +jtab: .long do0 + .long do1 + .long do2 + .long do3 + + .align 4 +5: jmp jtab(,%ecx,4) + + .align 2 +do3: movb $0,(%edx) + incl %edx + movw $0,(%edx) + ret + .align 2 +do2: movw $0,(%edx) + ret + .align 2 +do1: movb $0,(%edx) +do0: ret + +#endif /* fillw(pat, base, cnt) */ ENTRY(fillw)