1) enhanced in_cksum from Bruce Evans.

2) minor comment change in machdep.c
3) enhanced bzero from John Dyson (twice as fast on a 486DX/33)
This commit is contained in:
David Greenman 1994-03-07 11:47:32 +00:00
parent 04f1835605
commit 5c09563e1e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=1247
6 changed files with 378 additions and 28 deletions

View File

@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
*/
#include "npx.h"
@ -709,7 +709,7 @@ boot(arghowto)
DELAY (100000); /* wait 100ms for printf's to complete */
cpu_reset();
for(;;) ;
/*NOTREACHED*/
/* NOTREACHED */
}
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
@ -1133,7 +1133,7 @@ init386(first)
#ifndef LARGEMEM
if (biosextmem > 65536) {
panic("extended memory beyond limit of 64MB");
/* NOT REACHED */
/* NOTREACHED */
}
#endif

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)

View File

@ -32,7 +32,7 @@
*
* from tahoe: in_cksum.c 1.2 86/01/05
* from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
* $Id: in_cksum.c,v 1.3 1993/11/25 01:30:53 wollman Exp $
* $Id: in_cksum.c,v 1.4 1993/12/19 00:50:02 wollman Exp $
*/
#include "param.h"
@ -56,9 +56,10 @@
* Thanks to gcc we don't have to guess
* which registers contain sum & w.
*/
#define CLC asm("clc")
#define ADD(n) asm("adcl " #n "(%2), %0": "=r"(sum): "0"(sum), "r"(w))
#define MOP asm("adcl $0, %0": "=r"(sum): "0"(sum))
#define ADD(n) asm("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
#define ADDC(n) asm("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
#define LOAD(n) asm volatile("movb " #n "(%1), %0" : "=r" (junk) : "r" (w))
#define MOP asm("adcl $0, %0" : "=r" (sum) : "0" (sum))
int
in_cksum(m, len)
@ -113,29 +114,91 @@ in_cksum(m, len)
mlen -= 2;
}
}
/*
* Advance to a 486 cache line boundary.
*/
if (4 & (int) w && mlen >= 4) {
ADD(0);
MOP;
w += 2;
mlen -= 4;
}
if (8 & (int) w && mlen >= 8) {
ADD(0);
ADDC(4);
MOP;
w += 4;
mlen -= 8;
}
/*
* Do as much of the checksum as possible 32 bits at a time.
* In fact, this loop is unrolled to make overhead from
* branches &c small.
*/
mlen -= 1;
while ((mlen -= 32) >= 0) {
u_char junk;
/*
* Clear the carry flag, add with carry 16 words
* and fold-in last carry by adding a 0 with carry.
* Add with carry 16 words and fold in the last
* carry by adding a 0 with carry.
*
* The early ADD(16) and the LOAD(32) are to load
* the next 2 cache lines in advance on 486's. The
* 486 has a penalty of 2 clock cycles for loading
* a cache line, plus whatever time the external
* memory takes to load the first word(s) addressed.
* These penalties are unavoidable. Subsequent
* accesses to a cache line being loaded (and to
* other external memory?) are delayed until the
* whole load finishes. These penalties are mostly
* avoided by not accessing external memory for
* 8 cycles after the ADD(16) and 12 cycles after
* the LOAD(32). The loop terminates when mlen
* is initially 33 (not 32) to guarantee that
* the LOAD(32) is within bounds.
*/
CLC;
ADD(0); ADD(4); ADD(8); ADD(12);
ADD(16); ADD(20); ADD(24); ADD(28);
MOP; w += 16;
ADD(16);
ADDC(0);
ADDC(4);
ADDC(8);
ADDC(12);
LOAD(32);
ADDC(20);
ADDC(24);
ADDC(28);
MOP;
w += 16;
}
mlen += 32;
while ((mlen -= 8) >= 0) {
CLC;
ADD(0); ADD(4);
mlen += 32 + 1;
if (mlen >= 32) {
ADD(16);
ADDC(0);
ADDC(4);
ADDC(8);
ADDC(12);
ADDC(20);
ADDC(24);
ADDC(28);
MOP;
w += 16;
mlen -= 32;
}
if (mlen >= 16) {
ADD(0);
ADDC(4);
ADDC(8);
ADDC(12);
MOP;
w += 8;
mlen -= 16;
}
if (mlen >= 8) {
ADD(0);
ADDC(4);
MOP;
w += 4;
mlen -= 8;
}
mlen += 8;
if (mlen == 0 && byte_swapped == 0)
continue; /* worth 1% maybe ?? */
REDUCE;
@ -172,4 +235,3 @@ in_cksum(m, len)
REDUCE;
return (~sum & 0xffff);
}

View File

@ -35,7 +35,7 @@
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
*/
#include "npx.h"
@ -709,7 +709,7 @@ boot(arghowto)
DELAY (100000); /* wait 100ms for printf's to complete */
cpu_reset();
for(;;) ;
/*NOTREACHED*/
/* NOTREACHED */
}
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
@ -1133,7 +1133,7 @@ init386(first)
#ifndef LARGEMEM
if (biosextmem > 65536) {
panic("extended memory beyond limit of 64MB");
/* NOT REACHED */
/* NOTREACHED */
}
#endif

View File

@ -30,7 +30,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
*/
#include "assym.s" /* system definitions */
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
/*
* bcopy family
*/
/* void bzero(void *base, u_int cnt) */
/*
* void bzero(void *base, u_int cnt)
* Special code for I486 because stosl uses lots
* of clocks. Makes little or no difference on DX2 type
* machines, but stosl is about 1/2 as fast as
* memory moves on standard DX !!!!!
*/
ENTRY(bzero)
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
cmpl $CPUCLASS_486,_cpu_class
jz 1f
#endif
#if defined(I386_CPU) || defined(I586_CPU)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
@ -167,6 +179,90 @@ ENTRY(bzero)
stosb
popl %edi
ret
.align 4
#endif
#if defined(I486_CPU)
1:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/
/ do 64 byte chunks first
/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.align 4
/
/ do 16 byte chunks
/
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.align 4
/
/ do 4 byte chunks
/
4: cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/
/ do 1 byte chunks -- this appears to be faster than a loop
/
.align 4
jtab: .long do0
.long do1
.long do2
.long do3
.align 4
5: jmp jtab(,%ecx,4)
.align 2
do3: movb $0,(%edx)
incl %edx
movw $0,(%edx)
ret
.align 2
do2: movw $0,(%edx)
ret
.align 2
do1: movb $0,(%edx)
do0: ret
#endif
/* fillw(pat, base, cnt) */
ENTRY(fillw)