1) enhanced in_cksum from Bruce Evans.
2) minor comment change in machdep.c
3) enhanced bzero from John Dyson (twice as fast on a 486DX/33)
This commit is contained in:
parent
04f1835605
commit
5c09563e1e
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=1247
@ -35,7 +35,7 @@
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
|
||||
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
|
||||
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
|
||||
*/
|
||||
|
||||
#include "npx.h"
|
||||
@ -709,7 +709,7 @@ boot(arghowto)
|
||||
DELAY (100000); /* wait 100ms for printf's to complete */
|
||||
cpu_reset();
|
||||
for(;;) ;
|
||||
/*NOTREACHED*/
|
||||
/* NOTREACHED */
|
||||
}
|
||||
|
||||
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
|
||||
@ -1133,7 +1133,7 @@ init386(first)
|
||||
#ifndef LARGEMEM
|
||||
if (biosextmem > 65536) {
|
||||
panic("extended memory beyond limit of 64MB");
|
||||
/* NOT REACHED */
|
||||
/* NOTREACHED */
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -30,7 +30,7 @@
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
|
||||
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
|
||||
*/
|
||||
|
||||
#include "assym.s" /* system definitions */
|
||||
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
|
||||
/*
|
||||
* bcopy family
|
||||
*/
|
||||
/* void bzero(void *base, u_int cnt) */
|
||||
/*
|
||||
* void bzero(void *base, u_int cnt)
|
||||
* Special code for I486 because stosl uses lots
|
||||
* of clocks. Makes little or no difference on DX2 type
|
||||
* machines, but stosl is about 1/2 as fast as
|
||||
* memory moves on standard DX !!!!!
|
||||
*/
|
||||
|
||||
ENTRY(bzero)
|
||||
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
|
||||
cmpl $CPUCLASS_486,_cpu_class
|
||||
jz 1f
|
||||
#endif
|
||||
#if defined(I386_CPU) || defined(I586_CPU)
|
||||
pushl %edi
|
||||
movl 8(%esp),%edi
|
||||
movl 12(%esp),%ecx
|
||||
@ -167,6 +179,90 @@ ENTRY(bzero)
|
||||
stosb
|
||||
popl %edi
|
||||
ret
|
||||
.align 4
|
||||
#endif
|
||||
#if defined(I486_CPU)
|
||||
1:
|
||||
movl 4(%esp),%edx
|
||||
movl 8(%esp),%ecx
|
||||
xorl %eax,%eax
|
||||
/
|
||||
/ do 64 byte chunks first
|
||||
/
|
||||
2:
|
||||
cmpl $64,%ecx
|
||||
jb 3f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
movl %eax,16(%edx)
|
||||
movl %eax,20(%edx)
|
||||
movl %eax,24(%edx)
|
||||
movl %eax,28(%edx)
|
||||
movl %eax,32(%edx)
|
||||
movl %eax,36(%edx)
|
||||
movl %eax,40(%edx)
|
||||
movl %eax,44(%edx)
|
||||
movl %eax,48(%edx)
|
||||
movl %eax,52(%edx)
|
||||
movl %eax,56(%edx)
|
||||
movl %eax,60(%edx)
|
||||
addl $64,%edx
|
||||
subl $64,%ecx
|
||||
jnz 2b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 16 byte chunks
|
||||
/
|
||||
3:
|
||||
cmpl $16,%ecx
|
||||
jb 4f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
addl $16,%edx
|
||||
subl $16,%ecx
|
||||
jnz 3b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 4 byte chunks
|
||||
/
|
||||
4: cmpl $4,%ecx
|
||||
jb 5f
|
||||
movl %eax,(%edx)
|
||||
addl $4,%edx
|
||||
subl $4,%ecx
|
||||
jnz 4b
|
||||
ret
|
||||
/
|
||||
/ do 1 byte chunks -- this appears to be faster than a loop
|
||||
/
|
||||
.align 4
|
||||
jtab: .long do0
|
||||
.long do1
|
||||
.long do2
|
||||
.long do3
|
||||
|
||||
.align 4
|
||||
5: jmp jtab(,%ecx,4)
|
||||
|
||||
.align 2
|
||||
do3: movb $0,(%edx)
|
||||
incl %edx
|
||||
movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do2: movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do1: movb $0,(%edx)
|
||||
do0: ret
|
||||
|
||||
#endif
|
||||
|
||||
/* fillw(pat, base, cnt) */
|
||||
ENTRY(fillw)
|
||||
|
@ -30,7 +30,7 @@
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
|
||||
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
|
||||
*/
|
||||
|
||||
#include "assym.s" /* system definitions */
|
||||
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
|
||||
/*
|
||||
* bcopy family
|
||||
*/
|
||||
/* void bzero(void *base, u_int cnt) */
|
||||
/*
|
||||
* void bzero(void *base, u_int cnt)
|
||||
* Special code for I486 because stosl uses lots
|
||||
* of clocks. Makes little or no difference on DX2 type
|
||||
* machines, but stosl is about 1/2 as fast as
|
||||
* memory moves on standard DX !!!!!
|
||||
*/
|
||||
|
||||
ENTRY(bzero)
|
||||
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
|
||||
cmpl $CPUCLASS_486,_cpu_class
|
||||
jz 1f
|
||||
#endif
|
||||
#if defined(I386_CPU) || defined(I586_CPU)
|
||||
pushl %edi
|
||||
movl 8(%esp),%edi
|
||||
movl 12(%esp),%ecx
|
||||
@ -167,6 +179,90 @@ ENTRY(bzero)
|
||||
stosb
|
||||
popl %edi
|
||||
ret
|
||||
.align 4
|
||||
#endif
|
||||
#if defined(I486_CPU)
|
||||
1:
|
||||
movl 4(%esp),%edx
|
||||
movl 8(%esp),%ecx
|
||||
xorl %eax,%eax
|
||||
/
|
||||
/ do 64 byte chunks first
|
||||
/
|
||||
2:
|
||||
cmpl $64,%ecx
|
||||
jb 3f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
movl %eax,16(%edx)
|
||||
movl %eax,20(%edx)
|
||||
movl %eax,24(%edx)
|
||||
movl %eax,28(%edx)
|
||||
movl %eax,32(%edx)
|
||||
movl %eax,36(%edx)
|
||||
movl %eax,40(%edx)
|
||||
movl %eax,44(%edx)
|
||||
movl %eax,48(%edx)
|
||||
movl %eax,52(%edx)
|
||||
movl %eax,56(%edx)
|
||||
movl %eax,60(%edx)
|
||||
addl $64,%edx
|
||||
subl $64,%ecx
|
||||
jnz 2b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 16 byte chunks
|
||||
/
|
||||
3:
|
||||
cmpl $16,%ecx
|
||||
jb 4f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
addl $16,%edx
|
||||
subl $16,%ecx
|
||||
jnz 3b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 4 byte chunks
|
||||
/
|
||||
4: cmpl $4,%ecx
|
||||
jb 5f
|
||||
movl %eax,(%edx)
|
||||
addl $4,%edx
|
||||
subl $4,%ecx
|
||||
jnz 4b
|
||||
ret
|
||||
/
|
||||
/ do 1 byte chunks -- this appears to be faster than a loop
|
||||
/
|
||||
.align 4
|
||||
jtab: .long do0
|
||||
.long do1
|
||||
.long do2
|
||||
.long do3
|
||||
|
||||
.align 4
|
||||
5: jmp jtab(,%ecx,4)
|
||||
|
||||
.align 2
|
||||
do3: movb $0,(%edx)
|
||||
incl %edx
|
||||
movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do2: movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do1: movb $0,(%edx)
|
||||
do0: ret
|
||||
|
||||
#endif
|
||||
|
||||
/* fillw(pat, base, cnt) */
|
||||
ENTRY(fillw)
|
||||
|
@ -32,7 +32,7 @@
|
||||
*
|
||||
* from tahoe: in_cksum.c 1.2 86/01/05
|
||||
* from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
|
||||
* $Id: in_cksum.c,v 1.3 1993/11/25 01:30:53 wollman Exp $
|
||||
* $Id: in_cksum.c,v 1.4 1993/12/19 00:50:02 wollman Exp $
|
||||
*/
|
||||
|
||||
#include "param.h"
|
||||
@ -56,9 +56,10 @@
|
||||
* Thanks to gcc we don't have to guess
|
||||
* which registers contain sum & w.
|
||||
*/
|
||||
#define CLC asm("clc")
|
||||
#define ADD(n) asm("adcl " #n "(%2), %0": "=r"(sum): "0"(sum), "r"(w))
|
||||
#define MOP asm("adcl $0, %0": "=r"(sum): "0"(sum))
|
||||
#define ADD(n) asm("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
|
||||
#define ADDC(n) asm("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w))
|
||||
#define LOAD(n) asm volatile("movb " #n "(%1), %0" : "=r" (junk) : "r" (w))
|
||||
#define MOP asm("adcl $0, %0" : "=r" (sum) : "0" (sum))
|
||||
|
||||
int
|
||||
in_cksum(m, len)
|
||||
@ -113,29 +114,91 @@ in_cksum(m, len)
|
||||
mlen -= 2;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Advance to a 486 cache line boundary.
|
||||
*/
|
||||
if (4 & (int) w && mlen >= 4) {
|
||||
ADD(0);
|
||||
MOP;
|
||||
w += 2;
|
||||
mlen -= 4;
|
||||
}
|
||||
if (8 & (int) w && mlen >= 8) {
|
||||
ADD(0);
|
||||
ADDC(4);
|
||||
MOP;
|
||||
w += 4;
|
||||
mlen -= 8;
|
||||
}
|
||||
/*
|
||||
* Do as much of the checksum as possible 32 bits at a time.
|
||||
* In fact, this loop is unrolled to make overhead from
|
||||
* branches &c small.
|
||||
*/
|
||||
mlen -= 1;
|
||||
while ((mlen -= 32) >= 0) {
|
||||
u_char junk;
|
||||
/*
|
||||
* Clear the carry flag, add with carry 16 words
|
||||
* and fold-in last carry by adding a 0 with carry.
|
||||
* Add with carry 16 words and fold in the last
|
||||
* carry by adding a 0 with carry.
|
||||
*
|
||||
* The early ADD(16) and the LOAD(32) are to load
|
||||
* the next 2 cache lines in advance on 486's. The
|
||||
* 486 has a penalty of 2 clock cycles for loading
|
||||
* a cache line, plus whatever time the external
|
||||
* memory takes to load the first word(s) addressed.
|
||||
* These penalties are unavoidable. Subsequent
|
||||
* accesses to a cache line being loaded (and to
|
||||
* other external memory?) are delayed until the
|
||||
* whole load finishes. These penalties are mostly
|
||||
* avoided by not accessing external memory for
|
||||
* 8 cycles after the ADD(16) and 12 cycles after
|
||||
* the LOAD(32). The loop terminates when mlen
|
||||
* is initially 33 (not 32) to guarantee that
|
||||
* the LOAD(32) is within bounds.
|
||||
*/
|
||||
CLC;
|
||||
ADD(0); ADD(4); ADD(8); ADD(12);
|
||||
ADD(16); ADD(20); ADD(24); ADD(28);
|
||||
MOP; w += 16;
|
||||
ADD(16);
|
||||
ADDC(0);
|
||||
ADDC(4);
|
||||
ADDC(8);
|
||||
ADDC(12);
|
||||
LOAD(32);
|
||||
ADDC(20);
|
||||
ADDC(24);
|
||||
ADDC(28);
|
||||
MOP;
|
||||
w += 16;
|
||||
}
|
||||
mlen += 32;
|
||||
while ((mlen -= 8) >= 0) {
|
||||
CLC;
|
||||
ADD(0); ADD(4);
|
||||
mlen += 32 + 1;
|
||||
if (mlen >= 32) {
|
||||
ADD(16);
|
||||
ADDC(0);
|
||||
ADDC(4);
|
||||
ADDC(8);
|
||||
ADDC(12);
|
||||
ADDC(20);
|
||||
ADDC(24);
|
||||
ADDC(28);
|
||||
MOP;
|
||||
w += 16;
|
||||
mlen -= 32;
|
||||
}
|
||||
if (mlen >= 16) {
|
||||
ADD(0);
|
||||
ADDC(4);
|
||||
ADDC(8);
|
||||
ADDC(12);
|
||||
MOP;
|
||||
w += 8;
|
||||
mlen -= 16;
|
||||
}
|
||||
if (mlen >= 8) {
|
||||
ADD(0);
|
||||
ADDC(4);
|
||||
MOP;
|
||||
w += 4;
|
||||
mlen -= 8;
|
||||
}
|
||||
mlen += 8;
|
||||
if (mlen == 0 && byte_swapped == 0)
|
||||
continue; /* worth 1% maybe ?? */
|
||||
REDUCE;
|
||||
@ -172,4 +235,3 @@ in_cksum(m, len)
|
||||
REDUCE;
|
||||
return (~sum & 0xffff);
|
||||
}
|
||||
|
||||
|
@ -35,7 +35,7 @@
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
|
||||
* $Id: machdep.c,v 1.36 1994/02/08 12:58:44 davidg Exp $
|
||||
* $Id: machdep.c,v 1.37 1994/02/24 00:18:04 hsu Exp $
|
||||
*/
|
||||
|
||||
#include "npx.h"
|
||||
@ -709,7 +709,7 @@ boot(arghowto)
|
||||
DELAY (100000); /* wait 100ms for printf's to complete */
|
||||
cpu_reset();
|
||||
for(;;) ;
|
||||
/*NOTREACHED*/
|
||||
/* NOTREACHED */
|
||||
}
|
||||
|
||||
unsigned long dumpmag = 0x8fca0101UL; /* magic number for savecore */
|
||||
@ -1133,7 +1133,7 @@ init386(first)
|
||||
#ifndef LARGEMEM
|
||||
if (biosextmem > 65536) {
|
||||
panic("extended memory beyond limit of 64MB");
|
||||
/* NOT REACHED */
|
||||
/* NOTREACHED */
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -30,7 +30,7 @@
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $Id: support.s,v 1.3 1994/01/31 23:47:29 davidg Exp $
|
||||
* $Id: support.s,v 1.4 1994/02/01 04:09:07 davidg Exp $
|
||||
*/
|
||||
|
||||
#include "assym.s" /* system definitions */
|
||||
@ -151,8 +151,20 @@ ENTRY(outsw) /* outsw(port, addr, cnt) */
|
||||
/*
|
||||
* bcopy family
|
||||
*/
|
||||
/* void bzero(void *base, u_int cnt) */
|
||||
/*
|
||||
* void bzero(void *base, u_int cnt)
|
||||
* Special code for I486 because stosl uses lots
|
||||
* of clocks. Makes little or no difference on DX2 type
|
||||
* machines, but stosl is about 1/2 as fast as
|
||||
* memory moves on standard DX !!!!!
|
||||
*/
|
||||
|
||||
ENTRY(bzero)
|
||||
#if defined(I486_CPU) && (defined(I386_CPU) || defined(I586_CPU))
|
||||
cmpl $CPUCLASS_486,_cpu_class
|
||||
jz 1f
|
||||
#endif
|
||||
#if defined(I386_CPU) || defined(I586_CPU)
|
||||
pushl %edi
|
||||
movl 8(%esp),%edi
|
||||
movl 12(%esp),%ecx
|
||||
@ -167,6 +179,90 @@ ENTRY(bzero)
|
||||
stosb
|
||||
popl %edi
|
||||
ret
|
||||
.align 4
|
||||
#endif
|
||||
#if defined(I486_CPU)
|
||||
1:
|
||||
movl 4(%esp),%edx
|
||||
movl 8(%esp),%ecx
|
||||
xorl %eax,%eax
|
||||
/
|
||||
/ do 64 byte chunks first
|
||||
/
|
||||
2:
|
||||
cmpl $64,%ecx
|
||||
jb 3f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
movl %eax,16(%edx)
|
||||
movl %eax,20(%edx)
|
||||
movl %eax,24(%edx)
|
||||
movl %eax,28(%edx)
|
||||
movl %eax,32(%edx)
|
||||
movl %eax,36(%edx)
|
||||
movl %eax,40(%edx)
|
||||
movl %eax,44(%edx)
|
||||
movl %eax,48(%edx)
|
||||
movl %eax,52(%edx)
|
||||
movl %eax,56(%edx)
|
||||
movl %eax,60(%edx)
|
||||
addl $64,%edx
|
||||
subl $64,%ecx
|
||||
jnz 2b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 16 byte chunks
|
||||
/
|
||||
3:
|
||||
cmpl $16,%ecx
|
||||
jb 4f
|
||||
movl %eax,(%edx)
|
||||
movl %eax,4(%edx)
|
||||
movl %eax,8(%edx)
|
||||
movl %eax,12(%edx)
|
||||
addl $16,%edx
|
||||
subl $16,%ecx
|
||||
jnz 3b
|
||||
ret
|
||||
.align 4
|
||||
/
|
||||
/ do 4 byte chunks
|
||||
/
|
||||
4: cmpl $4,%ecx
|
||||
jb 5f
|
||||
movl %eax,(%edx)
|
||||
addl $4,%edx
|
||||
subl $4,%ecx
|
||||
jnz 4b
|
||||
ret
|
||||
/
|
||||
/ do 1 byte chunks -- this appears to be faster than a loop
|
||||
/
|
||||
.align 4
|
||||
jtab: .long do0
|
||||
.long do1
|
||||
.long do2
|
||||
.long do3
|
||||
|
||||
.align 4
|
||||
5: jmp jtab(,%ecx,4)
|
||||
|
||||
.align 2
|
||||
do3: movb $0,(%edx)
|
||||
incl %edx
|
||||
movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do2: movw $0,(%edx)
|
||||
ret
|
||||
.align 2
|
||||
do1: movb $0,(%edx)
|
||||
do0: ret
|
||||
|
||||
#endif
|
||||
|
||||
/* fillw(pat, base, cnt) */
|
||||
ENTRY(fillw)
|
||||
|
Loading…
Reference in New Issue
Block a user