From d7629dff3b8ec2d5fdeac67fd6b79eb51dab8fb9 Mon Sep 17 00:00:00 2001 From: Satoshi Asami Date: Thu, 13 Jun 1996 07:17:21 +0000 Subject: [PATCH] A fast memory copy for Pentiums using floating point registers. It is called from copyin and copyout. The new routine is conditioned on I586_CPU and I586_FAST_BCOPY, so you need options "I586_FAST_BCOPY" (quotes essenstial) in your kernel config file. Also, if you have other kernel types configured in your kernel, an additional check to make sure it is running on a Pentium is inserted. (It is not clear why it doesn't help on P6s, it may be just that the Orion chipset doesn't prefetch as efficiently as Tritons and friends.) Bruce can now hack this away. :) --- sys/amd64/amd64/support.S | 192 +++++++++++++++++++++++++++++++++++++- sys/amd64/amd64/support.s | 192 +++++++++++++++++++++++++++++++++++++- sys/amd64/amd64/trap.c | 10 +- sys/i386/i386/support.s | 192 +++++++++++++++++++++++++++++++++++++- sys/i386/i386/trap.c | 10 +- sys/kern/subr_trap.c | 10 +- 6 files changed, 600 insertions(+), 6 deletions(-) diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index 0d5bc5dd3f7c..751e923b7dfd 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -30,13 +30,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.35 1996/05/03 21:01:00 phk Exp $ + * $Id: support.s,v 1.36 1996/05/31 01:08:03 peter Exp $ */ #include "assym.s" /* system definitions */ #include "errno.h" /* error return codes */ #include "machine/asmacros.h" /* miscellaneous asm macros */ #include "machine/cputypes.h" /* types of CPUs */ +#include "machine/specialreg.h" #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 @@ -453,6 +454,21 @@ ENTRY(copyout) /* copyout(from_kernel, to_user, len) */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyout + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyout +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_CPU && I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +516,21 @@ ENTRY(copyin) cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyin + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyin +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +541,10 @@ ENTRY(copyin) rep movsb +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + ALIGN_TEXT +done_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -526,6 +561,161 @@ copyin_fault: movl $EFAULT,%eax ret +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. storage + */ + ALIGN_TEXT +fastmove: + cmpl $63,%ecx + jbe 8f + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz 8f + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz 8f + + pushl %ebp + movl %esp,%ebp + subl $PCB_SAVEFPU_SIZE,%esp + +/* if (npxproc != NULL) { */ + cmpl $0,_npxproc + je 6f +/* fnsave(&curpcb->pcb_savefpu); */ + movl _curpcb,%eax + fnsave PCB_SAVEFPU(%eax) +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +6: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. + */ +/* tmp = curpcb->pcb_savefpu; */ + pushl %edi + pushl %esi + pushl %ecx + leal -PCB_SAVEFPU_SIZE(%ebp),%edi + movl _curpcb,%esi + addl $PCB_SAVEFPU,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi +/* stop_emulating(); */ + clts +/* npxproc = curproc; */ + movl _curproc,%eax + movl %eax,_npxproc +4: + pushl %ecx + cmpl $1792,%ecx + jbe 2f + movl $1792,%ecx +2: + subl %ecx,0(%esp) + cmpl $256,%ecx + jb 5f + pushl %esi + pushl %ecx + ALIGN_TEXT +3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + popl %ecx + popl %esi +5: + ALIGN_TEXT +7: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $-64,%ecx + addl $64,%esi + addl $64,%edi + cmpl $63,%ecx + ja 7b + popl %eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + +/* curpcb->pcb_savefpu = tmp; */ + pushl %edi + pushl %esi + pushl %ecx + movl _curpcb,%edi + addl $PCB_SAVEFPU,%edi + leal -PCB_SAVEFPU_SIZE(%ebp),%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi + +/* start_emulating(); */ + smsw %ax + orb $CR0_TS,%al + lmsw %ax +/* npxproc = NULL; */ + movl $0,_npxproc + movl %ebp,%esp + popl %ebp + + ALIGN_TEXT +8: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_CPU && I586_FAST_BCOPY */ + /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ diff --git a/sys/amd64/amd64/support.s b/sys/amd64/amd64/support.s index 0d5bc5dd3f7c..751e923b7dfd 100644 --- a/sys/amd64/amd64/support.s +++ b/sys/amd64/amd64/support.s @@ -30,13 +30,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.35 1996/05/03 21:01:00 phk Exp $ + * $Id: support.s,v 1.36 1996/05/31 01:08:03 peter Exp $ */ #include "assym.s" /* system definitions */ #include "errno.h" /* error return codes */ #include "machine/asmacros.h" /* miscellaneous asm macros */ #include "machine/cputypes.h" /* types of CPUs */ +#include "machine/specialreg.h" #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 @@ -453,6 +454,21 @@ ENTRY(copyout) /* copyout(from_kernel, to_user, len) */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyout + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyout +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_CPU && I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +516,21 @@ ENTRY(copyin) cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyin + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyin +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +541,10 @@ ENTRY(copyin) rep movsb +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + ALIGN_TEXT +done_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -526,6 +561,161 @@ copyin_fault: movl $EFAULT,%eax ret +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. storage + */ + ALIGN_TEXT +fastmove: + cmpl $63,%ecx + jbe 8f + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz 8f + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz 8f + + pushl %ebp + movl %esp,%ebp + subl $PCB_SAVEFPU_SIZE,%esp + +/* if (npxproc != NULL) { */ + cmpl $0,_npxproc + je 6f +/* fnsave(&curpcb->pcb_savefpu); */ + movl _curpcb,%eax + fnsave PCB_SAVEFPU(%eax) +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +6: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. + */ +/* tmp = curpcb->pcb_savefpu; */ + pushl %edi + pushl %esi + pushl %ecx + leal -PCB_SAVEFPU_SIZE(%ebp),%edi + movl _curpcb,%esi + addl $PCB_SAVEFPU,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi +/* stop_emulating(); */ + clts +/* npxproc = curproc; */ + movl _curproc,%eax + movl %eax,_npxproc +4: + pushl %ecx + cmpl $1792,%ecx + jbe 2f + movl $1792,%ecx +2: + subl %ecx,0(%esp) + cmpl $256,%ecx + jb 5f + pushl %esi + pushl %ecx + ALIGN_TEXT +3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + popl %ecx + popl %esi +5: + ALIGN_TEXT +7: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $-64,%ecx + addl $64,%esi + addl $64,%edi + cmpl $63,%ecx + ja 7b + popl %eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + +/* curpcb->pcb_savefpu = tmp; */ + pushl %edi + pushl %esi + pushl %ecx + movl _curpcb,%edi + addl $PCB_SAVEFPU,%edi + leal -PCB_SAVEFPU_SIZE(%ebp),%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi + +/* start_emulating(); */ + smsw %ax + orb $CR0_TS,%al + lmsw %ax +/* npxproc = NULL; */ + movl $0,_npxproc + movl %ebp,%esp + popl %ebp + + ALIGN_TEXT +8: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_CPU && I586_FAST_BCOPY */ + /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 7cf89d17dce2..224a6f4409aa 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.76 1996/05/18 03:36:19 dyson Exp $ + * $Id: trap.c,v 1.77 1996/06/12 05:02:54 gpalmer Exp $ */ /* @@ -319,6 +319,14 @@ trap(frame) (void) trap_pfault(&frame, FALSE); return; + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif /* NNPX > 0 */ + break; + case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s index 0d5bc5dd3f7c..751e923b7dfd 100644 --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -30,13 +30,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: support.s,v 1.35 1996/05/03 21:01:00 phk Exp $ + * $Id: support.s,v 1.36 1996/05/31 01:08:03 peter Exp $ */ #include "assym.s" /* system definitions */ #include "errno.h" /* error return codes */ #include "machine/asmacros.h" /* miscellaneous asm macros */ #include "machine/cputypes.h" /* types of CPUs */ +#include "machine/specialreg.h" #define KDSEL 0x10 /* kernel data selector */ #define IDXSHIFT 10 @@ -453,6 +454,21 @@ ENTRY(copyout) /* copyout(from_kernel, to_user, len) */ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyout + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyout +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_CPU && I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +516,21 @@ ENTRY(copyin) cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + cmpl $1024,%ecx + jbe slow_copyin + +#if defined(I386_CPU) || defined(I486_CPU) || defined(I686_CPU) + cmpl $CPUCLASS_586,_cpu_class + jne slow_copyin +#endif /* I386_CPU || I486_CPU || I686_CPU */ + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +541,10 @@ ENTRY(copyin) rep movsb +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) + ALIGN_TEXT +done_copyin: +#endif /* I586_CPU && I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -526,6 +561,161 @@ copyin_fault: movl $EFAULT,%eax ret +#if defined(I586_CPU) && defined(I586_FAST_BCOPY) +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. storage + */ + ALIGN_TEXT +fastmove: + cmpl $63,%ecx + jbe 8f + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz 8f + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz 8f + + pushl %ebp + movl %esp,%ebp + subl $PCB_SAVEFPU_SIZE,%esp + +/* if (npxproc != NULL) { */ + cmpl $0,_npxproc + je 6f +/* fnsave(&curpcb->pcb_savefpu); */ + movl _curpcb,%eax + fnsave PCB_SAVEFPU(%eax) +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +6: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. + */ +/* tmp = curpcb->pcb_savefpu; */ + pushl %edi + pushl %esi + pushl %ecx + leal -PCB_SAVEFPU_SIZE(%ebp),%edi + movl _curpcb,%esi + addl $PCB_SAVEFPU,%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi +/* stop_emulating(); */ + clts +/* npxproc = curproc; */ + movl _curproc,%eax + movl %eax,_npxproc +4: + pushl %ecx + cmpl $1792,%ecx + jbe 2f + movl $1792,%ecx +2: + subl %ecx,0(%esp) + cmpl $256,%ecx + jb 5f + pushl %esi + pushl %ecx + ALIGN_TEXT +3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + popl %ecx + popl %esi +5: + ALIGN_TEXT +7: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $-64,%ecx + addl $64,%esi + addl $64,%edi + cmpl $63,%ecx + ja 7b + popl %eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + +/* curpcb->pcb_savefpu = tmp; */ + pushl %edi + pushl %esi + pushl %ecx + movl _curpcb,%edi + addl $PCB_SAVEFPU,%edi + leal -PCB_SAVEFPU_SIZE(%ebp),%esi + cld + movl $PCB_SAVEFPU_SIZE>>2,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi + +/* start_emulating(); */ + smsw %ax + orb $CR0_TS,%al + lmsw %ax +/* npxproc = NULL; */ + movl $0,_npxproc + movl %ebp,%esp + popl %ebp + + ALIGN_TEXT +8: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_CPU && I586_FAST_BCOPY */ + /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory */ diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 7cf89d17dce2..224a6f4409aa 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.76 1996/05/18 03:36:19 dyson Exp $ + * $Id: trap.c,v 1.77 1996/06/12 05:02:54 gpalmer Exp $ */ /* @@ -319,6 +319,14 @@ trap(frame) (void) trap_pfault(&frame, FALSE); return; + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif /* NNPX > 0 */ + break; + case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 7cf89d17dce2..224a6f4409aa 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -35,7 +35,7 @@ * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.76 1996/05/18 03:36:19 dyson Exp $ + * $Id: trap.c,v 1.77 1996/06/12 05:02:54 gpalmer Exp $ */ /* @@ -319,6 +319,14 @@ trap(frame) (void) trap_pfault(&frame, FALSE); return; + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif /* NNPX > 0 */ + break; + case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /*