From cbc82634340082d9ec72b57c4c68a5fb91cf5e15 Mon Sep 17 00:00:00 2001 From: jake Date: Sun, 30 Sep 2001 19:50:39 +0000 Subject: [PATCH] Optimize bcopy and bzero etc to use 64 bit loads and stores if possible. Handle overlap in bcopy. Add routines for copying and zeroing pages using physical addresses directly. Remove all the hacks to account for calling the firmware on its own trap table, we use the kernel trap table. There is still a problem with OF_exit(). --- sys/sparc64/sparc64/support.S | 359 +++++++++++++++++++++++----------- sys/sparc64/sparc64/support.s | 359 +++++++++++++++++++++++----------- 2 files changed, 480 insertions(+), 238 deletions(-) diff --git a/sys/sparc64/sparc64/support.S b/sys/sparc64/sparc64/support.S index 41469f51a791..bbd2d132bb67 100644 --- a/sys/sparc64/sparc64/support.S +++ b/sys/sparc64/sparc64/support.S @@ -33,62 +33,184 @@ #include "assym.s" -#define E +#define E /* empty */ +/* + * Generate load and store instructions for the corresponding width and asi + * (or not). Note that we want to evaluate the macro args before + * concatenating, so that E really turns into nothing. + */ #define _LD(w, a) ld ## w ## a #define _ST(w, a) st ## w ## a #define LD(w, a) _LD(w, a) #define ST(w, a) _ST(w, a) -#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \ - brz,pn len, 2f ; \ - mov len, %o3 ; \ -1: LD(ub, sa) [src] sasi, %o4 ; \ - ST(b, da) %o4, [dst] dasi ; \ - dec %o3 ; \ - inc src ; \ - brnz,pt %o3, 1b ; \ - inc dst ; \ -2: - -#define BCOPY(src, dst, len) \ - _BCOPY(src, dst, len, E, E, E, E) - -#define COPYIN(uaddr, kaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(uaddr, kaddr, len, a, %asi, E, E) - -#define COPYOUT(kaddr, uaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(kaddr, uaddr, len, E, E, a, %asi) +/* + * Common code for copy routines. + * + * We use large macros to generate functions for each of the copy routines. + * This allows the load and store instructions to be generated for the right + * operation, asi or not. 
It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty). It is up to the caller to set %asi.
+ */
+/*
+ * ASI independent implementation of copystr(9).
+ * Used to implement copyinstr() and copystr().  If done is non-NULL, the
+ * number of bytes copied (terminating NUL included) is stored to *done.
+ * Return value is in %g1: 0, or ENAMETOOLONG if len ran out before the NUL.
+ */
 #define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
-	clr	%o4 ; \
-	clr	%o5 ; \
-1:	LD(ub, sa) [src] sasi, %g1 ; \
+	brz	len, 2f ; /* len == 0: ENAMETOOLONG, *done = 0 */ \
+	 mov	src, %g2 ; /* delay slot: remember start of src */ \
+1:	deccc	1, len ; \
+	bl,a,pn	%xcc, 2f ; /* len exhausted before the NUL was seen */ \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
 	ST(b, da) %g1, [dst] dasi ; \
-	brz,pn	%g1, 2f ; \
-	 inc	%o4 ; \
-	dec	len ; \
-	inc	src ; \
-	brgz,pt	len, 1b ; \
+	brz,pn	%g1, 3f ; /* copied the NUL: %g1 == 0 is the result */ \
+	 inc	src ; \
+	b	%xcc, 1b ; \
 	 inc	dst ; \
-	mov	ENAMETOOLONG, %o5 ; \
-2:	brnz,a	done, 3f ; \
-	 stx	%o4, [done] ; \
-3:
+2:	mov	ENAMETOOLONG, %g1 ; \
+3:	sub	src, %g2, %g2 ; /* bytes consumed from src */ \
+	brnz,a	done, 4f ; \
+	 stx	%g2, [done] ; /* annulled when done == NULL */ \
+4:
-#define	COPYSTR(dst, src, len, done) \
-	_COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * If the pattern is non-zero, duplicate it to fill 64 bits.
+ * Store bytes until dst is 8-byte aligned, then store 8 bytes.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ *
+ * XXX bzero() should be implemented as
+ *	#define bzero(dst, len) (void)memset((dst), 0, (len))
+ *	if at all.
+ */
+#define	_MEMSET(dst, pat, len, da, dasi) \
+	brlez,pn len, 5f ; /* nothing to store when len <= 0 */ \
+	 and	pat, 0xff, pat ; /* delay slot: keep only the low byte */ \
+	brz,pt	pat, 1f ; /* an all-zero pattern needs no replication */ \
+	 sllx	pat, 8, %g1 ; \
+	or	pat, %g1, pat ; /* pattern now fills 16 bits */ \
+	sllx	pat, 16, %g1 ; \
+	or	pat, %g1, pat ; /* ... 32 bits */ \
+	sllx	pat, 32, %g1 ; \
+	or	pat, %g1, pat ; /* ... 64 bits */ \
+	.align 16 ; \
+1:	deccc	1, len ; /* loop 1: single bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 5f ; /* len exhausted */ \
+	 btst	7, dst ; /* delay slot: test the low 3 address bits */ \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: give back the byte just deducted */ \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 1b ; \
+	 inc	dst ; \
+	.align 16 ; \
+2:	deccc	32, len ; /* loop 2: four 8-byte stores per pass */ \
+	bl,a,pn	%xcc, 3f ; \
+	 inc	32, len ; /* fewer than 32 bytes left: restore len */ \
+	ST(x, da) pat, [dst] dasi ; \
+	ST(x, da) pat, [dst + 8] dasi ; \
+	ST(x, da) pat, [dst + 16] dasi ; \
+	ST(x, da) pat, [dst + 24] dasi ; \
+	b	%xcc, 2b ; \
+	 inc	32, dst ; \
+	.align 16 ; \
+3:	deccc	8, len ; /* loop 3: one 8-byte store per pass */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	8, len ; /* fewer than 8 bytes left: restore len */ \
+	ST(x, da) pat, [dst] dasi ; \
+	b	%xcc, 3b ; \
+	 inc	8, dst ; \
+	.align 16 ; \
+4:	deccc	1, len ; /* loop 4: remaining tail bytes */ \
+	bl,a,pn	%xcc, 5f ; \
+	 nop ; \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 4b ; \
+	 inc	1, dst ; \
+5:
-#define	COPYINSTR(uaddr, kaddr, len, done) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
+ * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
+ * case could be optimized, but it is expected that this is the uncommon
+ * case and of questionable value. The code to do so is also rather large
+ * and ugly.
+ * It has yet to be determined how much unrolling is beneficial.
+ *
+ * XXX bcopy() must also check for overlap. This is stupid.
+ * XXX bcopy() should be implemented as
+ *	#define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ *	if at all.
+ */ +#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \ +1: deccc 1, len ; \ + bl,pn %xcc, 6f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc 1, src ; \ + b %xcc, 1b ; \ + inc 1, dst ; \ + .align 16 ; \ +2: btst 7, src ; \ + bz,a,pt %xcc, 3f ; \ + nop ; \ + b,a %xcc, 5f ; \ + .align 16 ; \ +3: deccc 32, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 32, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + LD(x, sa) [src + 8] sasi, %g2 ; \ + LD(x, sa) [src + 16] sasi, %g3 ; \ + LD(x, sa) [src + 24] sasi, %g4 ; \ + ST(x, da) %g1, [dst] dasi ; \ + ST(x, da) %g2, [dst + 8] dasi ; \ + ST(x, da) %g3, [dst + 16] dasi ; \ + ST(x, da) %g4, [dst + 24] dasi ; \ + inc 32, src ; \ + b %xcc, 3b ; \ + inc 32, dst ; \ + .align 16 ; \ +4: deccc 8, len ; \ + bl,a,pn %xcc, 5f ; \ + inc 8, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + ST(x, da) %g1, [dst] dasi ; \ + inc 8, src ; \ + b %xcc, 4b ; \ + inc 8, dst ; \ + .align 16 ; \ +5: deccc 1, len ; \ + bl,a,pn %xcc, 6f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc src ; \ + b %xcc, 5b ; \ + inc dst ; \ +6: #define CATCH_SETUP(label) \ setx label, %g2, %g1 ; \ - ldx [PCPU(CURPCB)], %g6 ; \ + ldx [PCPU(CURTHREAD)], %g6 ; \ + ldx [%g6 + TD_PCB], %g6 ; \ stx %g1, [%g6 + PCB_ONFAULT] ; #define CATCH_END() \ @@ -119,7 +241,7 @@ SU_ALIGNED(storer, label) /* - * void bcmp(void *b, size_t len) + * int bcmp(const void *b1, const void *b2, size_t len) */ ENTRY(bcmp) brz,pn %o2, 2f @@ -127,7 +249,7 @@ ENTRY(bcmp) 1: ldub [%o0 + %o3], %o4 ldub [%o1 + %o3], %o5 cmp %o4, %o5 - bne,pn %xcc, 1f + bne,pn %xcc, 2f inc %o3 deccc %o2 bne,pt %xcc, 1b @@ -139,45 +261,89 @@ END(bcmp) /* * void bcopy(const void *src, void *dst, size_t len) */ +ENTRY(ovbcopy) ENTRY(bcopy) - BCOPY(%o0, %o1, %o2) + /* + * Check for overlap, and copy backwards if so. + */ + sub %o1, %o0, %g1 + cmp %g1, %o2 + bgeu,a,pt %xcc, 3f + nop + + /* + * Copy backwards. 
+ */ + add %o0, %o2, %o0 + add %o1, %o2, %o1 +1: deccc 1, %o2 + bl,a,pn %xcc, 2f + nop + dec 1, %o0 + ldub [%o0], %g1 + dec 1, %o1 + b %xcc, 1b + stb %g1, [%o1] +2: retl + nop + + /* + * Do the fast version. + */ +3: _MEMCPY(%o1, %o0, %o2, E, E, E, E) retl nop END(bcopy) -/* - * void ovbcopy(const void *src, void *dst, size_t len) - * XXX handle overlap... - */ -ENTRY(ovbcopy) - BCOPY(%o0, %o1, %o2) - retl - nop -END(ovbcopy) - /* * void bzero(void *b, size_t len) */ ENTRY(bzero) - brz,pn %o1, 1f - nop -1: deccc %o1 - stb %g0, [%o0] - bne,pt %xcc, 1b - inc %o0 -2: retl + _MEMSET(%o0, %g0, %o1, E, E) + retl nop END(bzero) +/* + * void physzero(vm_offset_t pa, size_t len) + */ +ENTRY(physzero) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMSET(%o0, %g0, %o1, a, %asi) + retl + nop +END(physzero) + +/* + * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len) + */ +ENTRY(physcopy) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) + retl + nop +END(physcopy) + /* * void *memcpy(void *dst, const void *src, size_t len) */ ENTRY(memcpy) - BCOPY(%o1, %o0, %o2) + mov %o0, %o3 + _MEMCPY(%o3, %o1, %o2, E, E, E, E) retl nop END(memcpy) +/* + * void *memset(void *b, int c, size_t len) + */ +ENTRY(memset) + mov %o0, %o3 + _MEMSET(%o3, %o1, %o2, E, E) + retl + nop +END(memset) + /* * int copyin(const void *uaddr, void *kaddr, size_t len) */ @@ -191,7 +357,8 @@ ENTRY(copyin) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYIN(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, E, E, a, %asi) CATCH_END() retl clr %o0 @@ -211,10 +378,11 @@ ENTRY(copyinstr) stx %o3, [%g1 + KTR_PARM4] 9: #endif - COPYINSTR(%o0, %o1, %o2, %o3) + wr %g0, ASI_AIUP, %asi + _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E) CATCH_END() retl - mov %o5, %o0 + mov %g1, %o0 END(copyinstr) /* @@ -230,7 +398,8 @@ ENTRY(copyout) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYOUT(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, E, E) CATCH_END() retl clr %o0 @@ -250,9 
+419,9 @@ END(copyout) * int copystr(const void *src, void *dst, size_t len, size_t *done) */ ENTRY(copystr) - COPYSTR(%o0, %o1, %o2, %o3) + _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E) retl - mov %o5, %o0 + mov %g1, %o0 END(copystr) /* @@ -325,7 +494,6 @@ ENTRY(fsbail) .Lfsalign: retl mov -1, %o0 -END(fsbail) ENTRY(longjmp) set 1, %g3 @@ -354,65 +522,18 @@ ENTRY(setjmp) clr %o0 END(setjmp) -/* - * Temporary stack for calling into the firmware. We need to setup one, because - * the MMU mapping for our stack page may be lost. When the firmware tries to - * spill the last window (the others are flushed before), this results in an - * DMMU miss trap, which is fatal with the firmware trap handlers installed. - * Additionally, it seems that the firmware does not immediately switch to an - * own stack (or maybe never?), therefore more space needs to be reserved. - * I hope this is sufficient now. - */ - .align 4 -DATA(ofwstack) - .rept CCFSZ * 8 - .byte 0 - .endr -ofwstack_last: - .rept CCFSZ - .byte 0 - .endr -END(ofwstack) - /* * void openfirmware(cell_t args[]) */ ENTRY(openfirmware) - /* - * Disable interrupts. The firmware should not deal with our interrupts - * anyway, and the temporary stack is not large enough to hold the stack - * footprint of the interrrupt handling. 
- */ - rdpr %pstate, %o3 - andn %o3, PSTATE_IE, %o1 - wrpr %o1, 0, %pstate - setx ofwstack_last - SPOFF, %o1, %o2 - save %o2, 0, %sp - flushw - rdpr %tl, %l1 - rdpr %tba, %l2 - mov AA_DMMU_PCXR, %l3 - ldxa [%l3] ASI_DMMU, %l4 - stxa %g0, [%l3] ASI_DMMU - membar #Sync - flush %sp - setx ofw_tba, %l7, %l5 - ldx [%l5], %l5 + save %sp, -CCFSZ, %sp setx ofw_vec, %l7, %l6 ldx [%l6], %l6 rdpr %pil, %l7 - wrpr %g0, 14, %pil - wrpr %l5, 0, %tba - wrpr %g0, 0, %tl + wrpr %g0, PIL_TICK, %pil call %l6 mov %i0, %o0 - wrpr %l1, 0, %tl - wrpr %l2, 0, %tba - stxa %l4, [%l3] ASI_DMMU wrpr %l7, 0, %pil - membar #Sync - flush %sp - restore - retl - wrpr %o3, 0, %pstate + ret + restore %o0, %g0, %o0 END(openfirmware) diff --git a/sys/sparc64/sparc64/support.s b/sys/sparc64/sparc64/support.s index 41469f51a791..bbd2d132bb67 100644 --- a/sys/sparc64/sparc64/support.s +++ b/sys/sparc64/sparc64/support.s @@ -33,62 +33,184 @@ #include "assym.s" -#define E +#define E /* empty */ +/* + * Generate load and store instructions for the corresponding width and asi + * (or not). Note that we want to evaluate the macro args before + * concatenating, so that E really turns into nothing. + */ #define _LD(w, a) ld ## w ## a #define _ST(w, a) st ## w ## a #define LD(w, a) _LD(w, a) #define ST(w, a) _ST(w, a) -#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \ - brz,pn len, 2f ; \ - mov len, %o3 ; \ -1: LD(ub, sa) [src] sasi, %o4 ; \ - ST(b, da) %o4, [dst] dasi ; \ - dec %o3 ; \ - inc src ; \ - brnz,pt %o3, 1b ; \ - inc dst ; \ -2: - -#define BCOPY(src, dst, len) \ - _BCOPY(src, dst, len, E, E, E, E) - -#define COPYIN(uaddr, kaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(uaddr, kaddr, len, a, %asi, E, E) - -#define COPYOUT(kaddr, uaddr, len) \ - wr %g0, ASI_AIUP, %asi ; \ - _BCOPY(kaddr, uaddr, len, E, E, a, %asi) +/* + * Common code for copy routines. + * + * We use large macros to generate functions for each of the copy routines. 
+ * This allows the load and store instructions to be generated for the right
+ * operation, asi or not. It is possible to write an asi independent function
+ * but this would require 2 expensive wrs in the main loop to switch %asi.
+ * It would also screw up profiling (if we ever get it), but may save some I$.
+ * We assume that either one of dasi and sasi is empty, or that they are both
+ * the same (empty or non-empty). It is up to the caller to set %asi.
+ */
+/*
+ * ASI independent implementation of copystr(9).
+ * Used to implement copyinstr() and copystr().  If done is non-NULL, the
+ * number of bytes copied (terminating NUL included) is stored to *done.
+ * Return value is in %g1: 0, or ENAMETOOLONG if len ran out before the NUL.
+ */
 #define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
-	clr	%o4 ; \
-	clr	%o5 ; \
-1:	LD(ub, sa) [src] sasi, %g1 ; \
+	brz	len, 2f ; /* len == 0: ENAMETOOLONG, *done = 0 */ \
+	 mov	src, %g2 ; /* delay slot: remember start of src */ \
+1:	deccc	1, len ; \
+	bl,a,pn	%xcc, 2f ; /* len exhausted before the NUL was seen */ \
+	 nop ; \
+	LD(ub, sa) [src] sasi, %g1 ; \
 	ST(b, da) %g1, [dst] dasi ; \
-	brz,pn	%g1, 2f ; \
-	 inc	%o4 ; \
-	dec	len ; \
-	inc	src ; \
-	brgz,pt	len, 1b ; \
+	brz,pn	%g1, 3f ; /* copied the NUL: %g1 == 0 is the result */ \
+	 inc	src ; \
+	b	%xcc, 1b ; \
 	 inc	dst ; \
-	mov	ENAMETOOLONG, %o5 ; \
-2:	brnz,a	done, 3f ; \
-	 stx	%o4, [done] ; \
-3:
+2:	mov	ENAMETOOLONG, %g1 ; \
+3:	sub	src, %g2, %g2 ; /* bytes consumed from src */ \
+	brnz,a	done, 4f ; \
+	 stx	%g2, [done] ; /* annulled when done == NULL */ \
+4:
-#define	COPYSTR(dst, src, len, done) \
-	_COPYSTR(dst, src, len, done, E, E, E, E)
+/*
+ * ASI independent implementation of memset(3).
+ * Used to implement bzero(), memset() and physzero().
+ *
+ * If the pattern is non-zero, duplicate it to fill 64 bits.
+ * Store bytes until dst is 8-byte aligned, then store 8 bytes.
+ * It has yet to be determined how much unrolling is beneficial.
+ * Could also read and compare before writing to minimize snoop traffic.
+ *
+ * XXX bzero() should be implemented as
+ *	#define bzero(dst, len) (void)memset((dst), 0, (len))
+ *	if at all.
+ */
+#define	_MEMSET(dst, pat, len, da, dasi) \
+	brlez,pn len, 5f ; /* nothing to store when len <= 0 */ \
+	 and	pat, 0xff, pat ; /* delay slot: keep only the low byte */ \
+	brz,pt	pat, 1f ; /* an all-zero pattern needs no replication */ \
+	 sllx	pat, 8, %g1 ; \
+	or	pat, %g1, pat ; /* pattern now fills 16 bits */ \
+	sllx	pat, 16, %g1 ; \
+	or	pat, %g1, pat ; /* ... 32 bits */ \
+	sllx	pat, 32, %g1 ; \
+	or	pat, %g1, pat ; /* ... 64 bits */ \
+	.align 16 ; \
+1:	deccc	1, len ; /* loop 1: single bytes until dst is 8-byte aligned */ \
+	bl,pn	%xcc, 5f ; /* len exhausted */ \
+	 btst	7, dst ; /* delay slot: test the low 3 address bits */ \
+	bz,a,pt	%xcc, 2f ; \
+	 inc	1, len ; /* aligned: give back the byte just deducted */ \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 1b ; \
+	 inc	dst ; \
+	.align 16 ; \
+2:	deccc	32, len ; /* loop 2: four 8-byte stores per pass */ \
+	bl,a,pn	%xcc, 3f ; \
+	 inc	32, len ; /* fewer than 32 bytes left: restore len */ \
+	ST(x, da) pat, [dst] dasi ; \
+	ST(x, da) pat, [dst + 8] dasi ; \
+	ST(x, da) pat, [dst + 16] dasi ; \
+	ST(x, da) pat, [dst + 24] dasi ; \
+	b	%xcc, 2b ; \
+	 inc	32, dst ; \
+	.align 16 ; \
+3:	deccc	8, len ; /* loop 3: one 8-byte store per pass */ \
+	bl,a,pn	%xcc, 4f ; \
+	 inc	8, len ; /* fewer than 8 bytes left: restore len */ \
+	ST(x, da) pat, [dst] dasi ; \
+	b	%xcc, 3b ; \
+	 inc	8, dst ; \
+	.align 16 ; \
+4:	deccc	1, len ; /* loop 4: remaining tail bytes */ \
+	bl,a,pn	%xcc, 5f ; \
+	 nop ; \
+	ST(b, da) pat, [dst] dasi ; \
+	b	%xcc, 4b ; \
+	 inc	1, dst ; \
+5:
-#define	COPYINSTR(uaddr, kaddr, len, done) \
-	wr	%g0, ASI_AIUP, %asi ; \
-	_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
+/*
+ * ASI independent implementation of memcpy(3).
+ * Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
+ *
+ * Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
+ * aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
+ * case could be optimized, but it is expected that this is the uncommon
+ * case and of questionable value. The code to do so is also rather large
+ * and ugly.
+ * It has yet to be determined how much unrolling is beneficial.
+ *
+ * XXX bcopy() must also check for overlap. This is stupid.
+ * XXX bcopy() should be implemented as
+ *	#define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
+ *	if at all.
+ */ +#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \ +1: deccc 1, len ; \ + bl,pn %xcc, 6f ; \ + btst 7, dst ; \ + bz,a,pt %xcc, 2f ; \ + inc 1, len ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc 1, src ; \ + b %xcc, 1b ; \ + inc 1, dst ; \ + .align 16 ; \ +2: btst 7, src ; \ + bz,a,pt %xcc, 3f ; \ + nop ; \ + b,a %xcc, 5f ; \ + .align 16 ; \ +3: deccc 32, len ; \ + bl,a,pn %xcc, 4f ; \ + inc 32, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + LD(x, sa) [src + 8] sasi, %g2 ; \ + LD(x, sa) [src + 16] sasi, %g3 ; \ + LD(x, sa) [src + 24] sasi, %g4 ; \ + ST(x, da) %g1, [dst] dasi ; \ + ST(x, da) %g2, [dst + 8] dasi ; \ + ST(x, da) %g3, [dst + 16] dasi ; \ + ST(x, da) %g4, [dst + 24] dasi ; \ + inc 32, src ; \ + b %xcc, 3b ; \ + inc 32, dst ; \ + .align 16 ; \ +4: deccc 8, len ; \ + bl,a,pn %xcc, 5f ; \ + inc 8, len ; \ + LD(x, sa) [src] sasi, %g1 ; \ + ST(x, da) %g1, [dst] dasi ; \ + inc 8, src ; \ + b %xcc, 4b ; \ + inc 8, dst ; \ + .align 16 ; \ +5: deccc 1, len ; \ + bl,a,pn %xcc, 6f ; \ + nop ; \ + LD(ub, sa) [src] sasi, %g1 ; \ + ST(b, da) %g1, [dst] dasi ; \ + inc src ; \ + b %xcc, 5b ; \ + inc dst ; \ +6: #define CATCH_SETUP(label) \ setx label, %g2, %g1 ; \ - ldx [PCPU(CURPCB)], %g6 ; \ + ldx [PCPU(CURTHREAD)], %g6 ; \ + ldx [%g6 + TD_PCB], %g6 ; \ stx %g1, [%g6 + PCB_ONFAULT] ; #define CATCH_END() \ @@ -119,7 +241,7 @@ SU_ALIGNED(storer, label) /* - * void bcmp(void *b, size_t len) + * int bcmp(const void *b1, const void *b2, size_t len) */ ENTRY(bcmp) brz,pn %o2, 2f @@ -127,7 +249,7 @@ ENTRY(bcmp) 1: ldub [%o0 + %o3], %o4 ldub [%o1 + %o3], %o5 cmp %o4, %o5 - bne,pn %xcc, 1f + bne,pn %xcc, 2f inc %o3 deccc %o2 bne,pt %xcc, 1b @@ -139,45 +261,89 @@ END(bcmp) /* * void bcopy(const void *src, void *dst, size_t len) */ +ENTRY(ovbcopy) ENTRY(bcopy) - BCOPY(%o0, %o1, %o2) + /* + * Check for overlap, and copy backwards if so. + */ + sub %o1, %o0, %g1 + cmp %g1, %o2 + bgeu,a,pt %xcc, 3f + nop + + /* + * Copy backwards. 
+ */ + add %o0, %o2, %o0 + add %o1, %o2, %o1 +1: deccc 1, %o2 + bl,a,pn %xcc, 2f + nop + dec 1, %o0 + ldub [%o0], %g1 + dec 1, %o1 + b %xcc, 1b + stb %g1, [%o1] +2: retl + nop + + /* + * Do the fast version. + */ +3: _MEMCPY(%o1, %o0, %o2, E, E, E, E) retl nop END(bcopy) -/* - * void ovbcopy(const void *src, void *dst, size_t len) - * XXX handle overlap... - */ -ENTRY(ovbcopy) - BCOPY(%o0, %o1, %o2) - retl - nop -END(ovbcopy) - /* * void bzero(void *b, size_t len) */ ENTRY(bzero) - brz,pn %o1, 1f - nop -1: deccc %o1 - stb %g0, [%o0] - bne,pt %xcc, 1b - inc %o0 -2: retl + _MEMSET(%o0, %g0, %o1, E, E) + retl nop END(bzero) +/* + * void physzero(vm_offset_t pa, size_t len) + */ +ENTRY(physzero) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMSET(%o0, %g0, %o1, a, %asi) + retl + nop +END(physzero) + +/* + * void physcopy(vm_offset_t src, vm_offset_t dst, size_t len) + */ +ENTRY(physcopy) + wr %g0, ASI_PHYS_USE_EC, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi) + retl + nop +END(physcopy) + /* * void *memcpy(void *dst, const void *src, size_t len) */ ENTRY(memcpy) - BCOPY(%o1, %o0, %o2) + mov %o0, %o3 + _MEMCPY(%o3, %o1, %o2, E, E, E, E) retl nop END(memcpy) +/* + * void *memset(void *b, int c, size_t len) + */ +ENTRY(memset) + mov %o0, %o3 + _MEMSET(%o3, %o1, %o2, E, E) + retl + nop +END(memset) + /* * int copyin(const void *uaddr, void *kaddr, size_t len) */ @@ -191,7 +357,8 @@ ENTRY(copyin) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYIN(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, E, E, a, %asi) CATCH_END() retl clr %o0 @@ -211,10 +378,11 @@ ENTRY(copyinstr) stx %o3, [%g1 + KTR_PARM4] 9: #endif - COPYINSTR(%o0, %o1, %o2, %o3) + wr %g0, ASI_AIUP, %asi + _COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E) CATCH_END() retl - mov %o5, %o0 + mov %g1, %o0 END(copyinstr) /* @@ -230,7 +398,8 @@ ENTRY(copyout) stx %o2, [%o3 + KTR_PARM3] 9: #endif - COPYOUT(%o0, %o1, %o2) + wr %g0, ASI_AIUP, %asi + _MEMCPY(%o1, %o0, %o2, a, %asi, E, E) CATCH_END() retl clr %o0 @@ -250,9 
+419,9 @@ END(copyout) * int copystr(const void *src, void *dst, size_t len, size_t *done) */ ENTRY(copystr) - COPYSTR(%o0, %o1, %o2, %o3) + _COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E) retl - mov %o5, %o0 + mov %g1, %o0 END(copystr) /* @@ -325,7 +494,6 @@ ENTRY(fsbail) .Lfsalign: retl mov -1, %o0 -END(fsbail) ENTRY(longjmp) set 1, %g3 @@ -354,65 +522,18 @@ ENTRY(setjmp) clr %o0 END(setjmp) -/* - * Temporary stack for calling into the firmware. We need to setup one, because - * the MMU mapping for our stack page may be lost. When the firmware tries to - * spill the last window (the others are flushed before), this results in an - * DMMU miss trap, which is fatal with the firmware trap handlers installed. - * Additionally, it seems that the firmware does not immediately switch to an - * own stack (or maybe never?), therefore more space needs to be reserved. - * I hope this is sufficient now. - */ - .align 4 -DATA(ofwstack) - .rept CCFSZ * 8 - .byte 0 - .endr -ofwstack_last: - .rept CCFSZ - .byte 0 - .endr -END(ofwstack) - /* * void openfirmware(cell_t args[]) */ ENTRY(openfirmware) - /* - * Disable interrupts. The firmware should not deal with our interrupts - * anyway, and the temporary stack is not large enough to hold the stack - * footprint of the interrrupt handling. - */ - rdpr %pstate, %o3 - andn %o3, PSTATE_IE, %o1 - wrpr %o1, 0, %pstate - setx ofwstack_last - SPOFF, %o1, %o2 - save %o2, 0, %sp - flushw - rdpr %tl, %l1 - rdpr %tba, %l2 - mov AA_DMMU_PCXR, %l3 - ldxa [%l3] ASI_DMMU, %l4 - stxa %g0, [%l3] ASI_DMMU - membar #Sync - flush %sp - setx ofw_tba, %l7, %l5 - ldx [%l5], %l5 + save %sp, -CCFSZ, %sp setx ofw_vec, %l7, %l6 ldx [%l6], %l6 rdpr %pil, %l7 - wrpr %g0, 14, %pil - wrpr %l5, 0, %tba - wrpr %g0, 0, %tl + wrpr %g0, PIL_TICK, %pil call %l6 mov %i0, %o0 - wrpr %l1, 0, %tl - wrpr %l2, 0, %tba - stxa %l4, [%l3] ASI_DMMU wrpr %l7, 0, %pil - membar #Sync - flush %sp - restore - retl - wrpr %o3, 0, %pstate + ret + restore %o0, %g0, %o0 END(openfirmware)