Optimize bcopy, bzero, etc. to use 64-bit loads and stores if possible.

Handle overlap in bcopy.
Add routines for copying and zeroing pages using physical addresses
directly.
Remove all the hacks to account for calling the firmware on its own
trap table; we use the kernel trap table.  There is still a problem
with OF_exit().
This commit is contained in:
Jake Burkholder 2001-09-30 19:50:39 +00:00
parent c4bc2cc714
commit fa753b0bcb
2 changed files with 480 additions and 238 deletions

View File

@ -33,62 +33,184 @@
#include "assym.s"
#define E
#define E /* empty */
/*
* Generate load and store instructions for the corresponding width and asi
* (or not). Note that we want to evaluate the macro args before
* concatenating, so that E really turns into nothing.
*
* w is the width suffix (ub, b, x, ...) and a is either E (empty) or the
* letter a. LD(ub, E) therefore pastes to ldub and LD(x, a) pastes to ldxa.
* Two levels are needed because the ## operator suppresses expansion of its
* operands: LD()/ST() expand E first, then _LD()/_ST() do the pasting.
*/
#define _LD(w, a) ld ## w ## a
#define _ST(w, a) st ## w ## a
#define LD(w, a) _LD(w, a)
#define ST(w, a) _ST(w, a)
/*
* Copy len bytes from [src] to [dst], one byte per iteration.
*
* sa/sasi and da/dasi select the load and store flavour: E, E generates a
* normal access, a and %asi generates the alternate-space form through %asi.
* src and dst are advanced past the copied bytes. len is only read.
* Clobbers %o3 (bytes remaining) and %o4 (data byte).
* Defines the numeric labels 1 and 2.
*/
#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \
brz,pn len, 2f ; /* nothing to copy when len is zero */ \
mov len, %o3 ; /* delay slot: %o3 = bytes remaining */ \
1: LD(ub, sa) [src] sasi, %o4 ; \
ST(b, da) %o4, [dst] dasi ; \
dec %o3 ; \
inc src ; \
brnz,pt %o3, 1b ; /* loop while bytes remain */ \
inc dst ; /* delay slot: runs on every iteration */ \
2:
/*
* Convenience wrappers around _BCOPY:
*   BCOPY   - normal loads and stores on both sides.
*   COPYIN  - loads from uaddr go through %asi, set here to ASI_AIUP.
*   COPYOUT - stores to uaddr go through %asi, set here to ASI_AIUP.
* All three inherit the register side effects of _BCOPY. COPYIN and COPYOUT
* additionally overwrite %asi.
*/
#define BCOPY(src, dst, len) \
_BCOPY(src, dst, len, E, E, E, E)
#define COPYIN(uaddr, kaddr, len) \
wr %g0, ASI_AIUP, %asi ; \
_BCOPY(uaddr, kaddr, len, a, %asi, E, E)
#define COPYOUT(kaddr, uaddr, len) \
wr %g0, ASI_AIUP, %asi ; \
_BCOPY(kaddr, uaddr, len, E, E, a, %asi)
/*
* Common code for copy routines.
*
* We use large macros to generate functions for each of the copy routines.
* This allows the load and store instructions to be generated for the right
* operation, asi or not. It is possible to write an asi independent function
* but this would require 2 expensive wrs in the main loop to switch %asi.
* It would also screw up profiling (if we ever get it), but may save some I$.
* We assume that either one of dasi and sasi is empty, or that they are both
* the same (empty or non-empty). It is up to the caller to set %asi.
*/
/*
* ASI independent implementation of copystr(9).
* Used to implement copyinstr() and copystr().
*
* Return value is in %g1.
*/
#define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
clr %o4 ; \
clr %o5 ; \
1: LD(ub, sa) [src] sasi, %g1 ; \
brz len, 4f ; \
mov src, %g2 ; \
1: deccc 1, len ; \
bl,a,pn %xcc, 3f ; \
nop ; \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
brz,pn %g1, 2f ; \
inc %o4 ; \
dec len ; \
brz,pn %g1, 3f ; \
inc src ; \
brgz,pt len, 1b ; \
b %xcc, 1b ; \
inc dst ; \
mov ENAMETOOLONG, %o5 ; \
2: brnz,a done, 3f ; \
stx %o4, [done] ; \
3:
2: mov ENAMETOOLONG, %g1 ; \
3: sub src, %g2, %g2 ; \
brnz,a done, 4f ; \
stx %g2, [done] ; \
4:
#define COPYSTR(dst, src, len, done) \
_COPYSTR(dst, src, len, done, E, E, E, E)
/*
* ASI independent implementation of memset(3).
* Used to implement bzero(), memset() and physzero().
*
* If the pattern is non-zero, duplicate it to fill 64 bits.
* Store bytes until dst is 8-byte aligned, then store 8 bytes.
* It has yet to be determined how much unrolling is beneficial.
* Could also read and compare before writing to minimize snoop traffic.
*
* XXX bzero() should be implemented as
* #define bzero(dst, len) (void)memset((dst), 0, (len))
* if at all.
*
* Arguments are registers. dst, pat and len are all modified, and %g1 is
* used as scratch while the pattern is widened. da/dasi select the store
* flavour: E, E generates plain stores, a and %asi generates stores through
* %asi, which the caller must have set. Only the low 8 bits of pat are used.
* Defines the numeric labels 1 to 5. Execution falls out at label 5.
*
* The instruction following a branch is its delay slot. On the conditional
* branches with the a suffix used here, it is executed only when the branch
* is taken.
*/
#define _MEMSET(dst, pat, len, da, dasi) \
brlez,pn len, 5f ; /* nothing to do if len <= 0 (signed test) */ \
and pat, 0xff, pat ; /* delay slot: keep only the low byte */ \
brz,pt pat, 1f ; /* a zero pattern needs no replication */ \
sllx pat, 8, %g1 ; /* delay slot: start widening the byte */ \
or pat, %g1, pat ; /* pat now holds 2 copies of the byte */ \
sllx pat, 16, %g1 ; \
or pat, %g1, pat ; /* 4 copies */ \
sllx pat, 32, %g1 ; \
or pat, %g1, pat ; /* 8 copies, filling all 64 bits */ \
.align 16 ; \
1: deccc 1, len ; /* head: byte stores until dst is 8-byte aligned */ \
bl,pn %xcc, 5f ; /* len exhausted */ \
btst 7, dst ; /* delay slot: test the low 3 address bits */ \
bz,a,pt %xcc, 2f ; /* aligned: continue with the 32 byte loop */ \
inc 1, len ; /* taken only: undo the deccc above */ \
ST(b, da) pat, [dst] dasi ; \
b %xcc, 1b ; \
inc dst ; /* delay slot */ \
.align 16 ; \
2: deccc 32, len ; /* unrolled loop: 32 bytes per iteration */ \
bl,a,pn %xcc, 3f ; /* fewer than 32 bytes left */ \
inc 32, len ; /* taken only: restore len */ \
ST(x, da) pat, [dst] dasi ; \
ST(x, da) pat, [dst + 8] dasi ; \
ST(x, da) pat, [dst + 16] dasi ; \
ST(x, da) pat, [dst + 24] dasi ; \
b %xcc, 2b ; \
inc 32, dst ; /* delay slot */ \
.align 16 ; \
3: deccc 8, len ; /* 8 bytes per iteration */ \
bl,a,pn %xcc, 4f ; /* fewer than 8 bytes left */ \
inc 8, len ; /* taken only: restore len */ \
ST(x, da) pat, [dst] dasi ; \
b %xcc, 3b ; \
inc 8, dst ; /* delay slot */ \
.align 16 ; \
4: deccc 1, len ; /* tail: remaining bytes one at a time */ \
bl,a,pn %xcc, 5f ; \
nop ; \
ST(b, da) pat, [dst] dasi ; \
b %xcc, 4b ; \
inc 1, dst ; /* delay slot */ \
5:
#define COPYINSTR(uaddr, kaddr, len, done) \
wr %g0, ASI_AIUP, %asi ; \
_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
/*
* ASI independent implementation of memcpy(3).
* Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
*
* Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
* aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
* case could be optimized, but it is expected that this is the uncommon
* case and of questionable value. The code to do so is also rather large
* and ugly.
* It has yet to be determined how much unrolling is beneficial.
*
* XXX bcopy() must also check for overlap. This is stupid.
* XXX bcopy() should be implemented as
* #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
* if at all.
*
* Note the argument order: destination first, then source. Arguments are
* registers. dst, src and len are all modified, and %g1 to %g4 are
* clobbered. da/dasi select the store flavour and sa/sasi the load flavour:
* E, E generates a normal access, a and %asi generates an access through
* %asi, which the caller must have set.
* The copy always runs from low to high addresses.
* Defines the numeric labels 1 to 6. Execution falls out at label 6.
*
* The instruction following a branch is its delay slot. On the conditional
* branches with the a suffix used here, it is executed only when the branch
* is taken.
*/
#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \
1: deccc 1, len ; /* head: byte copy until dst is 8-byte aligned */ \
bl,pn %xcc, 6f ; /* all bytes copied */ \
btst 7, dst ; /* delay slot: test the low 3 bits of dst */ \
bz,a,pt %xcc, 2f ; /* dst aligned: leave the head loop */ \
inc 1, len ; /* taken only: undo the deccc above */ \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
inc 1, src ; \
b %xcc, 1b ; \
inc 1, dst ; /* delay slot */ \
.align 16 ; \
2: btst 7, src ; /* is src 8-byte aligned as well? */ \
bz,a,pt %xcc, 3f ; /* yes: use the 64 bit loops */ \
nop ; \
b,a %xcc, 5f ; /* no: finish with the byte loop */ \
.align 16 ; \
3: deccc 32, len ; /* unrolled loop: 32 bytes per iteration */ \
bl,a,pn %xcc, 4f ; /* fewer than 32 bytes left */ \
inc 32, len ; /* taken only: restore len */ \
LD(x, sa) [src] sasi, %g1 ; \
LD(x, sa) [src + 8] sasi, %g2 ; \
LD(x, sa) [src + 16] sasi, %g3 ; \
LD(x, sa) [src + 24] sasi, %g4 ; \
ST(x, da) %g1, [dst] dasi ; \
ST(x, da) %g2, [dst + 8] dasi ; \
ST(x, da) %g3, [dst + 16] dasi ; \
ST(x, da) %g4, [dst + 24] dasi ; \
inc 32, src ; \
b %xcc, 3b ; \
inc 32, dst ; /* delay slot */ \
.align 16 ; \
4: deccc 8, len ; /* 8 bytes per iteration */ \
bl,a,pn %xcc, 5f ; /* fewer than 8 bytes left */ \
inc 8, len ; /* taken only: restore len */ \
LD(x, sa) [src] sasi, %g1 ; \
ST(x, da) %g1, [dst] dasi ; \
inc 8, src ; \
b %xcc, 4b ; \
inc 8, dst ; /* delay slot */ \
.align 16 ; \
5: deccc 1, len ; /* tail: remaining bytes one at a time */ \
bl,a,pn %xcc, 6f ; \
nop ; \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
inc src ; \
b %xcc, 5b ; \
inc dst ; /* delay slot */ \
6:
#define CATCH_SETUP(label) \
setx label, %g2, %g1 ; \
ldx [PCPU(CURPCB)], %g6 ; \
ldx [PCPU(CURTHREAD)], %g6 ; \
ldx [%g6 + TD_PCB], %g6 ; \
stx %g1, [%g6 + PCB_ONFAULT] ;
#define CATCH_END() \
@ -119,7 +241,7 @@
SU_ALIGNED(storer, label)
/*
* void bcmp(void *b, size_t len)
* int bcmp(const void *b1, const void *b2, size_t len)
*/
ENTRY(bcmp)
brz,pn %o2, 2f
@ -127,7 +249,7 @@ ENTRY(bcmp)
1: ldub [%o0 + %o3], %o4
ldub [%o1 + %o3], %o5
cmp %o4, %o5
bne,pn %xcc, 1f
bne,pn %xcc, 2f
inc %o3
deccc %o2
bne,pt %xcc, 1b
@ -139,45 +261,89 @@ END(bcmp)
/*
* void bcopy(const void *src, void *dst, size_t len)
*/
ENTRY(ovbcopy)
ENTRY(bcopy)
BCOPY(%o0, %o1, %o2)
/*
* Check for overlap, and copy backwards if so.
*/
sub %o1, %o0, %g1
cmp %g1, %o2
bgeu,a,pt %xcc, 3f
nop
/*
* Copy backwards.
*/
add %o0, %o2, %o0
add %o1, %o2, %o1
1: deccc 1, %o2
bl,a,pn %xcc, 2f
nop
dec 1, %o0
ldub [%o0], %g1
dec 1, %o1
b %xcc, 1b
stb %g1, [%o1]
2: retl
nop
/*
* Do the fast version.
*/
3: _MEMCPY(%o1, %o0, %o2, E, E, E, E)
retl
nop
END(bcopy)
/*
 * void ovbcopy(const void *src, void *dst, size_t len)
 *
 * Copy len bytes from src (%o0) to dst (%o1). The two regions may overlap.
 *
 * A forward copy is only unsafe when dst lies inside [src, src + len),
 * because it would then overwrite source bytes before reading them. In
 * that case the copy runs from the last byte down to the first.
 *
 * Clobbers %o0-%o4 and %g1.
 */
ENTRY(ovbcopy)
	/*
	 * As an unsigned value, (dst - src) >= len exactly when dst is
	 * outside [src, src + len). That also covers dst < src, where the
	 * difference wraps to a large value.
	 */
	sub	%o1, %o0, %g1
	cmp	%g1, %o2
	bgeu,a,pt %xcc, 3f
	 nop
	/*
	 * Harmful overlap: point both registers one past the end and copy
	 * backwards, one byte at a time.
	 */
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
1:	deccc	1, %o2
	bl,a,pn	%xcc, 2f
	 nop
	dec	1, %o0
	ldub	[%o0], %g1
	dec	1, %o1
	b	%xcc, 1b
	 stb	%g1, [%o1]
2:	retl
	 nop
	/*
	 * No harmful overlap: plain forward byte copy.
	 */
3:	BCOPY(%o0, %o1, %o2)
	retl
	 nop
END(ovbcopy)
/*
* void bzero(void *b, size_t len)
*/
ENTRY(bzero)
brz,pn %o1, 1f
nop
1: deccc %o1
stb %g0, [%o0]
bne,pt %xcc, 1b
inc %o0
2: retl
_MEMSET(%o0, %g0, %o1, E, E)
retl
nop
END(bzero)
/*
* void physzero(vm_offset_t pa, size_t len)
*
* Zero len (%o1) bytes starting at pa (%o0).
* %asi is set to ASI_PHYS_USE_EC and _MEMSET is instantiated with the
* alternate-space store forms, so every store goes through that ASI.
* The pattern register is %g0, so the fill value is zero.
* Overwrites %asi, plus %o0, %o1 and %g1 via _MEMSET.
*/
ENTRY(physzero)
wr %g0, ASI_PHYS_USE_EC, %asi
_MEMSET(%o0, %g0, %o1, a, %asi)
retl
nop
END(physzero)
/*
* void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
*
* Copy len (%o2) bytes from src (%o0) to dst (%o1).
* %asi is set to ASI_PHYS_USE_EC and _MEMCPY is instantiated with the
* alternate-space forms for both loads and stores, so both sides of the
* copy go through that ASI. _MEMCPY takes the destination first, hence
* the swapped register order below.
* No overlap check is made. The copy runs from low to high addresses.
* Overwrites %asi, plus %o0-%o2 and %g1-%g4 via _MEMCPY.
*/
ENTRY(physcopy)
wr %g0, ASI_PHYS_USE_EC, %asi
_MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi)
retl
nop
END(physcopy)
/*
* void *memcpy(void *dst, const void *src, size_t len)
*/
ENTRY(memcpy)
BCOPY(%o1, %o0, %o2)
mov %o0, %o3
_MEMCPY(%o3, %o1, %o2, E, E, E, E)
retl
nop
END(memcpy)
/*
* void *memset(void *b, int c, size_t len)
*
* Fill len (%o2) bytes at b (%o0) with the low byte of c (%o1).
* _MEMSET advances its destination register, so a copy of b in %o3 is used
* as the working pointer. %o0 is never written and still holds b, the
* return value, on exit.
* Clobbers %o1-%o3 and %g1 via _MEMSET.
*/
ENTRY(memset)
mov %o0, %o3
_MEMSET(%o3, %o1, %o2, E, E)
retl
nop
END(memset)
/*
* int copyin(const void *uaddr, void *kaddr, size_t len)
*/
@ -191,7 +357,8 @@ ENTRY(copyin)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
COPYIN(%o0, %o1, %o2)
wr %g0, ASI_AIUP, %asi
_MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
CATCH_END()
retl
clr %o0
@ -211,10 +378,11 @@ ENTRY(copyinstr)
stx %o3, [%g1 + KTR_PARM4]
9:
#endif
COPYINSTR(%o0, %o1, %o2, %o3)
wr %g0, ASI_AIUP, %asi
_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
CATCH_END()
retl
mov %o5, %o0
mov %g1, %o0
END(copyinstr)
/*
@ -230,7 +398,8 @@ ENTRY(copyout)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
COPYOUT(%o0, %o1, %o2)
wr %g0, ASI_AIUP, %asi
_MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
CATCH_END()
retl
clr %o0
@ -250,9 +419,9 @@ END(copyout)
* int copystr(const void *src, void *dst, size_t len, size_t *done)
*/
ENTRY(copystr)
COPYSTR(%o0, %o1, %o2, %o3)
_COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)
retl
mov %o5, %o0
mov %g1, %o0
END(copystr)
/*
@ -325,7 +494,6 @@ ENTRY(fsbail)
.Lfsalign:
retl
mov -1, %o0
END(fsbail)
ENTRY(longjmp)
set 1, %g3
@ -354,65 +522,18 @@ ENTRY(setjmp)
clr %o0
END(setjmp)
/*
* Temporary stack for calling into the firmware. We need to set one up, because
* the MMU mapping for our stack page may be lost. When the firmware tries to
* spill the last window (the others are flushed before), this results in a
* DMMU miss trap, which is fatal with the firmware trap handlers installed.
* Additionally, it seems that the firmware does not immediately switch to its
* own stack (or maybe never?), therefore more space needs to be reserved.
* I hope this is sufficient now.
*/
.align 4
DATA(ofwstack)
.rept CCFSZ * 8
.byte 0
.endr
ofwstack_last:
.rept CCFSZ
.byte 0
.endr
END(ofwstack)
/*
* void openfirmware(cell_t args[])
*/
ENTRY(openfirmware)
/*
* Disable interrupts. The firmware should not deal with our interrupts
* anyway, and the temporary stack is not large enough to hold the stack
* footprint of the interrupt handling.
*/
rdpr %pstate, %o3
andn %o3, PSTATE_IE, %o1
wrpr %o1, 0, %pstate
setx ofwstack_last - SPOFF, %o1, %o2
save %o2, 0, %sp
flushw
rdpr %tl, %l1
rdpr %tba, %l2
mov AA_DMMU_PCXR, %l3
ldxa [%l3] ASI_DMMU, %l4
stxa %g0, [%l3] ASI_DMMU
membar #Sync
flush %sp
setx ofw_tba, %l7, %l5
ldx [%l5], %l5
save %sp, -CCFSZ, %sp
setx ofw_vec, %l7, %l6
ldx [%l6], %l6
rdpr %pil, %l7
wrpr %g0, 14, %pil
wrpr %l5, 0, %tba
wrpr %g0, 0, %tl
wrpr %g0, PIL_TICK, %pil
call %l6
mov %i0, %o0
wrpr %l1, 0, %tl
wrpr %l2, 0, %tba
stxa %l4, [%l3] ASI_DMMU
wrpr %l7, 0, %pil
membar #Sync
flush %sp
restore
retl
wrpr %o3, 0, %pstate
ret
restore %o0, %g0, %o0
END(openfirmware)

View File

@ -33,62 +33,184 @@
#include "assym.s"
#define E
#define E /* empty */
/*
* Generate load and store instructions for the corresponding width and asi
* (or not). Note that we want to evaluate the macro args before
* concatenating, so that E really turns into nothing.
*
* w is the width suffix (ub, b, x, ...) and a is either E (empty) or the
* letter a. LD(ub, E) therefore pastes to ldub and LD(x, a) pastes to ldxa.
* Two levels are needed because the ## operator suppresses expansion of its
* operands: LD()/ST() expand E first, then _LD()/_ST() do the pasting.
*/
#define _LD(w, a) ld ## w ## a
#define _ST(w, a) st ## w ## a
#define LD(w, a) _LD(w, a)
#define ST(w, a) _ST(w, a)
/*
* Copy len bytes from [src] to [dst], one byte per iteration.
*
* sa/sasi and da/dasi select the load and store flavour: E, E generates a
* normal access, a and %asi generates the alternate-space form through %asi.
* src and dst are advanced past the copied bytes. len is only read.
* Clobbers %o3 (bytes remaining) and %o4 (data byte).
* Defines the numeric labels 1 and 2.
*/
#define _BCOPY(src, dst, len, sa, sasi, da, dasi) \
brz,pn len, 2f ; /* nothing to copy when len is zero */ \
mov len, %o3 ; /* delay slot: %o3 = bytes remaining */ \
1: LD(ub, sa) [src] sasi, %o4 ; \
ST(b, da) %o4, [dst] dasi ; \
dec %o3 ; \
inc src ; \
brnz,pt %o3, 1b ; /* loop while bytes remain */ \
inc dst ; /* delay slot: runs on every iteration */ \
2:
/*
* Convenience wrappers around _BCOPY:
*   BCOPY   - normal loads and stores on both sides.
*   COPYIN  - loads from uaddr go through %asi, set here to ASI_AIUP.
*   COPYOUT - stores to uaddr go through %asi, set here to ASI_AIUP.
* All three inherit the register side effects of _BCOPY. COPYIN and COPYOUT
* additionally overwrite %asi.
*/
#define BCOPY(src, dst, len) \
_BCOPY(src, dst, len, E, E, E, E)
#define COPYIN(uaddr, kaddr, len) \
wr %g0, ASI_AIUP, %asi ; \
_BCOPY(uaddr, kaddr, len, a, %asi, E, E)
#define COPYOUT(kaddr, uaddr, len) \
wr %g0, ASI_AIUP, %asi ; \
_BCOPY(kaddr, uaddr, len, E, E, a, %asi)
/*
* Common code for copy routines.
*
* We use large macros to generate functions for each of the copy routines.
* This allows the load and store instructions to be generated for the right
* operation, asi or not. It is possible to write an asi independent function
* but this would require 2 expensive wrs in the main loop to switch %asi.
* It would also screw up profiling (if we ever get it), but may save some I$.
* We assume that either one of dasi and sasi is empty, or that they are both
* the same (empty or non-empty). It is up to the caller to set %asi.
*/
/*
* ASI independent implementation of copystr(9).
* Used to implement copyinstr() and copystr().
*
* Return value is in %g1.
*/
#define _COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
clr %o4 ; \
clr %o5 ; \
1: LD(ub, sa) [src] sasi, %g1 ; \
brz len, 4f ; \
mov src, %g2 ; \
1: deccc 1, len ; \
bl,a,pn %xcc, 3f ; \
nop ; \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
brz,pn %g1, 2f ; \
inc %o4 ; \
dec len ; \
brz,pn %g1, 3f ; \
inc src ; \
brgz,pt len, 1b ; \
b %xcc, 1b ; \
inc dst ; \
mov ENAMETOOLONG, %o5 ; \
2: brnz,a done, 3f ; \
stx %o4, [done] ; \
3:
2: mov ENAMETOOLONG, %g1 ; \
3: sub src, %g2, %g2 ; \
brnz,a done, 4f ; \
stx %g2, [done] ; \
4:
#define COPYSTR(dst, src, len, done) \
_COPYSTR(dst, src, len, done, E, E, E, E)
/*
* ASI independent implementation of memset(3).
* Used to implement bzero(), memset() and physzero().
*
* If the pattern is non-zero, duplicate it to fill 64 bits.
* Store bytes until dst is 8-byte aligned, then store 8 bytes.
* It has yet to be determined how much unrolling is beneficial.
* Could also read and compare before writing to minimize snoop traffic.
*
* XXX bzero() should be implemented as
* #define bzero(dst, len) (void)memset((dst), 0, (len))
* if at all.
*
* Arguments are registers. dst, pat and len are all modified, and %g1 is
* used as scratch while the pattern is widened. da/dasi select the store
* flavour: E, E generates plain stores, a and %asi generates stores through
* %asi, which the caller must have set. Only the low 8 bits of pat are used.
* Defines the numeric labels 1 to 5. Execution falls out at label 5.
*
* The instruction following a branch is its delay slot. On the conditional
* branches with the a suffix used here, it is executed only when the branch
* is taken.
*/
#define _MEMSET(dst, pat, len, da, dasi) \
brlez,pn len, 5f ; /* nothing to do if len <= 0 (signed test) */ \
and pat, 0xff, pat ; /* delay slot: keep only the low byte */ \
brz,pt pat, 1f ; /* a zero pattern needs no replication */ \
sllx pat, 8, %g1 ; /* delay slot: start widening the byte */ \
or pat, %g1, pat ; /* pat now holds 2 copies of the byte */ \
sllx pat, 16, %g1 ; \
or pat, %g1, pat ; /* 4 copies */ \
sllx pat, 32, %g1 ; \
or pat, %g1, pat ; /* 8 copies, filling all 64 bits */ \
.align 16 ; \
1: deccc 1, len ; /* head: byte stores until dst is 8-byte aligned */ \
bl,pn %xcc, 5f ; /* len exhausted */ \
btst 7, dst ; /* delay slot: test the low 3 address bits */ \
bz,a,pt %xcc, 2f ; /* aligned: continue with the 32 byte loop */ \
inc 1, len ; /* taken only: undo the deccc above */ \
ST(b, da) pat, [dst] dasi ; \
b %xcc, 1b ; \
inc dst ; /* delay slot */ \
.align 16 ; \
2: deccc 32, len ; /* unrolled loop: 32 bytes per iteration */ \
bl,a,pn %xcc, 3f ; /* fewer than 32 bytes left */ \
inc 32, len ; /* taken only: restore len */ \
ST(x, da) pat, [dst] dasi ; \
ST(x, da) pat, [dst + 8] dasi ; \
ST(x, da) pat, [dst + 16] dasi ; \
ST(x, da) pat, [dst + 24] dasi ; \
b %xcc, 2b ; \
inc 32, dst ; /* delay slot */ \
.align 16 ; \
3: deccc 8, len ; /* 8 bytes per iteration */ \
bl,a,pn %xcc, 4f ; /* fewer than 8 bytes left */ \
inc 8, len ; /* taken only: restore len */ \
ST(x, da) pat, [dst] dasi ; \
b %xcc, 3b ; \
inc 8, dst ; /* delay slot */ \
.align 16 ; \
4: deccc 1, len ; /* tail: remaining bytes one at a time */ \
bl,a,pn %xcc, 5f ; \
nop ; \
ST(b, da) pat, [dst] dasi ; \
b %xcc, 4b ; \
inc 1, dst ; /* delay slot */ \
5:
#define COPYINSTR(uaddr, kaddr, len, done) \
wr %g0, ASI_AIUP, %asi ; \
_COPYSTR(uaddr, kaddr, len, done, a, %asi, E, E)
/*
* ASI independent implementation of memcpy(3).
* Used to implement bcopy(), copyin(), copyout(), memcpy(), and physcopy().
*
* Transfer bytes until dst is 8-byte aligned. If src is then also 8 byte
* aligned, transfer 8 bytes, otherwise finish with bytes. The unaligned
* case could be optimized, but it is expected that this is the uncommon
* case and of questionable value. The code to do so is also rather large
* and ugly.
* It has yet to be determined how much unrolling is beneficial.
*
* XXX bcopy() must also check for overlap. This is stupid.
* XXX bcopy() should be implemented as
* #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
* if at all.
*
* Note the argument order: destination first, then source. Arguments are
* registers. dst, src and len are all modified, and %g1 to %g4 are
* clobbered. da/dasi select the store flavour and sa/sasi the load flavour:
* E, E generates a normal access, a and %asi generates an access through
* %asi, which the caller must have set.
* The copy always runs from low to high addresses.
* Defines the numeric labels 1 to 6. Execution falls out at label 6.
*
* The instruction following a branch is its delay slot. On the conditional
* branches with the a suffix used here, it is executed only when the branch
* is taken.
*/
#define _MEMCPY(dst, src, len, da, dasi, sa, sasi) \
1: deccc 1, len ; /* head: byte copy until dst is 8-byte aligned */ \
bl,pn %xcc, 6f ; /* all bytes copied */ \
btst 7, dst ; /* delay slot: test the low 3 bits of dst */ \
bz,a,pt %xcc, 2f ; /* dst aligned: leave the head loop */ \
inc 1, len ; /* taken only: undo the deccc above */ \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
inc 1, src ; \
b %xcc, 1b ; \
inc 1, dst ; /* delay slot */ \
.align 16 ; \
2: btst 7, src ; /* is src 8-byte aligned as well? */ \
bz,a,pt %xcc, 3f ; /* yes: use the 64 bit loops */ \
nop ; \
b,a %xcc, 5f ; /* no: finish with the byte loop */ \
.align 16 ; \
3: deccc 32, len ; /* unrolled loop: 32 bytes per iteration */ \
bl,a,pn %xcc, 4f ; /* fewer than 32 bytes left */ \
inc 32, len ; /* taken only: restore len */ \
LD(x, sa) [src] sasi, %g1 ; \
LD(x, sa) [src + 8] sasi, %g2 ; \
LD(x, sa) [src + 16] sasi, %g3 ; \
LD(x, sa) [src + 24] sasi, %g4 ; \
ST(x, da) %g1, [dst] dasi ; \
ST(x, da) %g2, [dst + 8] dasi ; \
ST(x, da) %g3, [dst + 16] dasi ; \
ST(x, da) %g4, [dst + 24] dasi ; \
inc 32, src ; \
b %xcc, 3b ; \
inc 32, dst ; /* delay slot */ \
.align 16 ; \
4: deccc 8, len ; /* 8 bytes per iteration */ \
bl,a,pn %xcc, 5f ; /* fewer than 8 bytes left */ \
inc 8, len ; /* taken only: restore len */ \
LD(x, sa) [src] sasi, %g1 ; \
ST(x, da) %g1, [dst] dasi ; \
inc 8, src ; \
b %xcc, 4b ; \
inc 8, dst ; /* delay slot */ \
.align 16 ; \
5: deccc 1, len ; /* tail: remaining bytes one at a time */ \
bl,a,pn %xcc, 6f ; \
nop ; \
LD(ub, sa) [src] sasi, %g1 ; \
ST(b, da) %g1, [dst] dasi ; \
inc src ; \
b %xcc, 5b ; \
inc dst ; /* delay slot */ \
6:
#define CATCH_SETUP(label) \
setx label, %g2, %g1 ; \
ldx [PCPU(CURPCB)], %g6 ; \
ldx [PCPU(CURTHREAD)], %g6 ; \
ldx [%g6 + TD_PCB], %g6 ; \
stx %g1, [%g6 + PCB_ONFAULT] ;
#define CATCH_END() \
@ -119,7 +241,7 @@
SU_ALIGNED(storer, label)
/*
* void bcmp(void *b, size_t len)
* int bcmp(const void *b1, const void *b2, size_t len)
*/
ENTRY(bcmp)
brz,pn %o2, 2f
@ -127,7 +249,7 @@ ENTRY(bcmp)
1: ldub [%o0 + %o3], %o4
ldub [%o1 + %o3], %o5
cmp %o4, %o5
bne,pn %xcc, 1f
bne,pn %xcc, 2f
inc %o3
deccc %o2
bne,pt %xcc, 1b
@ -139,45 +261,89 @@ END(bcmp)
/*
* void bcopy(const void *src, void *dst, size_t len)
*/
ENTRY(ovbcopy)
ENTRY(bcopy)
BCOPY(%o0, %o1, %o2)
/*
* Check for overlap, and copy backwards if so.
*/
sub %o1, %o0, %g1
cmp %g1, %o2
bgeu,a,pt %xcc, 3f
nop
/*
* Copy backwards.
*/
add %o0, %o2, %o0
add %o1, %o2, %o1
1: deccc 1, %o2
bl,a,pn %xcc, 2f
nop
dec 1, %o0
ldub [%o0], %g1
dec 1, %o1
b %xcc, 1b
stb %g1, [%o1]
2: retl
nop
/*
* Do the fast version.
*/
3: _MEMCPY(%o1, %o0, %o2, E, E, E, E)
retl
nop
END(bcopy)
/*
 * void ovbcopy(const void *src, void *dst, size_t len)
 *
 * Copy len bytes from src (%o0) to dst (%o1). The two regions may overlap.
 *
 * A forward copy is only unsafe when dst lies inside [src, src + len),
 * because it would then overwrite source bytes before reading them. In
 * that case the copy runs from the last byte down to the first.
 *
 * Clobbers %o0-%o4 and %g1.
 */
ENTRY(ovbcopy)
	/*
	 * As an unsigned value, (dst - src) >= len exactly when dst is
	 * outside [src, src + len). That also covers dst < src, where the
	 * difference wraps to a large value.
	 */
	sub	%o1, %o0, %g1
	cmp	%g1, %o2
	bgeu,a,pt %xcc, 3f
	 nop
	/*
	 * Harmful overlap: point both registers one past the end and copy
	 * backwards, one byte at a time.
	 */
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
1:	deccc	1, %o2
	bl,a,pn	%xcc, 2f
	 nop
	dec	1, %o0
	ldub	[%o0], %g1
	dec	1, %o1
	b	%xcc, 1b
	 stb	%g1, [%o1]
2:	retl
	 nop
	/*
	 * No harmful overlap: plain forward byte copy.
	 */
3:	BCOPY(%o0, %o1, %o2)
	retl
	 nop
END(ovbcopy)
/*
* void bzero(void *b, size_t len)
*/
ENTRY(bzero)
brz,pn %o1, 1f
nop
1: deccc %o1
stb %g0, [%o0]
bne,pt %xcc, 1b
inc %o0
2: retl
_MEMSET(%o0, %g0, %o1, E, E)
retl
nop
END(bzero)
/*
* void physzero(vm_offset_t pa, size_t len)
*
* Zero len (%o1) bytes starting at pa (%o0).
* %asi is set to ASI_PHYS_USE_EC and _MEMSET is instantiated with the
* alternate-space store forms, so every store goes through that ASI.
* The pattern register is %g0, so the fill value is zero.
* Overwrites %asi, plus %o0, %o1 and %g1 via _MEMSET.
*/
ENTRY(physzero)
wr %g0, ASI_PHYS_USE_EC, %asi
_MEMSET(%o0, %g0, %o1, a, %asi)
retl
nop
END(physzero)
/*
* void physcopy(vm_offset_t src, vm_offset_t dst, size_t len)
*
* Copy len (%o2) bytes from src (%o0) to dst (%o1).
* %asi is set to ASI_PHYS_USE_EC and _MEMCPY is instantiated with the
* alternate-space forms for both loads and stores, so both sides of the
* copy go through that ASI. _MEMCPY takes the destination first, hence
* the swapped register order below.
* No overlap check is made. The copy runs from low to high addresses.
* Overwrites %asi, plus %o0-%o2 and %g1-%g4 via _MEMCPY.
*/
ENTRY(physcopy)
wr %g0, ASI_PHYS_USE_EC, %asi
_MEMCPY(%o1, %o0, %o2, a, %asi, a, %asi)
retl
nop
END(physcopy)
/*
* void *memcpy(void *dst, const void *src, size_t len)
*/
ENTRY(memcpy)
BCOPY(%o1, %o0, %o2)
mov %o0, %o3
_MEMCPY(%o3, %o1, %o2, E, E, E, E)
retl
nop
END(memcpy)
/*
* void *memset(void *b, int c, size_t len)
*
* Fill len (%o2) bytes at b (%o0) with the low byte of c (%o1).
* _MEMSET advances its destination register, so a copy of b in %o3 is used
* as the working pointer. %o0 is never written and still holds b, the
* return value, on exit.
* Clobbers %o1-%o3 and %g1 via _MEMSET.
*/
ENTRY(memset)
mov %o0, %o3
_MEMSET(%o3, %o1, %o2, E, E)
retl
nop
END(memset)
/*
* int copyin(const void *uaddr, void *kaddr, size_t len)
*/
@ -191,7 +357,8 @@ ENTRY(copyin)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
COPYIN(%o0, %o1, %o2)
wr %g0, ASI_AIUP, %asi
_MEMCPY(%o1, %o0, %o2, E, E, a, %asi)
CATCH_END()
retl
clr %o0
@ -211,10 +378,11 @@ ENTRY(copyinstr)
stx %o3, [%g1 + KTR_PARM4]
9:
#endif
COPYINSTR(%o0, %o1, %o2, %o3)
wr %g0, ASI_AIUP, %asi
_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, E, E)
CATCH_END()
retl
mov %o5, %o0
mov %g1, %o0
END(copyinstr)
/*
@ -230,7 +398,8 @@ ENTRY(copyout)
stx %o2, [%o3 + KTR_PARM3]
9:
#endif
COPYOUT(%o0, %o1, %o2)
wr %g0, ASI_AIUP, %asi
_MEMCPY(%o1, %o0, %o2, a, %asi, E, E)
CATCH_END()
retl
clr %o0
@ -250,9 +419,9 @@ END(copyout)
* int copystr(const void *src, void *dst, size_t len, size_t *done)
*/
ENTRY(copystr)
COPYSTR(%o0, %o1, %o2, %o3)
_COPYSTR(%o0, %o1, %o2, %o3, E, E, E, E)
retl
mov %o5, %o0
mov %g1, %o0
END(copystr)
/*
@ -325,7 +494,6 @@ ENTRY(fsbail)
.Lfsalign:
retl
mov -1, %o0
END(fsbail)
ENTRY(longjmp)
set 1, %g3
@ -354,65 +522,18 @@ ENTRY(setjmp)
clr %o0
END(setjmp)
/*
* Temporary stack for calling into the firmware. We need to set one up, because
* the MMU mapping for our stack page may be lost. When the firmware tries to
* spill the last window (the others are flushed before), this results in a
* DMMU miss trap, which is fatal with the firmware trap handlers installed.
* Additionally, it seems that the firmware does not immediately switch to its
* own stack (or maybe never?), therefore more space needs to be reserved.
* I hope this is sufficient now.
*/
.align 4
DATA(ofwstack)
.rept CCFSZ * 8
.byte 0
.endr
ofwstack_last:
.rept CCFSZ
.byte 0
.endr
END(ofwstack)
/*
* void openfirmware(cell_t args[])
*/
ENTRY(openfirmware)
/*
* Disable interrupts. The firmware should not deal with our interrupts
* anyway, and the temporary stack is not large enough to hold the stack
* footprint of the interrupt handling.
*/
rdpr %pstate, %o3
andn %o3, PSTATE_IE, %o1
wrpr %o1, 0, %pstate
setx ofwstack_last - SPOFF, %o1, %o2
save %o2, 0, %sp
flushw
rdpr %tl, %l1
rdpr %tba, %l2
mov AA_DMMU_PCXR, %l3
ldxa [%l3] ASI_DMMU, %l4
stxa %g0, [%l3] ASI_DMMU
membar #Sync
flush %sp
setx ofw_tba, %l7, %l5
ldx [%l5], %l5
save %sp, -CCFSZ, %sp
setx ofw_vec, %l7, %l6
ldx [%l6], %l6
rdpr %pil, %l7
wrpr %g0, 14, %pil
wrpr %l5, 0, %tba
wrpr %g0, 0, %tl
wrpr %g0, PIL_TICK, %pil
call %l6
mov %i0, %o0
wrpr %l1, 0, %tl
wrpr %l2, 0, %tba
stxa %l4, [%l3] ASI_DMMU
wrpr %l7, 0, %pil
membar #Sync
flush %sp
restore
retl
wrpr %o3, 0, %pstate
ret
restore %o0, %g0, %o0
END(openfirmware)