Add optimized block copy and zero functions using VIS instructions, which

can do 64 bytes at a time and don't allocate lines in the L2 cache.  These
assume that everything is 64 byte aligned, and that there's more than 128
bytes of data (best for whole pages).  The block load and store instructions
don't follow normal memory ordering rules and require either a memory barrier
or move between registers before the data can actually be used.  This
implementation correctly shuffles around 3 out of the 4 sets of registers
in order to avoid memory barriers except for the last 2 blocks.
This commit is contained in:
Jake Burkholder 2003-04-03 18:43:40 +00:00
parent c2a2b443e2
commit 6412c65cf0
2 changed files with 140 additions and 0 deletions

View File

@ -196,6 +196,9 @@ void ascopyfrom(u_long sasi, vm_offset_t src, caddr_t dst, size_t len);
void ascopyto(caddr_t src, u_long dasi, vm_offset_t dst, size_t len);
void aszero(u_long asi, vm_offset_t dst, size_t len);
/*
 * Block copy and block zero routines implemented in assembly with block
 * load/store instructions.  The transfers are done in 64-byte units.
 * NOTE(review): callers are expected to pass 64-byte aligned buffers and a
 * length that is a whole number of loop iterations -- confirm against the
 * assembly implementation.
 */
void spitfire_block_copy(void *src, void *dst, size_t len);
void spitfire_block_zero(void *dst, size_t len);
/*
* Ultrasparc II doesn't implement popc in hardware. Suck.
*/

View File

@ -527,6 +527,143 @@ ENTRY(fs_fault)
mov -1, %o0
END(fsfault)
/*
 * Start marker for the code region that is closed by fpu_fault_end; the
 * distance between the two labels is exported as fpu_fault_size.
 * NOTE(review): presumably used by the trap code to recognise faults taken
 * inside this region -- confirm against the trap handler.
 */
.globl fpu_fault_begin
fpu_fault_begin:
nop
/*
 * void spitfire_block_copy(void *src, void *dst, size_t len)
 *
 * Copy len bytes from src to dst in 64-byte units using block loads and
 * block stores through the floating point registers.
 *
 * In:       %o0 = src, %o1 = dst, %o2 = len
 * Clobbers: %o0-%o5, %f0-%f46, %asi, %fprs, integer condition codes
 *
 * Preconditions visible in the code below:
 *  - The only loop exit is the "bz" after the first store of each
 *    iteration, and the counter is decremented by 64 twice per iteration,
 *    so len must be a non-zero multiple of 128 for the loop to terminate.
 *  - src and dst are accessed only with block loads/stores and are
 *    assumed to be 64-byte aligned.
 *
 * %fprs is written to 0 before returning.
 */
ENTRY(spitfire_block_copy)
/*
 * Remember the current interrupt level in %o3 and raise it to PIL_TICK
 * while the floating point state is examined and saved; it is restored
 * at label 1 below.
 */
rdpr %pil, %o3
wrpr %g0, PIL_TICK, %pil
/* Select the block transfer ASI for all ldda/stda below and enable the FPU. */
wr %g0, ASI_BLK_S, %asi
wr %g0, FPRS_FEF, %fprs
/*
 * %o4 = PCB_REG - TF_SIZEOF, %o5 = the TF_FPRS field found there.
 * NOTE(review): presumably this is the trapframe located just below the
 * pcb and %o5 is the saved user %fprs -- confirm.
 */
sub PCB_REG, TF_SIZEOF, %o4
ldx [%o4 + TF_FPRS], %o5
/*
 * If FPRS_FEF is clear in the loaded value there is nothing to save, so
 * skip ahead.  The branch has the annul bit set, so the nop in the delay
 * slot only executes when the branch is taken.
 */
andcc %o5, FPRS_FEF, %g0
bz,a,pt %xcc, 1f
nop
/*
 * Save all four 64-byte groups of floating point registers into the
 * PCB_UFP area of the pcb with block stores, and wait for them to
 * complete.
 */
stda %f0, [PCB_REG + PCB_UFP + (0 * 64)] %asi
stda %f16, [PCB_REG + PCB_UFP + (1 * 64)] %asi
stda %f32, [PCB_REG + PCB_UFP + (2 * 64)] %asi
stda %f48, [PCB_REG + PCB_UFP + (3 * 64)] %asi
membar #Sync
/*
 * Clear FPRS_FEF in the TF_FPRS field and set PCB_FEF in the pcb flags.
 * NOTE(review): presumably this tells the rest of the kernel that the FP
 * state now lives in the pcb and must be reloaded later -- confirm.
 * %o4 is reused as a scratch register for the flags from here on.
 */
andn %o5, FPRS_FEF, %o5
stx %o5, [%o4 + TF_FPRS]
ldx [PCB_REG + PCB_FLAGS], %o4
or %o4, PCB_FEF, %o4
stx %o4, [PCB_REG + PCB_FLAGS]
/* Restore the interrupt level saved on entry. */
1: wrpr %o3, 0, %pil
/*
 * Prime the pipeline: load the first source block into %f0-%f14 and
 * account for it in the source pointer and remaining count.
 */
ldda [%o0] %asi, %f0
add %o0, 64, %o0
sub %o2, 64, %o2
/*
 * First half of the loop: start loading the next block into %f16-%f30
 * while the block already in %f0-%f14 is moved to %f32-%f46 and stored.
 * Block loads do not follow the normal memory ordering rules; moving the
 * data between registers with fsrc1 is what makes it safe to use without
 * a memory barrier.
 */
2: ldda [%o0] %asi, %f16
fsrc1 %f0, %f32
fsrc1 %f2, %f34
fsrc1 %f4, %f36
fsrc1 %f6, %f38
fsrc1 %f8, %f40
fsrc1 %f10, %f42
fsrc1 %f12, %f44
fsrc1 %f14, %f46
stda %f32, [%o1] %asi
add %o0, 64, %o0
/*
 * Account for the block just loaded into %f16.  When the count reaches
 * zero that block is the last one and is finished at label 3.  The
 * destination pointer is advanced in the delay slot, which executes
 * whether or not the branch is taken.
 */
subcc %o2, 64, %o2
bz,pn %xcc, 3f
add %o1, 64, %o1
/*
 * Second half of the loop: the mirror image of the first half, loading
 * into %f0-%f14 while the block in %f16-%f30 is moved and stored.  The
 * count is not tested here.
 */
ldda [%o0] %asi, %f0
fsrc1 %f16, %f32
fsrc1 %f18, %f34
fsrc1 %f20, %f36
fsrc1 %f22, %f38
fsrc1 %f24, %f40
fsrc1 %f26, %f42
fsrc1 %f28, %f44
fsrc1 %f30, %f46
stda %f32, [%o1] %asi
add %o0, 64, %o0
sub %o2, 64, %o2
/* Loop; the destination pointer is advanced in the delay slot. */
ba %xcc, 2b
add %o1, 64, %o1
/*
 * The final block is still in %f16-%f30 and has not been moved through
 * another register set, so a barrier is needed before it can be stored
 * directly.  The second barrier waits for the outstanding block stores.
 */
3: membar #Sync
stda %f16, [%o1] %asi
membar #Sync
/* Write 0 to %fprs, which clears FPRS_FEF again, and return. */
wr %g0, 0, %fprs
retl
nop
END(spitfire_block_copy)
/*
 * void spitfire_block_zero(void *dst, size_t len)
 *
 * Zero len bytes starting at dst using block stores of a zeroed group of
 * floating point registers.
 *
 * In:       %o0 = dst, %o1 = len
 * Clobbers: %o0, %o1, %o3-%o5, %f0-%f14, %asi, %fprs, integer condition
 *           codes
 *
 * Preconditions visible in the code below:
 *  - Each loop iteration stores 256 bytes and the loop only exits when
 *    the remaining count is exactly zero, so len must be a non-zero
 *    multiple of 256 for the loop to terminate.
 *  - dst is accessed only with block stores and is assumed to be 64-byte
 *    aligned.
 *
 * %fprs is written to 0 before returning.
 */
ENTRY(spitfire_block_zero)
/*
 * Remember the current interrupt level in %o3 and raise it to PIL_TICK
 * while the floating point state is examined and saved; it is restored
 * at the first label 1 below.
 */
rdpr %pil, %o3
wrpr %g0, PIL_TICK, %pil
/* Select the block transfer ASI for all stda below and enable the FPU. */
wr %g0, ASI_BLK_S, %asi
wr %g0, FPRS_FEF, %fprs
/*
 * %o4 = PCB_REG - TF_SIZEOF, %o5 = the TF_FPRS field found there.
 * NOTE(review): presumably this is the trapframe located just below the
 * pcb and %o5 is the saved user %fprs -- confirm.
 */
sub PCB_REG, TF_SIZEOF, %o4
ldx [%o4 + TF_FPRS], %o5
/*
 * If FPRS_FEF is clear in the loaded value there is nothing to save, so
 * skip ahead.  The branch has the annul bit set, so the nop in the delay
 * slot only executes when the branch is taken.
 */
andcc %o5, FPRS_FEF, %g0
bz,a,pt %xcc, 1f
nop
/*
 * Save all four 64-byte groups of floating point registers into the
 * PCB_UFP area of the pcb with block stores, and wait for them to
 * complete.
 */
stda %f0, [PCB_REG + PCB_UFP + (0 * 64)] %asi
stda %f16, [PCB_REG + PCB_UFP + (1 * 64)] %asi
stda %f32, [PCB_REG + PCB_UFP + (2 * 64)] %asi
stda %f48, [PCB_REG + PCB_UFP + (3 * 64)] %asi
membar #Sync
/*
 * Clear FPRS_FEF in the TF_FPRS field and set PCB_FEF in the pcb flags.
 * NOTE(review): presumably this tells the rest of the kernel that the FP
 * state now lives in the pcb and must be reloaded later -- confirm.
 * %o4 is reused as a scratch register for the flags from here on.
 */
andn %o5, FPRS_FEF, %o5
stx %o5, [%o4 + TF_FPRS]
ldx [PCB_REG + PCB_FLAGS], %o4
or %o4, PCB_FEF, %o4
stx %o4, [PCB_REG + PCB_FLAGS]
/* Restore the interrupt level saved on entry. */
1: wrpr %o3, 0, %pil
/* Build one 64-byte block of zeroes in %f0-%f14. */
fzero %f0
fzero %f2
fzero %f4
fzero %f6
fzero %f8
fzero %f10
fzero %f12
fzero %f14
/*
 * Main loop: store the zeroed block four times (256 bytes) per iteration.
 * Numeric label 1 is defined a second time here; the "1b" below refers
 * to this definition, the "1f" above to the earlier one.
 */
1: stda %f0, [%o0] %asi
stda %f0, [%o0 + 64] %asi
stda %f0, [%o0 + 128] %asi
stda %f0, [%o0 + 192] %asi
sub %o1, 256, %o1
/*
 * Loop while the remaining count is non-zero; the destination pointer is
 * advanced in the delay slot, which executes on every pass.
 */
brnz %o1, 1b
add %o0, 256, %o0
/* Wait for the outstanding block stores to complete. */
membar #Sync
/* Write 0 to %fprs, which clears FPRS_FEF again, and return. */
wr %g0, 0, %fprs
retl
nop
END(spitfire_block_zero)
/*
 * End marker for the code region opened by fpu_fault_begin, followed by
 * fpu_fault_size, an absolute symbol holding the size of that region in
 * bytes (fpu_fault_end - fpu_fault_begin).
 * NOTE(review): presumably consumed by the trap code to test whether a
 * faulting PC lies inside the region -- confirm against the trap handler.
 */
.globl fpu_fault_end
fpu_fault_end:
nop
.globl fpu_fault_size
.set fpu_fault_size, fpu_fault_end - fpu_fault_begin
ENTRY(longjmp)
set 1, %g3
movrz %o1, %o1, %g3