freebsd-nq/contrib/gcc/config/ia64/lib1funcs.asm
2004-07-28 03:11:36 +00:00

744 lines
14 KiB
NASM

#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
.text
.align 16
.global __divxf3
.global __divtf3
.proc __divxf3
__divxf3:
__divtf3:
cmp.eq p7, p0 = r0, r0
frcpa.s0 f10, p6 = farg0, farg1
;;
(p6) cmp.ne p7, p0 = r0, r0
.pred.rel.mutex p6, p7
(p6) fnma.s1 f11 = farg1, f10, f1
(p6) fma.s1 f12 = farg0, f10, f0
;;
(p6) fma.s1 f13 = f11, f11, f0
(p6) fma.s1 f14 = f11, f11, f11
;;
(p6) fma.s1 f11 = f13, f13, f11
(p6) fma.s1 f13 = f14, f10, f10
;;
(p6) fma.s1 f10 = f13, f11, f10
(p6) fnma.s1 f11 = farg1, f12, farg0
;;
(p6) fma.s1 f11 = f11, f10, f12
(p6) fnma.s1 f12 = farg1, f10, f1
;;
(p6) fma.s1 f10 = f12, f10, f10
(p6) fnma.s1 f12 = farg1, f11, farg0
;;
(p6) fma.s0 fret0 = f12, f10, f11
(p7) mov fret0 = f10
br.ret.sptk rp
.endp __divxf3
#endif
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.
.text
.align 16
.global __divdf3
.proc __divdf3
__divdf3:
cmp.eq p7, p0 = r0, r0
frcpa.s0 f10, p6 = farg0, farg1
;;
(p6) cmp.ne p7, p0 = r0, r0
.pred.rel.mutex p6, p7
(p6) fmpy.s1 f11 = farg0, f10
(p6) fnma.s1 f12 = farg1, f10, f1
;;
(p6) fma.s1 f11 = f12, f11, f11
(p6) fmpy.s1 f13 = f12, f12
;;
(p6) fma.s1 f10 = f12, f10, f10
(p6) fma.s1 f11 = f13, f11, f11
;;
(p6) fmpy.s1 f12 = f13, f13
(p6) fma.s1 f10 = f13, f10, f10
;;
(p6) fma.d.s1 f11 = f12, f11, f11
(p6) fma.s1 f10 = f12, f10, f10
;;
(p6) fnma.d.s1 f8 = farg1, f11, farg0
;;
(p6) fma.d fret0 = f8, f10, f11
(p7) mov fret0 = f10
br.ret.sptk rp
;;
.endp __divdf3
#endif
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.
.text
.align 16
.global __divsf3
.proc __divsf3
__divsf3:
cmp.eq p7, p0 = r0, r0
frcpa.s0 f10, p6 = farg0, farg1
;;
(p6) cmp.ne p7, p0 = r0, r0
.pred.rel.mutex p6, p7
(p6) fmpy.s1 f8 = farg0, f10
(p6) fnma.s1 f9 = farg1, f10, f1
;;
(p6) fma.s1 f8 = f9, f8, f8
(p6) fmpy.s1 f9 = f9, f9
;;
(p6) fma.s1 f8 = f9, f8, f8
(p6) fmpy.s1 f9 = f9, f9
;;
(p6) fma.d.s1 f10 = f9, f8, f8
;;
(p6) fnorm.s.s0 fret0 = f10
(p7) mov fret0 = f10
br.ret.sptk rp
;;
.endp __divsf3
#endif
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __divdi3
.proc __divdi3
__divdi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
setf.sig f8 = in0
setf.sig f9 = in1
;;
// Convert the inputs to FP, so that they won't be treated as unsigned.
fcvt.xf f8 = f8
fcvt.xf f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
(p6) fnma.s1 f11 = f9, f10, f1
(p6) fmpy.s1 f12 = f8, f10
;;
(p6) fmpy.s1 f13 = f11, f11
(p6) fma.s1 f12 = f11, f12, f12
;;
(p6) fma.s1 f10 = f11, f10, f10
(p6) fma.s1 f11 = f13, f12, f12
;;
(p6) fma.s1 f10 = f13, f10, f10
(p6) fnma.s1 f12 = f9, f11, f8
;;
(p6) fma.s1 f10 = f12, f10, f11
;;
// Round quotient to an integer.
fcvt.fx.trunc.s1 f10 = f10
;;
// Transfer result to GP registers.
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __divdi3
#endif
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a). in1 holds the divisor (b).
.text
.align 16
.global __moddi3
.proc __moddi3
__moddi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
setf.sig f14 = in0
setf.sig f9 = in1
;;
// Convert the inputs to FP, so that they won't be treated as unsigned.
fcvt.xf f8 = f14
fcvt.xf f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
(p6) fmpy.s1 f12 = f8, f10
(p6) fnma.s1 f11 = f9, f10, f1
;;
(p6) fma.s1 f12 = f11, f12, f12
(p6) fmpy.s1 f13 = f11, f11
;;
(p6) fma.s1 f10 = f11, f10, f10
(p6) fma.s1 f11 = f13, f12, f12
;;
sub in1 = r0, in1
(p6) fma.s1 f10 = f13, f10, f10
(p6) fnma.s1 f12 = f9, f11, f8
;;
setf.sig f9 = in1
(p6) fma.s1 f10 = f12, f10, f11
;;
fcvt.fx.trunc.s1 f10 = f10
;;
// r = q * (-b) + a
xma.l f10 = f10, f9, f14
;;
// Transfer result to GP registers.
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __moddi3
#endif
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __udivdi3
.proc __udivdi3
__udivdi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
setf.sig f8 = in0
setf.sig f9 = in1
;;
// Convert the inputs to FP, to avoid FP software-assist faults.
fcvt.xuf.s1 f8 = f8
fcvt.xuf.s1 f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
(p6) fnma.s1 f11 = f9, f10, f1
(p6) fmpy.s1 f12 = f8, f10
;;
(p6) fmpy.s1 f13 = f11, f11
(p6) fma.s1 f12 = f11, f12, f12
;;
(p6) fma.s1 f10 = f11, f10, f10
(p6) fma.s1 f11 = f13, f12, f12
;;
(p6) fma.s1 f10 = f13, f10, f10
(p6) fnma.s1 f12 = f9, f11, f8
;;
(p6) fma.s1 f10 = f12, f10, f11
;;
// Round quotient to an unsigned integer.
fcvt.fxu.trunc.s1 f10 = f10
;;
// Transfer result to GP registers.
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __udivdi3
#endif
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a). in1 holds the divisor (b).
.text
.align 16
.global __umoddi3
.proc __umoddi3
__umoddi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
setf.sig f14 = in0
setf.sig f9 = in1
;;
// Convert the inputs to FP, to avoid FP software assist faults.
fcvt.xuf.s1 f8 = f14
fcvt.xuf.s1 f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
(p6) fmpy.s1 f12 = f8, f10
(p6) fnma.s1 f11 = f9, f10, f1
;;
(p6) fma.s1 f12 = f11, f12, f12
(p6) fmpy.s1 f13 = f11, f11
;;
(p6) fma.s1 f10 = f11, f10, f10
(p6) fma.s1 f11 = f13, f12, f12
;;
sub in1 = r0, in1
(p6) fma.s1 f10 = f13, f10, f10
(p6) fnma.s1 f12 = f9, f11, f8
;;
setf.sig f9 = in1
(p6) fma.s1 f10 = f12, f10, f11
;;
// Round quotient to an unsigned integer.
fcvt.fxu.trunc.s1 f10 = f10
;;
// r = q * (-b) + a
xma.l f10 = f10, f9, f14
;;
// Transfer result to GP registers.
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __umoddi3
#endif
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __divsi3
.proc __divsi3
__divsi3:
.regstk 2,0,0,0
sxt4 in0 = in0
sxt4 in1 = in1
;;
setf.sig f8 = in0
setf.sig f9 = in1
;;
mov r2 = 0x0ffdd
fcvt.xf f8 = f8
fcvt.xf f9 = f9
;;
setf.exp f11 = r2
frcpa.s1 f10, p6 = f8, f9
;;
(p6) fmpy.s1 f8 = f8, f10
(p6) fnma.s1 f9 = f9, f10, f1
;;
(p6) fma.s1 f8 = f9, f8, f8
(p6) fma.s1 f9 = f9, f9, f11
;;
(p6) fma.s1 f10 = f9, f8, f8
;;
fcvt.fx.trunc.s1 f10 = f10
;;
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __divsi3
#endif
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __modsi3
.proc __modsi3
__modsi3:
.regstk 2,0,0,0
mov r2 = 0x0ffdd
sxt4 in0 = in0
sxt4 in1 = in1
;;
setf.sig f13 = r32
setf.sig f9 = r33
;;
sub in1 = r0, in1
fcvt.xf f8 = f13
fcvt.xf f9 = f9
;;
setf.exp f11 = r2
frcpa.s1 f10, p6 = f8, f9
;;
(p6) fmpy.s1 f12 = f8, f10
(p6) fnma.s1 f10 = f9, f10, f1
;;
setf.sig f9 = in1
(p6) fma.s1 f12 = f10, f12, f12
(p6) fma.s1 f10 = f10, f10, f11
;;
(p6) fma.s1 f10 = f10, f12, f12
;;
fcvt.fx.trunc.s1 f10 = f10
;;
xma.l f10 = f10, f9, f13
;;
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __modsi3
#endif
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __udivsi3
.proc __udivsi3
__udivsi3:
.regstk 2,0,0,0
mov r2 = 0x0ffdd
zxt4 in0 = in0
zxt4 in1 = in1
;;
setf.sig f8 = in0
setf.sig f9 = in1
;;
fcvt.xf f8 = f8
fcvt.xf f9 = f9
;;
setf.exp f11 = r2
frcpa.s1 f10, p6 = f8, f9
;;
(p6) fmpy.s1 f8 = f8, f10
(p6) fnma.s1 f9 = f9, f10, f1
;;
(p6) fma.s1 f8 = f9, f8, f8
(p6) fma.s1 f9 = f9, f9, f11
;;
(p6) fma.s1 f10 = f9, f8, f8
;;
fcvt.fxu.trunc.s1 f10 = f10
;;
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __udivsi3
#endif
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.global __umodsi3
.proc __umodsi3
__umodsi3:
.regstk 2,0,0,0
mov r2 = 0x0ffdd
zxt4 in0 = in0
zxt4 in1 = in1
;;
setf.sig f13 = in0
setf.sig f9 = in1
;;
sub in1 = r0, in1
fcvt.xf f8 = f13
fcvt.xf f9 = f9
;;
setf.exp f11 = r2
frcpa.s1 f10, p6 = f8, f9
;;
(p6) fmpy.s1 f12 = f8, f10
(p6) fnma.s1 f10 = f9, f10, f1
;;
setf.sig f9 = in1
(p6) fma.s1 f12 = f10, f12, f12
(p6) fma.s1 f10 = f10, f10, f11
;;
(p6) fma.s1 f10 = f10, f12, f12
;;
fcvt.fxu.trunc.s1 f10 = f10
;;
xma.l f10 = f10, f9, f13
;;
getf.sig ret0 = f10
br.ret.sptk rp
;;
.endp __umodsi3
#endif
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore. This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore. This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".
// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
.text
.align 16
.global __ia64_save_stack_nonlocal
.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
{ .mmf
alloc r18 = ar.pfs, 2, 0, 0, 0
mov r19 = ar.rsc
;;
}
{ .mmi
flushrs
st8 [in0] = in1, 24
and r19 = 0x1c, r19
;;
}
{ .mmi
st8 [in0] = r18, -16
mov ar.rsc = r19
or r19 = 0x3, r19
;;
}
{ .mmi
mov r16 = ar.bsp
mov r17 = ar.rnat
adds r2 = 8, in0
;;
}
{ .mmi
st8 [in0] = r16
st8 [r2] = r17
}
{ .mib
mov ar.rsc = r19
br.ret.sptk.few rp
;;
}
.endp __ia64_save_stack_nonlocal
#endif
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
// void *static_chain);
.text
.align 16
.global __ia64_nonlocal_goto
.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
{ .mmi
alloc r20 = ar.pfs, 3, 0, 0, 0
ld8 r12 = [in1], 8
mov.ret.sptk rp = in0, .L0
;;
}
{ .mmf
ld8 r16 = [in1], 8
mov r19 = ar.rsc
;;
}
{ .mmi
flushrs
ld8 r17 = [in1], 8
and r19 = 0x1c, r19
;;
}
{ .mmi
ld8 r18 = [in1]
mov ar.rsc = r19
or r19 = 0x3, r19
;;
}
{ .mmi
mov ar.bspstore = r16
;;
mov ar.rnat = r17
;;
}
{ .mmi
loadrs
invala
mov r15 = in2
;;
}
.L0: { .mib
mov ar.rsc = r19
mov ar.pfs = r18
br.ret.sptk.few rp
;;
}
.endp __ia64_nonlocal_goto
#endif
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.
// void __ia64_restore_stack_nonlocal(void *save_area)
.text
.align 16
.global __ia64_restore_stack_nonlocal
.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
{ .mmf
alloc r20 = ar.pfs, 4, 0, 0, 0
ld8 r12 = [in0], 8
;;
}
{ .mmb
ld8 r16=[in0], 8
mov r19 = ar.rsc
;;
}
{ .mmi
flushrs
ld8 r17 = [in0], 8
and r19 = 0x1c, r19
;;
}
{ .mmf
ld8 r18 = [in0]
mov ar.rsc = r19
;;
}
{ .mmi
mov ar.bspstore = r16
;;
mov ar.rnat = r17
or r19 = 0x3, r19
;;
}
{ .mmf
loadrs
invala
;;
}
.L0: { .mib
mov ar.rsc = r19
mov ar.pfs = r18
br.ret.sptk.few rp
;;
}
.endp __ia64_restore_stack_nonlocal
#endif
#ifdef L__trampoline
// Implement the nested function trampoline. This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
// +-------------------+ >
// TRAMP: | __ia64_trampoline | |
// +-------------------+ > fake function descriptor
// | TRAMP+16 | |
// +-------------------+ >
// | target descriptor |
// +-------------------+
// | static link |
// +-------------------+
.text
.align 16
.global __ia64_trampoline
.proc __ia64_trampoline
__ia64_trampoline:
{ .mmi
ld8 r2 = [r1], 8
;;
ld8 r15 = [r1]
}
{ .mmi
ld8 r3 = [r2], 8
;;
ld8 r1 = [r2]
mov b6 = r3
}
{ .bbb
br.sptk.many b6
;;
}
.endp __ia64_trampoline
#endif
#ifdef L__compat
// Thunks for backward compatibility.
.text
.align 16
.global __fixtfti
.proc __fixtfti
__fixtfti:
{ .bbb
br.sptk.many __fixxfti
;;
}
.endp __fixtfti
.align 16
.global __fixunstfti
.proc __fixunstfti
__fixunstfti:
{ .bbb
br.sptk.many __fixunsxfti
;;
}
.endp __fixunstfti
.align 16
.global __floattitf
.proc __floattitf
__floattitf:
{ .bbb
br.sptk.many __floattixf
;;
}
.endp __floattitf
#endif