#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
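//
// The sequence below seeds a reciprocal of the divisor with frcpa,
// refines it with fused multiply-adds, and then corrects the quotient
// once from the exact remainder.  The commented C sketch is an
// illustration of that scheme only, not a bit-exact transcription of the
// FMA schedule; approx_recip() is a hypothetical stand-in for the frcpa
// seed.
//
/*
        extern long double approx_recip (long double);

        long double
        soft_div (long double a, long double b)
        {
          long double y = approx_recip (b);     // reciprocal seed (~8 bits)
          for (int i = 0; i < 3; i++)
            {
              long double e = 1.0L - b * y;     // current reciprocal error
              y = y + e * y;                    // Newton step: y *= (2 - b*y)
            }
          long double q = a * y;                // quotient estimate
          long double r = a - b * q;            // remainder (exact with fnma in the real code)
          return q + r * y;                     // final correction
        }
*/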

        .text
        .align 16
        .global __divxf3
        .global __divtf3
        .proc __divxf3
__divxf3:
__divtf3:
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
        ;;
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
        ;;
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
        ;;
(p6)    fma.s1 f10 = f13, f11, f10
(p6)    fnma.s1 f11 = farg1, f12, farg0
        ;;
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
        ;;
(p6)    fma.s0 fret0 = f12, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        .endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.

        .text
        .align 16
        .global __divdf3
        .proc __divdf3
__divdf3:
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
        ;;
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
(p6)    fma.d fret0 = f8, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend. farg1 holds the divisor.

        .text
        .align 16
        .global __divsf3
        .proc __divsf3
__divsf3:
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.d.s1 f10 = f9, f8, f8
        ;;
(p6)    fnorm.s.s0 fret0 = f10
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
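//
// The division is done in the FP unit: the operands are moved to FP
// registers, converted to extended precision, divided by a
// Newton-Raphson refined reciprocal, and the quotient is truncated back
// to an integer.  The commented C sketch shows the equivalent flow only
// (the real code uses the frcpa seed and FMAs rather than a true FP
// divide):
//
/*
        long long
        soft_divdi (long long a, long long b)
        {
          long double fa = (long double) a;     // setf.sig + fcvt.xf
          long double fb = (long double) b;
          long double q = fa / fb;              // frcpa + 3 Newton-Raphson steps
          return (long long) q;                 // fcvt.fx.trunc + getf.sig
        }
*/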

        .text
        .align 16
        .global __divdi3
        .proc __divdi3
__divdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a). in1 holds the divisor (b).
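//
// The remainder reuses the quotient computation of __divdi3: while the FP
// pipeline works, the divisor is negated in a GP register, and a single
// xma then forms r = q * (-b) + a, i.e. a - q * b.  Equivalent C, for
// illustration only:
//
/*
        long long
        soft_moddi (long long a, long long b)
        {
          long long q = (long long) ((long double) a / (long double) b);
          return a - q * b;             // computed below as q * (-b) + a via xma.l
        }
*/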

        .text
        .align 16
        .global __moddi3
        .proc __moddi3
__moddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f14
        fcvt.xf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.

        .text
        .align 16
        .global __udivdi3
        .proc __udivdi3
__udivdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, to avoid FP software-assist faults.
        fcvt.xuf.s1 f8 = f8
        fcvt.xuf.s1 f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a). in1 holds the divisor (b).

        .text
        .align 16
        .global __umoddi3
        .proc __umoddi3
__umoddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, to avoid FP software assist faults.
        fcvt.xuf.s1 f8 = f14
        fcvt.xuf.s1 f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.
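//
// Two refinement steps suffice for 32-bit operands.  The constant built
// by "mov r2 = 0x0ffdd" / "setf.exp f11 = r2" below appears to be 2^-34
// (0x0ffdd is the biased 17-bit exponent, bias 0xffff); it is folded into
// the last error term so that the truncation in fcvt.fx.trunc still
// produces the exact quotient.  Reading the predicated FMAs with y0 for
// the frcpa seed:
//
//	q0 = a * y0		e0 = 1 - b * y0
//	q1 = q0 + e0 * q0	e1 = e0 * e0 + 2^-34
//	q  = q1 + e1 * q1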

        .text
        .align 16
        .global __divsi3
        .proc __divsi3
__divsi3:
        .regstk 2,0,0,0
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        mov r2 = 0x0ffdd
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.

        .text
        .align 16
        .global __modsi3
        .proc __modsi3
__modsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        setf.sig f13 = r32
        setf.sig f9 = r33
        ;;
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.

        .text
        .align 16
        .global __udivsi3
        .proc __udivsi3
__udivsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend. in1 holds the divisor.

        .text
        .align 16
        .global __umodsi3
        .proc __umodsi3
__umodsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        setf.sig f13 = in0
        setf.sig f9 = in1
        ;;
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
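//
// The stores below lay the save area out as four 8-byte slots: the
// incoming stack pointer at offset 0, ar.bsp at offset 8, ar.rnat at
// offset 16, and ar.pfs at offset 24 (__ia64_nonlocal_goto reads the same
// slots back).  As a C view, with a purely illustrative struct name:
//
/*
        struct ia64_nonlocal_save_area
        {
          void *sp;                     // offset  0: stack_pointer argument
          void *bsp;                    // offset  8: ar.bsp after flushrs
          unsigned long rnat;           // offset 16: ar.rnat
          unsigned long pfs;            // offset 24: ar.pfs read by alloc
        };
*/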

        .text
        .align 16
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        { .mmf
          alloc r18 = ar.pfs, 2, 0, 0, 0
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          st8 [in0] = in1, 24
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          st8 [in0] = r18, -16
          mov ar.rsc = r19
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov r16 = ar.bsp
          mov r17 = ar.rnat
          adds r2 = 8, in0
          ;;
        }
        { .mmi
          st8 [in0] = r16
          st8 [r2] = r17
        }
        { .mib
          mov ar.rsc = r19
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//                           void *static_chain);
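//
// The sequence below returns to the frame captured by
// __ia64_save_stack_nonlocal: it reloads sp, ar.bspstore, ar.rnat and
// ar.pfs from the save area, uses loadrs and invala to discard stale
// stacked registers and ALAT entries, places the static chain in r15,
// and then branches through rp, which has been redirected to
// target_label.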

        .text
        .align 16
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        { .mmi
          alloc r20 = ar.pfs, 3, 0, 0, 0
          ld8 r12 = [in1], 8
          mov.ret.sptk rp = in0, .L0
          ;;
        }
        { .mmf
          ld8 r16 = [in1], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          ld8 r17 = [in1], 8
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          ld8 r18 = [in1]
          mov ar.rsc = r19
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          ;;
        }
        { .mmi
          loadrs
          invala
          mov r15 = in2
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

        .text
        .align 16
        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        { .mmf
          alloc r20 = ar.pfs, 4, 0, 0, 0
          ld8 r12 = [in0], 8
          ;;
        }
        { .mmb
          ld8 r16 = [in0], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          ld8 r17 = [in0], 8
          and r19 = 0x1c, r19
          ;;
        }
        { .mmf
          ld8 r18 = [in0]
          mov ar.rsc = r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          or r19 = 0x3, r19
          ;;
        }
        { .mmf
          loadrs
          invala
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, and so
// that the on-stack trampoline is smaller.
//
// The trampoline has the following form:
//
//          +-------------------+ >
//   TRAMP: | __ia64_trampoline | |
//          +-------------------+  > fake function descriptor
//          |     TRAMP+16      | |
//          +-------------------+ >
//          | target descriptor |
//          +-------------------+
//          |    static link    |
//          +-------------------+
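//
// A call through the fake descriptor arrives here with gp (r1) set to
// TRAMP+16 by the indirect-call sequence.  The stub therefore loads the
// target's real function descriptor pointer from [r1] and the static
// link from [r1+8], picks up the target's entry point and gp from that
// descriptor, moves the static link into r15, and branches to the
// target.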

        .text
        .align 16
        .global __ia64_trampoline
        .proc __ia64_trampoline
__ia64_trampoline:
        { .mmi
          ld8 r2 = [r1], 8
          ;;
          ld8 r15 = [r1]
        }
        { .mmi
          ld8 r3 = [r2], 8
          ;;
          ld8 r1 = [r2]
          mov b6 = r3
        }
        { .bbb
          br.sptk.many b6
          ;;
        }
        .endp __ia64_trampoline
#endif

#ifdef L__compat
// Thunks for backward compatibility.
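//
// Each thunk forwards an old TFmode entry point (__fixtfti, __fixunstfti,
// __floattitf) to the corresponding XFmode implementation, the same
// backward-compatibility aliasing used for __divtf3 above.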

        .text
        .align 16
        .global __fixtfti
        .proc __fixtfti
__fixtfti:
        { .bbb
          br.sptk.many __fixxfti
          ;;
        }
        .endp __fixtfti

        .align 16
        .global __fixunstfti
        .proc __fixunstfti
__fixunstfti:
        { .bbb
          br.sptk.many __fixunsxfti
          ;;
        }
        .endp __fixunstfti

        .align 16
        .global __floattitf
        .proc __floattitf
__floattitf:
        { .bbb
          br.sptk.many __floattixf
          ;;
        }
        .endp __floattitf
#endif