Moved all the optimizations for |x| <= 9pi/4 from

__ieee754_rem_pio2f() to its 3 callers and manually inlined them. On Athlons, with favourable compiler flags and optimizations and favourable pipeline conditions, this gives a speedup of 30-40 cycles for cosf(), sinf() and tanf() on the range pi/4 < |x| <= 9pi/4, so these functions are now significantly faster than the hardware trig functions in many cases. E.g., in a benchmark with uniformly distributed x in [-2pi, 2pi], A64 hardware fcos took 72-129 cycles and cosf() took 37-55 cycles. Out-of-order execution is needed to get both of these times. The optimizations in this commit apparently work more by removing 1 serialization point than by reducing latency.
2005-11-19 02:38:27 +00:00 · 2005-11-19 02:38:27 +00:00 · 558fb238b1
commit 558fb238b1
parent 666e602c46
4 changed files with 105 additions and 67 deletions
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -47,10 +47,6 @@ static const int32_t two_over_pi[] = {

 /*
 * invpio2:  53 bits of 2/pi
- * e1pio2:   1*pi/2 rounded to 53 bits
- * e2pio2:   2*pi/2 rounded to 53 bits
- * e3pio2:   3*pi/2 rounded to 53 bits
- * e4pio2:   4*pi/2 rounded to 53 bits
 * pio2_1:   first  33 bit of pi/2
 * pio2_1t:  pi/2 - pio2_1
 */
@@ -60,10 +56,6 @@ zero =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
 half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
 two24 =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
 invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
-e1pio2  =  1*M_PI_2,                   /* 0x3FF921FB, 0x54442D18 */
-e2pio2  =  2*M_PI_2,                   /* 0x400921FB, 0x54442D18 */
-e3pio2  =  3*M_PI_2,                   /* 0x4012D97C, 0x7F3321D2 */
-e4pio2  =  4*M_PI_2,                   /* 0x401921FB, 0x54442D18 */
 pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t =  6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */

@@ -75,53 +67,6 @@ pio2_1t =  6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */

 	GET_FLOAT_WORD(hx,x);
 	ix = hx&0x7fffffff;
-	if(ix<=0x3f490fda)   		/* |x| ~<= pi/4, reduction is null */
-	    {y[0] = x; y[1] = 0; return 0;}
-    /* 53 bit pi is good enough for special cases */
-	if(ix<=0x407b53d1) {		/* |x| ~<= 5*pi/4 */
-	    if(ix<=0x4016cbe3) {	/* |x| ~<= 3*pi/4 */
-		if(hx>0) { 
-		    z = x - e1pio2;
-		    n = 1;
-		} else {
-		    z = x + e1pio2;
-		    n = 3;
-		}
-		y[0] = z;
-		y[1] = z - y[0];
-		return n;
-	    } else {
-		if(hx>0)
-		    z = x - e2pio2;
-		else
-		    z = x + e2pio2;
-		y[0] = z;
-		y[1] = z - y[0];
-		return 2;
-	    }
-	}
-	if(ix<=0x40e231d5) {		/* |x| ~<= 9*pi/4*/
-	    if(ix<=0x40afeddf) {	/* |x| ~<= 7*pi/4 */
-		if(hx>0) { 
-		    z = x - e3pio2;
-		    n = 3;
-		} else {
-		    z = x + e3pio2;
-		    n = 1;
-		}
-		y[0] = z;
-		y[1] = z - y[0];
-		return n;
-	    } else {
-		if(hx>0)
-		    z = x - e4pio2;
-		else
-		    z = x + e4pio2;
-		y[0] = z;
-		y[1] = z - y[0];
-		return 0;
-	    }
-	}
    /* 33+53 bit pi is good enough for medium size */
 	if(ix<=0x49490f80) {		/* |x| ~<= 2^19*(pi/2), medium size */
 	    t  = fabsf(x);
--- a/lib/msun/src/s_cosf.c
+++ b/lib/msun/src/s_cosf.c
@@ -20,25 +20,56 @@ static char rcsid[] = "$FreeBSD$";
 #include "math.h"
 #include "math_private.h"

+/* Small multiples of pi/2 rounded to double precision. */
+static const double
+c1pio2 = 1*M_PI_2,			/* 0x3FF921FB, 0x54442D18 */
+c2pio2 = 2*M_PI_2,			/* 0x400921FB, 0x54442D18 */
+c3pio2 = 3*M_PI_2,			/* 0x4012D97C, 0x7F3321D2 */
+c4pio2 = 4*M_PI_2;			/* 0x401921FB, 0x54442D18 */
+
+static inline float
+__kernel_cosdf(double x)
+{
+	return __kernel_cosf((float)x, x - (float)x);
+}
+
+static inline float
+__kernel_sindf(double x)
+{
+	return __kernel_sinf((float)x, x - (float)x, 1);
+}
+
 float
 cosf(float x)
 {
 	float y[2];
 	int32_t n,ix;

+	x = fabsf(x);
 	GET_FLOAT_WORD(ix,x);
-	ix &= 0x7fffffff;

 	if(ix <= 0x3f490fda) {		/* |x| ~<= pi/4 */
 	    if(ix<0x39800000)		/* |x| < 2**-12 */
 		if(((int)x)==0) return 1.0;	/* 1 with inexact if x != 0 */
 	    return __kernel_cosf(x,0.0);
 	}
+	if(ix<=0x407b53d1) {		/* |x| <= ~5*pi/4 */
+	    if(ix<=0x4016cbe3)		/* |x| <= ~3pi/4 */
+		return -__kernel_sindf(x - c1pio2);
+	    else
+		return -__kernel_cosdf(x - c2pio2);
+	}
+	if(ix<=0x40e231d5) {		/* |x| <= ~9*pi/4 */
+	    if(ix<=0x40afeddf)		/* |x| <= ~7*pi/4 */
+		return __kernel_sindf(x - c3pio2);
+	    else
+		return __kernel_cosdf(x - c4pio2);
+	}

    /* cos(Inf or NaN) is NaN */
 	else if (ix>=0x7f800000) return x-x;

-    /* argument reduction needed */
+    /* general argument reduction needed */
 	else {
 	    n = __ieee754_rem_pio2f(x,y);
 	    switch(n&3) {
--- a/lib/msun/src/s_sinf.c
+++ b/lib/msun/src/s_sinf.c
@@ -20,25 +20,62 @@ static char rcsid[] = "$FreeBSD$";
 #include "math.h"
 #include "math_private.h"

+/* Small multiples of pi/2 rounded to double precision. */
+static const double
+s1pio2 = 1*M_PI_2,			/* 0x3FF921FB, 0x54442D18 */
+s2pio2 = 2*M_PI_2,			/* 0x400921FB, 0x54442D18 */
+s3pio2 = 3*M_PI_2,			/* 0x4012D97C, 0x7F3321D2 */
+s4pio2 = 4*M_PI_2;			/* 0x401921FB, 0x54442D18 */
+
+static inline float
+__kernel_cosdf(double x)
+{
+	return __kernel_cosf((float)x, x - (float)x);
+}
+
+static inline float
+__kernel_sindf(double x)
+{
+	return __kernel_sinf((float)x, x - (float)x, 1);
+}
+
 float
 sinf(float x)
 {
 	float y[2];
-	int32_t n, ix;
+	int32_t n, hx, ix;

-	GET_FLOAT_WORD(ix,x);
-	ix &= 0x7fffffff;
+	GET_FLOAT_WORD(hx,x);
+	ix = hx & 0x7fffffff;

 	if(ix <= 0x3f490fda) {		/* |x| ~<= pi/4 */
 	    if(ix<0x39800000)		/* |x| < 2**-12 */
 		if(((int)x)==0) return x;	/* x with inexact if x != 0 */
 	    return __kernel_sinf(x,0.0,0);
 	}
+	if(ix<=0x407b53d1) {		/* |x| <= ~5*pi/4 */
+	    if(ix<=0x4016cbe3) {	/* |x| <= ~3pi/4 */
+		if(hx>0)
+		    return __kernel_cosdf(x - s1pio2);
+		else
+		    return -__kernel_cosdf(x + s1pio2);
+	    } else
+		return -__kernel_sindf(x + (hx > 0 ? -s2pio2 : s2pio2));
+	}
+	if(ix<=0x40e231d5) {		/* |x| <= ~9*pi/4 */
+	    if(ix<=0x40afeddf) {	/* |x| <= ~7*pi/4 */
+		if(hx>0)
+		    return -__kernel_cosdf(x - s3pio2);
+		else
+		    return __kernel_cosdf(x + s3pio2);
+	    } else
+		return __kernel_sindf(x + (hx > 0 ? -s4pio2 : s4pio2));
+	}

    /* sin(Inf or NaN) is NaN */
 	else if (ix>=0x7f800000) return x-x;

-    /* argument reduction needed */
+    /* general argument reduction needed */
 	else {
 	    n = __ieee754_rem_pio2f(x,y);
 	    switch(n&3) {
--- a/lib/msun/src/s_tanf.c
+++ b/lib/msun/src/s_tanf.c
@@ -20,25 +20,50 @@ static char rcsid[] = "$FreeBSD$";
 #include "math.h"
 #include "math_private.h"

+/* Small multiples of pi/2 rounded to double precision. */
+static const double
+t1pio2 = 1*M_PI_2,			/* 0x3FF921FB, 0x54442D18 */
+t2pio2 = 2*M_PI_2,			/* 0x400921FB, 0x54442D18 */
+t3pio2 = 3*M_PI_2,			/* 0x4012D97C, 0x7F3321D2 */
+t4pio2 = 4*M_PI_2;			/* 0x401921FB, 0x54442D18 */
+
+static inline float
+__kernel_tandf(double x, int iy)
+{
+	return __kernel_tanf((float)x, x - (float)x, iy);
+}
+
 float
 tanf(float x)
 {
-	float y[2];
-	int32_t n, ix;
+	float y[2],z=0.0;
+	int32_t n, hx, ix;

-	GET_FLOAT_WORD(ix,x);
-	ix &= 0x7fffffff;
+	GET_FLOAT_WORD(hx,x);
+	ix = hx & 0x7fffffff;

 	if(ix <= 0x3f490fda) {		/* |x| ~<= pi/4 */
 	    if(ix<0x39800000)		/* |x| < 2**-12 */
 		if(((int)x)==0) return x;	/* x with inexact if x != 0 */
-	    return __kernel_tanf(x,0.0,1);
+	    return __kernel_tanf(x,z,1);
+	}
+	if(ix<=0x407b53d1) {		/* |x| ~<= 5*pi/4 */
+	    if(ix<=0x4016cbe3)		/* |x| ~<= 3pi/4 */
+		return __kernel_tandf(x + (hx>0 ? -t1pio2 : t1pio2), -1);
+	    else
+		return __kernel_tandf(x + (hx>0 ? -t2pio2 : t2pio2), 1);
+	}
+	if(ix<=0x40e231d5) {		/* |x| ~<= 9*pi/4 */
+	    if(ix<=0x40afeddf)		/* |x| ~<= 7*pi/4 */
+		return __kernel_tandf(x + (hx>0 ? -t3pio2 : t3pio2), -1);
+	    else
+		return __kernel_tandf(x + (hx>0 ? -t4pio2 : t4pio2), 1);
 	}

    /* tan(Inf or NaN) is NaN */
 	else if (ix>=0x7f800000) return x-x;

-    /* argument reduction needed */
+    /* general argument reduction needed */
 	else {
 	    n = __ieee754_rem_pio2f(x,y);
 	    /* integer parameter: 1 -- n even; -1 -- n odd */