Moved all the optimizations for |x| <= 9pi/4 from

__ieee754_rem_pio2f() to its 3 callers and manually inlined them.

On Athlons, with favourable compiler flags and optimizations and
favourable pipeline conditions, this gives a speedup of 30-40 cycles
for cosf(), sinf() and tanf() on the range pi/4 < |x| <= 9pi/4, so
these functions are now significantly faster than the hardware trig
functions in many cases.  E.g., in a benchmark with uniformly distributed
x in [-2pi, 2pi], A64 hardware fcos took 72-129 cycles and cosf() took
37-55 cycles.  Out-of-order execution is needed to get both of these
times.  The optimizations in this commit apparently work more by
removing 1 serialization point than by reducing latency.
This commit is contained in:
bde 2005-11-19 02:38:27 +00:00
parent 666e602c46
commit 558fb238b1
4 changed files with 105 additions and 67 deletions

View File

@ -47,10 +47,6 @@ static const int32_t two_over_pi[] = {
/*
* invpio2: 53 bits of 2/pi
* e1pio2: 1*pi/2 rounded to 53 bits
* e2pio2: 2*pi/2 rounded to 53 bits
* e3pio2: 3*pi/2 rounded to 53 bits
* e4pio2: 4*pi/2 rounded to 53 bits
* pio2_1: first 33 bit of pi/2
* pio2_1t: pi/2 - pio2_1
*/
@ -60,10 +56,6 @@ zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
e1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
e2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
e3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
e4pio2 = 4*M_PI_2, /* 0x401921FB, 0x54442D18 */
pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
@ -75,53 +67,6 @@ pio2_1t = 6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */
GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
if(ix<=0x3f490fda) /* |x| ~<= pi/4, reduction is null */
{y[0] = x; y[1] = 0; return 0;}
/* 53 bit pi is good enough for special cases */
if(ix<=0x407b53d1) { /* |x| ~<= 5*pi/4 */
if(ix<=0x4016cbe3) { /* |x| ~<= 3*pi/4 */
if(hx>0) {
z = x - e1pio2;
n = 1;
} else {
z = x + e1pio2;
n = 3;
}
y[0] = z;
y[1] = z - y[0];
return n;
} else {
if(hx>0)
z = x - e2pio2;
else
z = x + e2pio2;
y[0] = z;
y[1] = z - y[0];
return 2;
}
}
if(ix<=0x40e231d5) { /* |x| ~<= 9*pi/4*/
if(ix<=0x40afeddf) { /* |x| ~<= 7*pi/4 */
if(hx>0) {
z = x - e3pio2;
n = 3;
} else {
z = x + e3pio2;
n = 1;
}
y[0] = z;
y[1] = z - y[0];
return n;
} else {
if(hx>0)
z = x - e4pio2;
else
z = x + e4pio2;
y[0] = z;
y[1] = z - y[0];
return 0;
}
}
/* 33+53 bit pi is good enough for medium size */
if(ix<=0x49490f80) { /* |x| ~<= 2^19*(pi/2), medium size */
t = fabsf(x);

View File

@ -20,25 +20,56 @@ static char rcsid[] = "$FreeBSD$";
#include "math.h"
#include "math_private.h"
/*
 * Small multiples of pi/2 rounded to double precision.
 *
 * cosf() subtracts these from its argument (after taking fabsf) on the
 * ranges pi/4 < |x| <= ~9*pi/4, doing the reduction in double arithmetic
 * instead of calling __ieee754_rem_pio2f().  The hex pair after each
 * constant is the high and low 32-bit word of its double representation.
 */
static const double
c1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
c2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
c3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
c4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */
/*
 * Double-argument front end for __kernel_cosf().
 *
 * Splits x into a leading part (x rounded to float) and a trailing
 * remainder (x minus that leading part, computed in double), and hands
 * both to the float cosine kernel.
 */
static inline float
__kernel_cosdf(double x)
{
	const float head = (float)x;

	/* head is promoted back to double, so the difference is taken in double. */
	return __kernel_cosf(head, x - head);
}
/*
 * Double-argument front end for __kernel_sinf().
 *
 * Splits x into a leading part (x rounded to float) and a trailing
 * remainder (x minus that leading part, computed in double), and hands
 * both to the float sine kernel.  The constant third argument is 1;
 * presumably it tells the kernel that the tail is significant -- confirm
 * against __kernel_sinf().
 */
static inline float
__kernel_sindf(double x)
{
	const float head = (float)x;

	/* head is promoted back to double, so the difference is taken in double. */
	return __kernel_sinf(head, x - head, 1);
}
float
cosf(float x)
{
float y[2];
int32_t n,ix;
x = fabsf(x);
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */
if(ix<0x39800000) /* |x| < 2**-12 */
if(((int)x)==0) return 1.0; /* 1 with inexact if x != 0 */
return __kernel_cosf(x,0.0);
}
if(ix<=0x407b53d1) { /* |x| <= ~5*pi/4 */
if(ix<=0x4016cbe3) /* |x| <= ~3pi/4 */
return -__kernel_sindf(x - c1pio2);
else
return -__kernel_cosdf(x - c2pio2);
}
if(ix<=0x40e231d5) { /* |x| <= ~9*pi/4 */
if(ix<=0x40afeddf) /* |x| <= ~7*pi/4 */
return __kernel_sindf(x - c3pio2);
else
return __kernel_cosdf(x - c4pio2);
}
/* cos(Inf or NaN) is NaN */
else if (ix>=0x7f800000) return x-x;
/* argument reduction needed */
/* general argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,y);
switch(n&3) {

View File

@ -20,25 +20,62 @@ static char rcsid[] = "$FreeBSD$";
#include "math.h"
#include "math_private.h"
/*
 * Small multiples of pi/2 rounded to double precision.
 *
 * sinf() adds or subtracts these (according to the sign of x) on the
 * ranges pi/4 < |x| <= ~9*pi/4, doing the reduction in double arithmetic
 * instead of calling __ieee754_rem_pio2f().  The hex pair after each
 * constant is the high and low 32-bit word of its double representation.
 */
static const double
s1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
s2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
s3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
s4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */
/*
 * Double-argument front end for __kernel_cosf().
 *
 * Splits x into a leading part (x rounded to float) and a trailing
 * remainder (x minus that leading part, computed in double), and hands
 * both to the float cosine kernel.
 */
static inline float
__kernel_cosdf(double x)
{
	const float head = (float)x;

	/* head is promoted back to double, so the difference is taken in double. */
	return __kernel_cosf(head, x - head);
}
/*
 * Double-argument front end for __kernel_sinf().
 *
 * Splits x into a leading part (x rounded to float) and a trailing
 * remainder (x minus that leading part, computed in double), and hands
 * both to the float sine kernel.  The constant third argument is 1;
 * presumably it tells the kernel that the tail is significant -- confirm
 * against __kernel_sinf().
 */
static inline float
__kernel_sindf(double x)
{
	const float head = (float)x;

	/* head is promoted back to double, so the difference is taken in double. */
	return __kernel_sinf(head, x - head, 1);
}
float
sinf(float x)
{
float y[2];
int32_t n, ix;
int32_t n, hx, ix;
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
GET_FLOAT_WORD(hx,x);
ix = hx & 0x7fffffff;
if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */
if(ix<0x39800000) /* |x| < 2**-12 */
if(((int)x)==0) return x; /* x with inexact if x != 0 */
return __kernel_sinf(x,0.0,0);
}
if(ix<=0x407b53d1) { /* |x| <= ~5*pi/4 */
if(ix<=0x4016cbe3) { /* |x| <= ~3pi/4 */
if(hx>0)
return __kernel_cosdf(x - s1pio2);
else
return -__kernel_cosdf(x + s1pio2);
} else
return -__kernel_sindf(x + (hx > 0 ? -s2pio2 : s2pio2));
}
if(ix<=0x40e231d5) { /* |x| <= ~9*pi/4 */
if(ix<=0x40afeddf) { /* |x| <= ~7*pi/4 */
if(hx>0)
return -__kernel_cosdf(x - s3pio2);
else
return __kernel_cosdf(x + s3pio2);
} else
return __kernel_sindf(x + (hx > 0 ? -s4pio2 : s4pio2));
}
/* sin(Inf or NaN) is NaN */
else if (ix>=0x7f800000) return x-x;
/* argument reduction needed */
/* general argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,y);
switch(n&3) {

View File

@ -20,25 +20,50 @@ static char rcsid[] = "$FreeBSD$";
#include "math.h"
#include "math_private.h"
/*
 * Small multiples of pi/2 rounded to double precision.
 *
 * tanf() adds or subtracts these (according to the sign of x) on the
 * ranges pi/4 < |x| <= ~9*pi/4, doing the reduction in double arithmetic
 * instead of calling __ieee754_rem_pio2f().  The hex pair after each
 * constant is the high and low 32-bit word of its double representation.
 */
static const double
t1pio2 = 1*M_PI_2, /* 0x3FF921FB, 0x54442D18 */
t2pio2 = 2*M_PI_2, /* 0x400921FB, 0x54442D18 */
t3pio2 = 3*M_PI_2, /* 0x4012D97C, 0x7F3321D2 */
t4pio2 = 4*M_PI_2; /* 0x401921FB, 0x54442D18 */
/*
 * Double-argument front end for __kernel_tanf().
 *
 * Splits x into a leading part (x rounded to float) and a trailing
 * remainder (x minus that leading part, computed in double), and hands
 * both to the float tangent kernel.  iy is forwarded unchanged; the
 * callers in this file pass 1 or -1.
 */
static inline float
__kernel_tandf(double x, int iy)
{
	const float head = (float)x;

	/* head is promoted back to double, so the difference is taken in double. */
	return __kernel_tanf(head, x - head, iy);
}
float
tanf(float x)
{
float y[2];
int32_t n, ix;
float y[2],z=0.0;
int32_t n, hx, ix;
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
GET_FLOAT_WORD(hx,x);
ix = hx & 0x7fffffff;
if(ix <= 0x3f490fda) { /* |x| ~<= pi/4 */
if(ix<0x39800000) /* |x| < 2**-12 */
if(((int)x)==0) return x; /* x with inexact if x != 0 */
return __kernel_tanf(x,0.0,1);
return __kernel_tanf(x,z,1);
}
if(ix<=0x407b53d1) { /* |x| ~<= 5*pi/4 */
if(ix<=0x4016cbe3) /* |x| ~<= 3pi/4 */
return __kernel_tandf(x + (hx>0 ? -t1pio2 : t1pio2), -1);
else
return __kernel_tandf(x + (hx>0 ? -t2pio2 : t2pio2), 1);
}
if(ix<=0x40e231d5) { /* |x| ~<= 9*pi/4 */
if(ix<=0x40afeddf) /* |x| ~<= 7*pi/4 */
return __kernel_tandf(x + (hx>0 ? -t3pio2 : t3pio2), -1);
else
return __kernel_tandf(x + (hx>0 ? -t4pio2 : t4pio2), 1);
}
/* tan(Inf or NaN) is NaN */
else if (ix>=0x7f800000) return x-x;
/* argument reduction needed */
/* general argument reduction needed */
else {
n = __ieee754_rem_pio2f(x,y);
/* integer parameter: 1 -- n even; -1 -- n odd */