Use double precision to simplify and optimize a long division.
On athlons, this gives a speedup of 10-20% for tanf() on uniformly distributed args in [-2Pi, 2Pi]. (It only directly applies for 43% of the args and gives a 16-20% speedup for these (more for AXP than A64) and this gives an overall speedup of 10-12% which is all that it should; however, it gives an overall speedup of 17-20% with gcc-3.3 on AXP-A64 by mysteriously effected cases where it isn't executed.) I originally intended to use double precision for all internals of float trig functions and will probably still do this, but benchmarking showed that converting to double precision and back is a pessimization in cases where a simple float precision calculation works, so it may be optimal to switch precisions only when using extra precision is much simpler.
This commit is contained in:
parent
01155bb235
commit
d96648954f
@ -63,19 +63,5 @@ __kernel_tanf(float x, float y, int iy)
|
||||
return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
|
||||
}
|
||||
if(iy==1) return w;
|
||||
else { /* if allow error up to 2 ulp,
|
||||
simply return -1.0/(x+r) here */
|
||||
/* compute -1.0/(x+r) accurately */
|
||||
float a,t;
|
||||
int32_t i;
|
||||
z = w;
|
||||
GET_FLOAT_WORD(i,z);
|
||||
SET_FLOAT_WORD(z,i&0xfffff000);
|
||||
v = r-(z - x); /* z+v = r+x */
|
||||
t = a = -(float)1.0/w; /* a = -1.0/w */
|
||||
GET_FLOAT_WORD(i,t);
|
||||
SET_FLOAT_WORD(t,i&0xfffff000);
|
||||
s = (float)1.0+t*z;
|
||||
return t+a*(s+t*v);
|
||||
}
|
||||
else return -1.0/((double)x+r);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user