Use a better method of scaling by 2**k. Instead of adding to the
exponent bits of the reduced result, construct 2**k (hopefully in parallel with the construction of the reduced result) and multiply by it. This tends to be much faster if the construction of 2**k is actually in parallel, and might be faster even with no parallelism since adjustment of the exponent requires a read-modify-wrtite at an unfortunate time for pipelines. In some cases involving exp2* on amd64 (A64), this change saves about 40 cycles or 30%. I think it is inherently only about 12 cycles faster in these cases and the rest of the speedup is from partly-accidentally avoiding compiler pessimizations (the construction of 2**k is now manually scheduled for good results, and -O2 doesn't always mess this up). In most cases on amd64 (A64) and i386 (A64) the speedup is about 20 cycles. The worst case that I found is expf on ia64 where this change is a pessimization of about 10 cycles or 5%. The manual scheduling for plain exp[f] is harder and not as tuned. This change ld128/s_exp2l.c has not been tested.
This commit is contained in:
parent
04539c7099
commit
a373e66b85
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=176074
@ -356,8 +356,8 @@ static const float eps[TBLSIZE] = {
|
||||
long double
|
||||
exp2l(long double x)
|
||||
{
|
||||
union IEEEl2bits u;
|
||||
long double r, t, z;
|
||||
union IEEEl2bits u, v;
|
||||
long double r, t, twopk, twopkp10000, z;
|
||||
uint32_t hx, ix, i0;
|
||||
int k;
|
||||
|
||||
@ -403,6 +403,15 @@ exp2l(long double x)
|
||||
i0 = i0 & (TBLSIZE - 1);
|
||||
u.e -= redux;
|
||||
z = x - u.e;
|
||||
v.xbits.manh = 0;
|
||||
v.xbits.manl = 0;
|
||||
if (k >= LDBL_MIN_EXP) {
|
||||
v.xbits.expsign = LDBL_MAX_EXP - 1 + k;
|
||||
twopk = v.e;
|
||||
} else {
|
||||
v.xbits.expsign = LDBL_MAX_EXP - 1 + k + 10000;
|
||||
twopkp10000 = v.e;
|
||||
}
|
||||
|
||||
/* Compute r = exp2(y) = exp2t[i0] * p(z - eps[i]). */
|
||||
t = tbl[i0]; /* exp2t[i0] */
|
||||
@ -412,15 +421,10 @@ exp2l(long double x)
|
||||
|
||||
/* Scale by 2**k. */
|
||||
if(k >= LDBL_MIN_EXP) {
|
||||
if (k != 0) {
|
||||
u.e = r;
|
||||
u.xbits.expsign += k;
|
||||
r = u.e;
|
||||
}
|
||||
return (r);
|
||||
if (k == LDBL_MAX_EXP)
|
||||
return (r * 2.0 * 0x1p16383L);
|
||||
return (r * twopk);
|
||||
} else {
|
||||
u.e = r;
|
||||
u.xbits.expsign += k + 10000;
|
||||
return (u.e * twom10000);
|
||||
return (r * twopkp10000 * twom10000);
|
||||
}
|
||||
}
|
||||
|
@ -214,8 +214,8 @@ static const double tbl[TBLSIZE * 2] = {
|
||||
long double
|
||||
exp2l(long double x)
|
||||
{
|
||||
union IEEEl2bits u;
|
||||
long double r, z;
|
||||
union IEEEl2bits u, v;
|
||||
long double r, twopk, twopkp10000, z;
|
||||
uint32_t hx, ix, i0;
|
||||
int k;
|
||||
|
||||
@ -267,6 +267,14 @@ exp2l(long double x)
|
||||
i0 = (i0 & (TBLSIZE - 1)) << 1;
|
||||
u.e -= redux;
|
||||
z = x - u.e;
|
||||
v.xbits.man = 1ULL << 63;
|
||||
if (k >= LDBL_MIN_EXP) {
|
||||
v.xbits.expsign = LDBL_MAX_EXP - 1 + k;
|
||||
twopk = v.e;
|
||||
} else {
|
||||
v.xbits.expsign = LDBL_MAX_EXP - 1 + k + 10000;
|
||||
twopkp10000 = v.e;
|
||||
}
|
||||
|
||||
/* Compute r = exp2l(y) = exp2lt[i0] * p(z). */
|
||||
long double t_hi = tbl[i0];
|
||||
@ -277,15 +285,10 @@ exp2l(long double x)
|
||||
|
||||
/* Scale by 2**k. */
|
||||
if (k >= LDBL_MIN_EXP) {
|
||||
if (k != 0) {
|
||||
u.e = r;
|
||||
u.xbits.expsign += k;
|
||||
return (u.e);
|
||||
}
|
||||
return (r);
|
||||
if (k == LDBL_MAX_EXP)
|
||||
return (r * 2.0 * 0x1p16383L);
|
||||
return (r * twopk);
|
||||
} else {
|
||||
u.e = r;
|
||||
u.xbits.expsign += k + 10000;
|
||||
return (u.e * twom10000);
|
||||
return (r * twopkp10000 * twom10000);
|
||||
}
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ P5 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
|
||||
double
|
||||
__ieee754_exp(double x) /* default IEEE double exp */
|
||||
{
|
||||
double y,hi=0.0,lo=0.0,c,t;
|
||||
double y,hi=0.0,lo=0.0,c,t,twopk;
|
||||
int32_t k=0,xsb;
|
||||
u_int32_t hx;
|
||||
|
||||
@ -142,18 +142,17 @@ __ieee754_exp(double x) /* default IEEE double exp */
|
||||
|
||||
/* x is now in primary range */
|
||||
t = x*x;
|
||||
if(k >= -1021)
|
||||
INSERT_WORDS(twopk,0x3ff00000+(k<<20), 0);
|
||||
else
|
||||
INSERT_WORDS(twopk,0x3ff00000+((k+1000)<<20), 0);
|
||||
c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
|
||||
if(k==0) return one-((x*c)/(c-2.0)-x);
|
||||
else y = one-((lo-(x*c)/(2.0-c))-hi);
|
||||
if(k >= -1021) {
|
||||
u_int32_t hy;
|
||||
GET_HIGH_WORD(hy,y);
|
||||
SET_HIGH_WORD(y,hy+(k<<20)); /* add k to y's exponent */
|
||||
return y;
|
||||
if (k==1024) return y*2.0*0x1p1023;
|
||||
return y*twopk;
|
||||
} else {
|
||||
u_int32_t hy;
|
||||
GET_HIGH_WORD(hy,y);
|
||||
SET_HIGH_WORD(y,hy+((k+1000)<<20)); /* add k to y's exponent */
|
||||
return y*twom1000;
|
||||
return y*twopk*twom1000;
|
||||
}
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ static volatile float twom100 = 7.8886090522e-31; /* 2**-100=0x0d800000 */
|
||||
float
|
||||
__ieee754_expf(float x) /* default IEEE double exp */
|
||||
{
|
||||
float y,hi=0.0,lo=0.0,c,t;
|
||||
float y,hi=0.0,lo=0.0,c,t,twopk;
|
||||
int32_t k=0,xsb;
|
||||
u_int32_t hx;
|
||||
|
||||
@ -80,18 +80,17 @@ __ieee754_expf(float x) /* default IEEE double exp */
|
||||
|
||||
/* x is now in primary range */
|
||||
t = x*x;
|
||||
if(k >= -125)
|
||||
SET_FLOAT_WORD(twopk,0x3f800000+(k<<23));
|
||||
else
|
||||
SET_FLOAT_WORD(twopk,0x3f800000+((k+100)<<23));
|
||||
c = x - t*(P1+t*P2);
|
||||
if(k==0) return one-((x*c)/(c-(float)2.0)-x);
|
||||
else y = one-((lo-(x*c)/((float)2.0-c))-hi);
|
||||
if(k >= -125) {
|
||||
u_int32_t hy;
|
||||
GET_FLOAT_WORD(hy,y);
|
||||
SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
|
||||
return y;
|
||||
if(k==128) return y*2.0F*0x1p127F;
|
||||
return y*twopk;
|
||||
} else {
|
||||
u_int32_t hy;
|
||||
GET_FLOAT_WORD(hy,y);
|
||||
SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
|
||||
return y*twom100;
|
||||
return y*twopk*twom100;
|
||||
}
|
||||
}
|
||||
|
@ -340,7 +340,7 @@ static const double tbl[TBLSIZE * 2] = {
|
||||
double
|
||||
exp2(double x)
|
||||
{
|
||||
double r, t, z;
|
||||
double r, t, twopk, twopkp1000, z;
|
||||
uint32_t hx, hr, ix, lx, i0;
|
||||
int k;
|
||||
|
||||
@ -375,19 +375,19 @@ exp2(double x)
|
||||
/* Compute r = exp2(y) = exp2t[i0] * p(z - eps[i]). */
|
||||
t = tbl[i0]; /* exp2t[i0] */
|
||||
z -= tbl[i0 + 1]; /* eps[i0] */
|
||||
if (k >= -1021 << 20)
|
||||
INSERT_WORDS(twopk, 0x3ff00000 + k, 0);
|
||||
else
|
||||
INSERT_WORDS(twopkp1000, 0x3ff00000 + k + (1000 << 20), 0);
|
||||
r = t + t * z * (P1 + z * (P2 + z * (P3 + z * (P4 + z * P5))));
|
||||
|
||||
/* Scale by 2**(k>>20). */
|
||||
if(k >= -1021 << 20) {
|
||||
if (k != 0) {
|
||||
GET_HIGH_WORD(hr, r);
|
||||
SET_HIGH_WORD(r, hr + k);
|
||||
}
|
||||
return (r);
|
||||
if (k == 1024 << 20)
|
||||
return (r * 2.0 * 0x1p1023);
|
||||
return (r * twopk);
|
||||
} else {
|
||||
GET_HIGH_WORD(hr, r);
|
||||
SET_HIGH_WORD(r, hr + (k + (1000 << 20)));
|
||||
return (r * twom1000);
|
||||
return (r * twopkp1000 * twom1000);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -92,7 +92,7 @@ static const double exp2ft[TBLSIZE] = {
|
||||
float
|
||||
exp2f(float x)
|
||||
{
|
||||
double tv;
|
||||
double tv, twopk;
|
||||
float t, z;
|
||||
uint32_t hx, htv, ix, i0;
|
||||
int32_t k;
|
||||
@ -129,9 +129,6 @@ exp2f(float x)
|
||||
tv = tv + tv * (z * (P1 + z * (P2 + z * (P3 + z * P4))));
|
||||
|
||||
/* Scale by 2**(k>>20). */
|
||||
if (k != 0) {
|
||||
GET_HIGH_WORD(htv, tv);
|
||||
SET_HIGH_WORD(tv, htv + k);
|
||||
}
|
||||
return (tv);
|
||||
INSERT_WORDS(twopk, 0x3ff00000 + k, 0);
|
||||
return (tv * twopk);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user