Various changes to improve the accuracy and speed of log{2,10}{,f}.
- Rename __kernel_log() to k_log1p(). - Move some of the work that was previously done in the kernel log into the callers. This enables further refactoring to improve accuracy or speed, although I don't recall the details. - Use extra precision when adding the final scaling term, which improves accuracy. - Describe and work around compiler problems that break some of the multiprecision calculations. A fix for a small bug is also included: - Add a special case for log*(1). This is needed to ensure that log*(1) == +0 instead of -0, even when the rounding mode is FE_DOWNWARD. Submitted by: bde
This commit is contained in:
parent
5ebf26f6a1
commit
b052ec9065
@ -17,6 +17,9 @@ __FBSDID("$FreeBSD$");
|
|||||||
/*
|
/*
|
||||||
* Return the base 10 logarithm of x. See e_log.c and k_log.h for most
|
* Return the base 10 logarithm of x. See e_log.c and k_log.h for most
|
||||||
* comments.
|
* comments.
|
||||||
|
*
|
||||||
|
* log10(x) = (f - 0.5*f*f + k_log1p(f)) / ln10 + k * log10(2)
|
||||||
|
* in not-quite-routine extra precision.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "math.h"
|
#include "math.h"
|
||||||
@ -35,7 +38,7 @@ static const double zero = 0.0;
|
|||||||
double
|
double
|
||||||
__ieee754_log10(double x)
|
__ieee754_log10(double x)
|
||||||
{
|
{
|
||||||
double f,hi,lo,y,z;
|
double f,hfsq,hi,lo,r,val_hi,val_lo,w,y,y2;
|
||||||
int32_t i,k,hx;
|
int32_t i,k,hx;
|
||||||
u_int32_t lx;
|
u_int32_t lx;
|
||||||
|
|
||||||
@ -50,16 +53,35 @@ __ieee754_log10(double x)
|
|||||||
GET_HIGH_WORD(hx,x);
|
GET_HIGH_WORD(hx,x);
|
||||||
}
|
}
|
||||||
if (hx >= 0x7ff00000) return x+x;
|
if (hx >= 0x7ff00000) return x+x;
|
||||||
|
if (hx == 0x3ff00000 && lx == 0)
|
||||||
|
return zero; /* log(1) = +0 */
|
||||||
k += (hx>>20)-1023;
|
k += (hx>>20)-1023;
|
||||||
hx &= 0x000fffff;
|
hx &= 0x000fffff;
|
||||||
i = (hx+0x95f64)&0x100000;
|
i = (hx+0x95f64)&0x100000;
|
||||||
SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
|
SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
|
||||||
k += (i>>20);
|
k += (i>>20);
|
||||||
y = (double)k;
|
y = (double)k;
|
||||||
f = __kernel_log(x);
|
f = x - 1.0;
|
||||||
hi = x = x - 1;
|
hfsq = 0.5*f*f;
|
||||||
|
r = k_log1p(f);
|
||||||
|
|
||||||
|
/* See e_log2.c for most details. */
|
||||||
|
hi = f - hfsq;
|
||||||
SET_LOW_WORD(hi,0);
|
SET_LOW_WORD(hi,0);
|
||||||
lo = x - hi;
|
lo = (f - hi) - hfsq + r;
|
||||||
z = y*log10_2lo + (x+f)*ivln10lo + (lo+f)*ivln10hi + hi*ivln10hi;
|
val_hi = hi*ivln10hi;
|
||||||
return z+y*log10_2hi;
|
y2 = y*log10_2hi;
|
||||||
|
val_lo = y*log10_2lo + (lo+hi)*ivln10lo + lo*ivln10hi;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extra precision in for adding y*log10_2hi is not strictly needed
|
||||||
|
* since there is no very large cancellation near x = sqrt(2) or
|
||||||
|
* x = 1/sqrt(2), but we do it anyway since it costs little on CPUs
|
||||||
|
* with some parallelism and it reduces the error for many args.
|
||||||
|
*/
|
||||||
|
w = y2 + val_hi;
|
||||||
|
val_lo += (y2 - w) + val_hi;
|
||||||
|
val_hi = w;
|
||||||
|
|
||||||
|
return val_lo + val_hi;
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ static const float zero = 0.0;
|
|||||||
float
|
float
|
||||||
__ieee754_log10f(float x)
|
__ieee754_log10f(float x)
|
||||||
{
|
{
|
||||||
float f,hi,lo,y,z;
|
float f,hfsq,hi,lo,r,y,y2;
|
||||||
int32_t i,k,hx;
|
int32_t i,k,hx;
|
||||||
|
|
||||||
GET_FLOAT_WORD(hx,x);
|
GET_FLOAT_WORD(hx,x);
|
||||||
@ -46,17 +46,26 @@ __ieee754_log10f(float x)
|
|||||||
GET_FLOAT_WORD(hx,x);
|
GET_FLOAT_WORD(hx,x);
|
||||||
}
|
}
|
||||||
if (hx >= 0x7f800000) return x+x;
|
if (hx >= 0x7f800000) return x+x;
|
||||||
|
if (hx == 0x3f800000)
|
||||||
|
return zero; /* log(1) = +0 */
|
||||||
k += (hx>>23)-127;
|
k += (hx>>23)-127;
|
||||||
hx &= 0x007fffff;
|
hx &= 0x007fffff;
|
||||||
i = (hx+(0x4afb0d))&0x800000;
|
i = (hx+(0x4afb0d))&0x800000;
|
||||||
SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */
|
SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */
|
||||||
k += (i>>23);
|
k += (i>>23);
|
||||||
y = (float)k;
|
y = (float)k;
|
||||||
f = __kernel_logf(x);
|
f = x - (float)1.0;
|
||||||
x = x - (float)1.0;
|
hfsq = (float)0.5*f*f;
|
||||||
GET_FLOAT_WORD(hx,x);
|
r = k_log1pf(f);
|
||||||
|
|
||||||
|
/* See e_log2f.c and e_log2.c for details. */
|
||||||
|
if (sizeof(float_t) > sizeof(float))
|
||||||
|
return (r - hfsq + f) * ((float_t)ivln10lo + ivln10hi) +
|
||||||
|
y * ((float_t)log10_2lo + log10_2hi);
|
||||||
|
hi = f - hfsq;
|
||||||
|
GET_FLOAT_WORD(hx,hi);
|
||||||
SET_FLOAT_WORD(hi,hx&0xfffff000);
|
SET_FLOAT_WORD(hi,hx&0xfffff000);
|
||||||
lo = x - hi;
|
lo = (f - hi) - hfsq + r;
|
||||||
z = y*log10_2lo + (x+f)*ivln10lo + (lo+f)*ivln10hi + hi*ivln10hi;
|
return y*log10_2lo + (lo+hi)*ivln10lo + lo*ivln10hi + hi*ivln10hi +
|
||||||
return z+y*log10_2hi;
|
y*log10_2hi;
|
||||||
}
|
}
|
||||||
|
@ -17,6 +17,11 @@ __FBSDID("$FreeBSD$");
|
|||||||
/*
|
/*
|
||||||
* Return the base 2 logarithm of x. See e_log.c and k_log.h for most
|
* Return the base 2 logarithm of x. See e_log.c and k_log.h for most
|
||||||
* comments.
|
* comments.
|
||||||
|
*
|
||||||
|
* This reduces x to {k, 1+f} exactly as in e_log.c, then calls the kernel,
|
||||||
|
* then does the combining and scaling steps
|
||||||
|
* log2(x) = (f - 0.5*f*f + k_log1p(f)) / ln2 + k
|
||||||
|
* in not-quite-routine extra precision.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "math.h"
|
#include "math.h"
|
||||||
@ -33,7 +38,7 @@ static const double zero = 0.0;
|
|||||||
double
|
double
|
||||||
__ieee754_log2(double x)
|
__ieee754_log2(double x)
|
||||||
{
|
{
|
||||||
double f,hi,lo;
|
double f,hfsq,hi,lo,r,val_hi,val_lo,w,y;
|
||||||
int32_t i,k,hx;
|
int32_t i,k,hx;
|
||||||
u_int32_t lx;
|
u_int32_t lx;
|
||||||
|
|
||||||
@ -48,14 +53,58 @@ __ieee754_log2(double x)
|
|||||||
GET_HIGH_WORD(hx,x);
|
GET_HIGH_WORD(hx,x);
|
||||||
}
|
}
|
||||||
if (hx >= 0x7ff00000) return x+x;
|
if (hx >= 0x7ff00000) return x+x;
|
||||||
|
if (hx == 0x3ff00000 && lx == 0)
|
||||||
|
return zero; /* log(1) = +0 */
|
||||||
k += (hx>>20)-1023;
|
k += (hx>>20)-1023;
|
||||||
hx &= 0x000fffff;
|
hx &= 0x000fffff;
|
||||||
i = (hx+0x95f64)&0x100000;
|
i = (hx+0x95f64)&0x100000;
|
||||||
SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
|
SET_HIGH_WORD(x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
|
||||||
k += (i>>20);
|
k += (i>>20);
|
||||||
f = __kernel_log(x);
|
y = (double)k;
|
||||||
hi = x = x - 1;
|
f = x - 1.0;
|
||||||
|
hfsq = 0.5*f*f;
|
||||||
|
r = k_log1p(f);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* f-hfsq must (for args near 1) be evaluated in extra precision
|
||||||
|
* to avoid a large cancellation when x is near sqrt(2) or 1/sqrt(2).
|
||||||
|
* This is fairly efficient since f-hfsq only depends on f, so can
|
||||||
|
* be evaluated in parallel with R. Not combining hfsq with R also
|
||||||
|
* keeps R small (though not as small as a true `lo' term would be),
|
||||||
|
* so that extra precision is not needed for terms involving R.
|
||||||
|
*
|
||||||
|
* Compiler bugs involving extra precision used to break Dekker's
|
||||||
|
* theorem for spitting f-hfsq as hi+lo, unless double_t was used
|
||||||
|
* or the multi-precision calculations were avoided when double_t
|
||||||
|
* has extra precision. These problems are now automatically
|
||||||
|
* avoided as a side effect of the optimization of combining the
|
||||||
|
* Dekker splitting step with the clear-low-bits step.
|
||||||
|
*
|
||||||
|
* y must (for args near sqrt(2) and 1/sqrt(2)) be added in extra
|
||||||
|
* precision to avoid a very large cancellation when x is very near
|
||||||
|
* these values. Unlike the above cancellations, this problem is
|
||||||
|
* specific to base 2. It is strange that adding +-1 is so much
|
||||||
|
* harder than adding +-ln2 or +-log10_2.
|
||||||
|
*
|
||||||
|
* This uses Dekker's theorem to normalize y+val_hi, so the
|
||||||
|
* compiler bugs are back in some configurations, sigh. And I
|
||||||
|
* don't want to used double_t to avoid them, since that gives a
|
||||||
|
* pessimization and the support for avoiding the pessimization
|
||||||
|
* is not yet available.
|
||||||
|
*
|
||||||
|
* The multi-precision calculations for the multiplications are
|
||||||
|
* routine.
|
||||||
|
*/
|
||||||
|
hi = f - hfsq;
|
||||||
SET_LOW_WORD(hi,0);
|
SET_LOW_WORD(hi,0);
|
||||||
lo = x - hi;
|
lo = (f - hi) - hfsq + r;
|
||||||
return (x+f)*ivln2lo + (lo+f)*ivln2hi + hi*ivln2hi + k;
|
val_hi = hi*ivln2hi;
|
||||||
|
val_lo = (lo+hi)*ivln2lo + lo*ivln2hi;
|
||||||
|
|
||||||
|
/* spadd(val_hi, val_lo, y), except for not using double_t: */
|
||||||
|
w = y + val_hi;
|
||||||
|
val_lo += (y - w) + val_hi;
|
||||||
|
val_hi = w;
|
||||||
|
|
||||||
|
return val_lo + val_hi;
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,7 @@ static const float zero = 0.0;
|
|||||||
float
|
float
|
||||||
__ieee754_log2f(float x)
|
__ieee754_log2f(float x)
|
||||||
{
|
{
|
||||||
float f,hi,lo;
|
float f,hfsq,hi,lo,r,y;
|
||||||
int32_t i,k,hx;
|
int32_t i,k,hx;
|
||||||
|
|
||||||
GET_FLOAT_WORD(hx,x);
|
GET_FLOAT_WORD(hx,x);
|
||||||
@ -44,15 +44,38 @@ __ieee754_log2f(float x)
|
|||||||
GET_FLOAT_WORD(hx,x);
|
GET_FLOAT_WORD(hx,x);
|
||||||
}
|
}
|
||||||
if (hx >= 0x7f800000) return x+x;
|
if (hx >= 0x7f800000) return x+x;
|
||||||
|
if (hx == 0x3f800000)
|
||||||
|
return zero; /* log(1) = +0 */
|
||||||
k += (hx>>23)-127;
|
k += (hx>>23)-127;
|
||||||
hx &= 0x007fffff;
|
hx &= 0x007fffff;
|
||||||
i = (hx+(0x4afb0d))&0x800000;
|
i = (hx+(0x4afb0d))&0x800000;
|
||||||
SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */
|
SET_FLOAT_WORD(x,hx|(i^0x3f800000)); /* normalize x or x/2 */
|
||||||
k += (i>>23);
|
k += (i>>23);
|
||||||
f = __kernel_logf(x);
|
y = (float)k;
|
||||||
x = x - (float)1.0;
|
f = x - (float)1.0;
|
||||||
GET_FLOAT_WORD(hx,x);
|
hfsq = (float)0.5*f*f;
|
||||||
|
r = k_log1pf(f);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We no longer need to avoid falling into the multi-precision
|
||||||
|
* calculations due to compiler bugs breaking Dekker's theorem.
|
||||||
|
* Keep avoiding this as an optimization. See e_log2.c for more
|
||||||
|
* details (some details are here only because the optimization
|
||||||
|
* is not yet available in double precision).
|
||||||
|
*
|
||||||
|
* Another compiler bug turned up. With gcc on i386,
|
||||||
|
* (ivln2lo + ivln2hi) would be evaluated in float precision
|
||||||
|
* despite runtime evaluations using double precision. So we
|
||||||
|
* must cast one of its terms to float_t. This makes the whole
|
||||||
|
* expression have type float_t, so return is forced to waste
|
||||||
|
* time clobbering its extra precision.
|
||||||
|
*/
|
||||||
|
if (sizeof(float_t) > sizeof(float))
|
||||||
|
return (r - hfsq + f) * ((float_t)ivln2lo + ivln2hi) + y;
|
||||||
|
|
||||||
|
hi = f - hfsq;
|
||||||
|
GET_FLOAT_WORD(hx,hi);
|
||||||
SET_FLOAT_WORD(hi,hx&0xfffff000);
|
SET_FLOAT_WORD(hi,hx&0xfffff000);
|
||||||
lo = x - hi;
|
lo = (f - hi) - hfsq + r;
|
||||||
return (x+f)*ivln2lo + (lo+f)*ivln2hi + hi*ivln2hi + k;
|
return (lo+hi)*ivln2lo + lo*ivln2hi + hi*ivln2hi + y;
|
||||||
}
|
}
|
||||||
|
@ -14,8 +14,9 @@
|
|||||||
#include <sys/cdefs.h>
|
#include <sys/cdefs.h>
|
||||||
__FBSDID("$FreeBSD$");
|
__FBSDID("$FreeBSD$");
|
||||||
|
|
||||||
/* __kernel_log(x)
|
/*
|
||||||
* Return log(x) - (x-1) for x in ~[sqrt(2)/2, sqrt(2)].
|
* k_log1p(f):
|
||||||
|
* Return log(1+f) - f for 1+f in ~[sqrt(2)/2, sqrt(2)].
|
||||||
*
|
*
|
||||||
* The following describes the overall strategy for computing
|
* The following describes the overall strategy for computing
|
||||||
* logarithms in base e. The argument reduction and adding the final
|
* logarithms in base e. The argument reduction and adding the final
|
||||||
@ -80,37 +81,20 @@ Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
|
|||||||
Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
|
Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We always inline __kernel_log(), since doing so produces a
|
* We always inline k_log1p(), since doing so produces a
|
||||||
* substantial performance improvement (~40% on amd64).
|
* substantial performance improvement (~40% on amd64).
|
||||||
*/
|
*/
|
||||||
static inline double
|
static inline double
|
||||||
__kernel_log(double x)
|
k_log1p(double f)
|
||||||
{
|
{
|
||||||
double hfsq,f,s,z,R,w,t1,t2;
|
double hfsq,s,z,R,w,t1,t2;
|
||||||
int32_t hx,i,j;
|
|
||||||
u_int32_t lx;
|
|
||||||
|
|
||||||
EXTRACT_WORDS(hx,lx,x);
|
s = f/(2.0+f);
|
||||||
|
|
||||||
f = x-1.0;
|
|
||||||
if((0x000fffff&(2+hx))<3) { /* -2**-20 <= f < 2**-20 */
|
|
||||||
if(f==0.0) return 0.0;
|
|
||||||
return f*f*(0.33333333333333333*f-0.5);
|
|
||||||
}
|
|
||||||
s = f/(2.0+f);
|
|
||||||
z = s*s;
|
z = s*s;
|
||||||
hx &= 0x000fffff;
|
|
||||||
i = hx-0x6147a;
|
|
||||||
w = z*z;
|
w = z*z;
|
||||||
j = 0x6b851-hx;
|
t1= w*(Lg2+w*(Lg4+w*Lg6));
|
||||||
t1= w*(Lg2+w*(Lg4+w*Lg6));
|
t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
|
||||||
t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
|
|
||||||
i |= j;
|
|
||||||
R = t2+t1;
|
R = t2+t1;
|
||||||
if (i>0) {
|
hfsq=0.5*f*f;
|
||||||
hfsq=0.5*f*f;
|
return s*(hfsq+R);
|
||||||
return s*(hfsq+R) - hfsq;
|
|
||||||
} else {
|
|
||||||
return s*(R-f);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
__FBSDID("$FreeBSD$");
|
__FBSDID("$FreeBSD$");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* float version of __kernel_log(x). See k_log.c for details.
|
* Float version of k_log.h. See the latter for most comments.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static const float
|
static const float
|
||||||
@ -24,32 +24,16 @@ Lg3 = 0x91e9ee.0p-25, /* 0.28498786688 */
|
|||||||
Lg4 = 0xf89e26.0p-26; /* 0.24279078841 */
|
Lg4 = 0xf89e26.0p-26; /* 0.24279078841 */
|
||||||
|
|
||||||
static inline float
|
static inline float
|
||||||
__kernel_logf(float x)
|
k_log1pf(float f)
|
||||||
{
|
{
|
||||||
float hfsq,f,s,z,R,w,t1,t2;
|
float hfsq,s,z,R,w,t1,t2;
|
||||||
int32_t ix,i,j;
|
|
||||||
|
|
||||||
GET_FLOAT_WORD(ix,x);
|
|
||||||
|
|
||||||
f = x-(float)1.0;
|
|
||||||
if((0x007fffff&(0x8000+ix))<0xc000) { /* -2**-9 <= f < 2**-9 */
|
|
||||||
if(f==0.0f) return 0.0f;
|
|
||||||
return f*f*((float)0.33333333333333333*f-(float)0.5);
|
|
||||||
}
|
|
||||||
s = f/((float)2.0+f);
|
s = f/((float)2.0+f);
|
||||||
z = s*s;
|
z = s*s;
|
||||||
ix &= 0x007fffff;
|
|
||||||
i = ix-(0x6147a<<3);
|
|
||||||
w = z*z;
|
w = z*z;
|
||||||
j = (0x6b851<<3)-ix;
|
|
||||||
t1= w*(Lg2+w*Lg4);
|
t1= w*(Lg2+w*Lg4);
|
||||||
t2= z*(Lg1+w*Lg3);
|
t2= z*(Lg1+w*Lg3);
|
||||||
i |= j;
|
|
||||||
R = t2+t1;
|
R = t2+t1;
|
||||||
if(i>0) {
|
hfsq=(float)0.5*f*f;
|
||||||
hfsq=(float)0.5*f*f;
|
return s*(hfsq+R);
|
||||||
return s*(hfsq+R) - hfsq;
|
|
||||||
} else {
|
|
||||||
return s*(R-f);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user