Centralize the complications for special efficient rounding to integers.
This was open-coded in range reduction for trig and exp functions. Now there are 3 static inline functions rnint[fl]() that replace open-coded expressions, and type-generic irint() and i64rint() macros that hide the complications for efficiently using non-generic irint() and irintl() functions and casts. Special details: ld128/e_rem_pio2l.h needs to use i64rint() since it needs a 46-bit integer result. Everything else only needs a (less than) 32-bit integer result so uses irint(). Float and double cases now use float_t and double_t locally instead of STRICT_ASSIGN() to avoid bugs in extra precision. On amd64, inline asm is now only used for irint() on long doubles. The SSE asm for irint() on amd64 only existed because the ifdef tangles made the correct method of simply casting to int for this case non-obvious.
This commit is contained in:
parent
1a59bccc42
commit
27aa844253
@ -74,14 +74,9 @@ __ieee754_rem_pio2l(long double x, long double *y)
|
||||
if (ex < BIAS + 45 || ex == BIAS + 45 &&
|
||||
u.bits.manh < 0x921fb54442d1LL) {
|
||||
/* |x| ~< 2^45*(pi/2), medium size */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
fn = x*invpio2+0x1.8p112;
|
||||
fn = fn-0x1.8p112;
|
||||
#ifdef HAVE_EFFICIENT_I64RINT
|
||||
/* TODO: use only double precision for fn, as in expl(). */
|
||||
fn = rnintl(x * invpio2);
|
||||
n = i64rint(fn);
|
||||
#else
|
||||
n = fn;
|
||||
#endif
|
||||
r = x-fn*pio2_1;
|
||||
w = fn*pio2_1t; /* 1st round good to 180 bit */
|
||||
{
|
||||
|
@ -244,16 +244,8 @@ __k_expl(long double x, long double *hip, long double *lop, int *kp)
|
||||
int n, n2;
|
||||
|
||||
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
/* XXX assume no extra precision for the additions, as for trig fns. */
|
||||
/* XXX this set of comments is now quadruplicated. */
|
||||
/* XXX but see ../src/e_exp.c for a fix using double_t. */
|
||||
fn = (double)x * INV_L + 0x1.8p52 - 0x1.8p52;
|
||||
#if defined(HAVE_EFFICIENT_IRINT)
|
||||
fn = rnint((double)x * INV_L);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int)fn;
|
||||
#endif
|
||||
n2 = (unsigned)n % INTERVALS;
|
||||
/* Depend on the sign bit being propagated: */
|
||||
*kp = n >> LOG2_INTERVALS;
|
||||
|
@ -268,13 +268,8 @@ expm1l(long double x)
|
||||
}
|
||||
|
||||
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
fn = (double)x * INV_L + 0x1.8p52 - 0x1.8p52;
|
||||
#if defined(HAVE_EFFICIENT_IRINT)
|
||||
fn = rnint((double)x * INV_L);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int)fn;
|
||||
#endif
|
||||
n2 = (unsigned)n % INTERVALS;
|
||||
k = n >> LOG2_INTERVALS;
|
||||
r1 = x - fn * L1;
|
||||
|
@ -84,14 +84,8 @@ __ieee754_rem_pio2l(long double x, long double *y)
|
||||
ex = expsign & 0x7fff;
|
||||
if (ex < BIAS + 25 || (ex == BIAS + 25 && u.bits.manh < 0xc90fdaa2)) {
|
||||
/* |x| ~< 2^25*(pi/2), medium size */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
fn = x*invpio2+0x1.8p63;
|
||||
fn = fn-0x1.8p63;
|
||||
#ifdef HAVE_EFFICIENT_IRINT
|
||||
fn = rnintl(x*invpio2);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = fn;
|
||||
#endif
|
||||
r = x-fn*pio2_1;
|
||||
w = fn*pio2_1t; /* 1st round good to 102 bit */
|
||||
{
|
||||
|
@ -225,16 +225,9 @@ __k_expl(long double x, long double *hip, long double *lop, int *kp)
|
||||
int n, n2;
|
||||
|
||||
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
fn = x * INV_L + 0x1.8p63 - 0x1.8p63;
|
||||
fn = rnintl(x * INV_L);
|
||||
r = x - fn * L1 - fn * L2; /* r = r1 + r2 done independently. */
|
||||
#if defined(HAVE_EFFICIENT_IRINTL)
|
||||
n = irintl(fn);
|
||||
#elif defined(HAVE_EFFICIENT_IRINT)
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int)fn;
|
||||
#endif
|
||||
n2 = (unsigned)n % INTERVALS;
|
||||
/* Depend on the sign bit being propagated: */
|
||||
*kp = n >> LOG2_INTERVALS;
|
||||
|
@ -225,15 +225,8 @@ expm1l(long double x)
|
||||
}
|
||||
|
||||
/* Reduce x to (k*ln2 + endpoint[n2] + r1 + r2). */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
fn = x * INV_L + 0x1.8p63 - 0x1.8p63;
|
||||
#if defined(HAVE_EFFICIENT_IRINTL)
|
||||
n = irintl(fn);
|
||||
#elif defined(HAVE_EFFICIENT_IRINT)
|
||||
fn = rnintl(x * INV_L);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int)fn;
|
||||
#endif
|
||||
n2 = (unsigned)n % INTERVALS;
|
||||
k = n >> LOG2_INTERVALS;
|
||||
r1 = x - fn * L1;
|
||||
|
@ -127,14 +127,8 @@ __ieee754_rem_pio2(double x, double *y)
|
||||
}
|
||||
if(ix<0x413921fb) { /* |x| ~< 2^20*(pi/2), medium size */
|
||||
medium:
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
STRICT_ASSIGN(double,fn,x*invpio2+0x1.8p52);
|
||||
fn = fn-0x1.8p52;
|
||||
#ifdef HAVE_EFFICIENT_IRINT
|
||||
fn = rnint((double_t)x*invpio2);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int32_t)fn;
|
||||
#endif
|
||||
r = x-fn*pio2_1;
|
||||
w = fn*pio2_1t; /* 1st round good to 85 bit */
|
||||
{
|
||||
|
@ -55,14 +55,8 @@ __ieee754_rem_pio2f(float x, double *y)
|
||||
ix = hx&0x7fffffff;
|
||||
/* 33+53 bit pi is good enough for medium size */
|
||||
if(ix<0x4dc90fdb) { /* |x| ~< 2^28*(pi/2), medium size */
|
||||
/* Use a specialized rint() to get fn. Assume round-to-nearest. */
|
||||
STRICT_ASSIGN(double,fn,x*invpio2+0x1.8p52);
|
||||
fn = fn-0x1.8p52;
|
||||
#ifdef HAVE_EFFICIENT_IRINT
|
||||
fn = rnint((float_t)x*invpio2);
|
||||
n = irint(fn);
|
||||
#else
|
||||
n = (int32_t)fn;
|
||||
#endif
|
||||
r = x-fn*pio2_1;
|
||||
w = fn*pio2_1t;
|
||||
*y = r-w;
|
||||
|
@ -571,48 +571,116 @@ CMPLXL(long double x, long double y)
|
||||
|
||||
#endif /* _COMPLEX_H */
|
||||
|
||||
#ifdef __GNUCLIKE_ASM
|
||||
/*
|
||||
* The rnint() family rounds to the nearest integer for a restricted
|
||||
* range of args (up to about 2**MANT_DIG). We assume that the current
|
||||
* rounding mode is FE_TONEAREST so that this can be done efficiently.
|
||||
* Extra precision causes more problems in practice, and we only centralize
|
||||
* this here to reduce those problems, and have not solved the efficiency
|
||||
* problems. The exp2() family uses a more delicate version of this that
|
||||
* requires extracting bits from the intermediate value, so it is not
|
||||
* centralized here and should copy any solution of the efficiency problems.
|
||||
*/
|
||||
|
||||
/* Asm versions of some functions. */
|
||||
static inline double
|
||||
rnint(__double_t x)
|
||||
{
|
||||
/*
|
||||
* This casts to double to kill any extra precision. This depends
|
||||
* on the cast being applied to a double_t to avoid compiler bugs
|
||||
* (this is a cleaner version of STRICT_ASSIGN()). This is
|
||||
* inefficient if there actually is extra precision, but is hard
|
||||
* to improve on. We use double_t in the API to minimise conversions
|
||||
* for just calling here. Note that we cannot easily change the
|
||||
* magic number to the one that works directly with double_t, since
|
||||
* the rounding precision is variable at runtime on x86 so the
|
||||
* magic number would need to be variable. Assuming that the
|
||||
* rounding precision is always the default is too fragile. This
|
||||
* and many other complications will move when the default is
|
||||
* changed to FP_PE.
|
||||
*/
|
||||
return ((double)(x + 0x1.8p52) - 0x1.8p52);
|
||||
}
|
||||
|
||||
#ifdef __amd64__
|
||||
static inline float
|
||||
rnintf(__float_t x)
|
||||
{
|
||||
/*
|
||||
* As for rnint(), except we could just call that to handle the
|
||||
* extra precision case, usually without losing efficiency.
|
||||
*/
|
||||
return ((float)(x + 0x1.8p23F) - 0x1.8p23F);
|
||||
}
|
||||
|
||||
#ifdef LDBL_MANT_DIG
|
||||
/*
|
||||
* The complications for extra precision are smaller for rnintl() since it
|
||||
* can safely assume that the rounding precision has been increased from
|
||||
* its default to FP_PE on x86. We don't exploit that here to get small
|
||||
* optimizations from limiting the range to double.  We just need it for
|
||||
* the magic number to work with long doubles. ld128 callers should use
|
||||
* rnint() instead of this if possible. ld80 callers should prefer
|
||||
* rnintl() since for amd64 this avoids swapping the register set, while
|
||||
* for i386 it makes no difference (assuming FP_PE), and for other arches
|
||||
* it makes little difference.
|
||||
*/
|
||||
static inline long double
|
||||
rnintl(long double x)
|
||||
{
|
||||
return (x + __CONCAT(0x1.8p, LDBL_MANT_DIG) / 2 -
|
||||
__CONCAT(0x1.8p, LDBL_MANT_DIG) / 2);
|
||||
}
|
||||
#endif /* LDBL_MANT_DIG */
|
||||
|
||||
/*
|
||||
* irint() and i64rint() give the same result as casting to their integer
|
||||
* return type provided their arg is a floating point integer. They can
|
||||
* sometimes be more efficient because no rounding is required.
|
||||
*/
|
||||
#if (defined(amd64) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
|
||||
#define irint(x) \
|
||||
(sizeof(x) == sizeof(float) && \
|
||||
sizeof(__float_t) == sizeof(long double) ? irintf(x) : \
|
||||
sizeof(x) == sizeof(double) && \
|
||||
sizeof(__double_t) == sizeof(long double) ? irintd(x) : \
|
||||
sizeof(x) == sizeof(long double) ? irintl(x) : (int)(x))
|
||||
#else
|
||||
#define irint(x) ((int)(x))
|
||||
#endif
|
||||
|
||||
#define i64rint(x) ((int64_t)(x)) /* only needed for ld128 so not opt. */
|
||||
|
||||
#if defined(__i386__) && defined(__GNUCLIKE_ASM)
|
||||
static __inline int
|
||||
irint(double x)
|
||||
irintf(float x)
|
||||
{
|
||||
int n;
|
||||
|
||||
asm("cvtsd2si %1,%0" : "=r" (n) : "x" (x));
|
||||
__asm("fistl %0" : "=m" (n) : "t" (x));
|
||||
return (n);
|
||||
}
|
||||
#define HAVE_EFFICIENT_IRINT
|
||||
#endif
|
||||
|
||||
#ifdef __i386__
|
||||
static __inline int
|
||||
irint(double x)
|
||||
irintd(double x)
|
||||
{
|
||||
int n;
|
||||
|
||||
asm("fistl %0" : "=m" (n) : "t" (x));
|
||||
__asm("fistl %0" : "=m" (n) : "t" (x));
|
||||
return (n);
|
||||
}
|
||||
#define HAVE_EFFICIENT_IRINT
|
||||
#endif
|
||||
|
||||
#if defined(__amd64__) || defined(__i386__)
|
||||
#if (defined(__amd64__) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
|
||||
static __inline int
|
||||
irintl(long double x)
|
||||
{
|
||||
int n;
|
||||
|
||||
asm("fistl %0" : "=m" (n) : "t" (x));
|
||||
__asm("fistl %0" : "=m" (n) : "t" (x));
|
||||
return (n);
|
||||
}
|
||||
#define HAVE_EFFICIENT_IRINTL
|
||||
#endif
|
||||
|
||||
#endif /* __GNUCLIKE_ASM */
|
||||
|
||||
#ifdef DEBUG
|
||||
#if defined(__amd64__) || defined(__i386__)
|
||||
#define breakpoint() asm("int $3")
|
||||
|
Loading…
Reference in New Issue
Block a user