Remove the "quick check no cancellation" optimization for
9pi/2 < |x| < 32pi/2 since it is only a small or negative optimization and it gets in the way of further optimizations. It did one more branch to avoid some integer operations and to use a different dependency on previous results. The branches are fairly predictable so they are usually not a problem, so whether this is a good optimization depends mainly on the timing for the previous results, which is very machine-dependent. On amd64 (A64), this "optimization" is a pessimization of about 1 cycle or 1%; on ia64, it is an optimization of about 2 cycles or 1%; on i386 (A64), it is an optimization of about 5 cycles or 4%; on i386 (Celeron P2) it is an optimization of about 4 cycles or 3% for cos but a pessimization of about 5 cycles for sin and 1 cycle for tan. I think the new i386 (A64) slowness is due to a pipeline stall due to an avoidable load-store mismatch (so the old timing was better), and the i386 (Celeron) variance is due to its branch predictor not being too good.
This commit is contained in:
parent
37c23ae5ff
commit
e31bf4b688
@ -26,15 +26,6 @@ __FBSDID("$FreeBSD$");
|
||||
#include "math.h"
|
||||
#include "math_private.h"
|
||||
|
||||
static const int32_t npio2_hw[] = {
|
||||
0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
|
||||
0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
|
||||
0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
|
||||
0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
|
||||
0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
|
||||
0x404858EB, 0x404921FB,
|
||||
};
|
||||
|
||||
/*
|
||||
* invpio2: 53 bits of 2/pi
|
||||
* pio2_1: first 33 bit of pi/2
|
||||
@ -148,9 +139,7 @@ __ieee754_rem_pio2(double x, double *y)
|
||||
#endif
|
||||
r = t-fn*pio2_1;
|
||||
w = fn*pio2_1t; /* 1st round good to 85 bit */
|
||||
if(n<32&&ix!=npio2_hw[n-1]) {
|
||||
y[0] = r-w; /* quick check no cancellation */
|
||||
} else {
|
||||
{
|
||||
u_int32_t high;
|
||||
j = ix>>20;
|
||||
y[0] = r-w;
|
||||
|
Loading…
Reference in New Issue
Block a user