Inline __ieee754_rem_pio2f().

On amd64 (A64) and i386 (A64), this gives an average speedup of about 12 cycles or 17% for 9pi/4 < |x| <= 2**19*pi/2 and a smaller speedup for larger x, and a small slowdown for |x| <= 9pi/4 (only 1-2 cycles average, but that is 4%). Inlining this is less likely to bust caches than inlining the float version since it is much smaller (about 220 bytes text and rodata) and has many fewer branches. However, the float version was already large due to its manual inlining of the branches and also the polynomial evaluations.
2008-02-25 22:19:17 +00:00 · 2008-02-25 22:19:17 +00:00 · e822ea5b2a
commit e822ea5b2a
parent af6e49e963
4 changed files with 15 additions and 0 deletions
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -41,6 +41,9 @@ invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t =  6.07710050650619224932e-11; /* 0x3DD0B461, 0x1A626331 */

+#ifdef INLINE_REM_PIO2F
+extern inline
+#endif
 int
 __ieee754_rem_pio2f(float x, double *y)
 {
--- a/lib/msun/src/s_cosf.c
+++ b/lib/msun/src/s_cosf.c
@@ -17,10 +17,14 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");

+#include <float.h>
+
 #include "math.h"
 #define	INLINE_KERNEL_COSDF
 #define	INLINE_KERNEL_SINDF
+#define INLINE_REM_PIO2F
 #include "math_private.h"
+#include "e_rem_pio2f.c"
 #include "k_cosf.c"
 #include "k_sinf.c"

--- a/lib/msun/src/s_sinf.c
+++ b/lib/msun/src/s_sinf.c
@@ -17,10 +17,14 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");

+#include <float.h>
+
 #include "math.h"
 #define	INLINE_KERNEL_COSDF
 #define	INLINE_KERNEL_SINDF
+#define INLINE_REM_PIO2F
 #include "math_private.h"
+#include "e_rem_pio2f.c"
 #include "k_cosf.c"
 #include "k_sinf.c"

--- a/lib/msun/src/s_tanf.c
+++ b/lib/msun/src/s_tanf.c
@@ -17,9 +17,13 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");

+#include <float.h>
+
 #include "math.h"
 #define	INLINE_KERNEL_TANDF
+#define INLINE_REM_PIO2F
 #include "math_private.h"
+#include "e_rem_pio2f.c"
 #include "k_tanf.c"

 /* Small multiples of pi/2 rounded to double precision. */