From d5580d091aea6201b4546261acd0695d9df9f7ff Mon Sep 17 00:00:00 2001 From: David Schultz Date: Sat, 22 Jan 2005 09:53:18 +0000 Subject: [PATCH] Add fma() and fmaf(), which implement a fused multiply-add operation. --- lib/msun/Makefile | 10 ++- lib/msun/ia64/s_fma.S | 34 ++++++++ lib/msun/ia64/s_fmaf.S | 34 ++++++++ lib/msun/man/fma.3 | 105 ++++++++++++++++++++++++ lib/msun/man/math.3 | 2 +- lib/msun/src/math.h | 5 ++ lib/msun/src/s_fma.c | 178 +++++++++++++++++++++++++++++++++++++++++ lib/msun/src/s_fmaf.c | 47 +++++++++++ 8 files changed, 412 insertions(+), 3 deletions(-) create mode 100644 lib/msun/ia64/s_fma.S create mode 100644 lib/msun/ia64/s_fmaf.S create mode 100644 lib/msun/man/fma.3 create mode 100644 lib/msun/src/s_fma.c create mode 100644 lib/msun/src/s_fmaf.c diff --git a/lib/msun/Makefile b/lib/msun/Makefile index eec0f8a1d13b..d5724df36ac0 100644 --- a/lib/msun/Makefile +++ b/lib/msun/Makefile @@ -41,6 +41,7 @@ # default standard # +# XXX MD crud should be in separate makefiles .if ${MACHINE_ARCH} == "alpha" ARCH_SRCS = s_copysign.S s_copysignf.S # XXX Comment from NetBSD/Alpha: @@ -50,6 +51,8 @@ ARCH_SRCS = s_copysign.S s_copysignf.S #CFLAGS += -mtrap-precision=i -mfp-trap-mode=su .elif ${MACHINE_ARCH} == "amd64" ARCH_SRCS = e_sqrt.S s_lrint.S s_llrint.S +.elif ${MACHINE_ARCH} == "ia64" +ARCH_SRCS = s_fma.S s_fmaf.S .elif ${MACHINE_ARCH} == "i386" ARCH_SUBDIR= i387 ARCH_SRCS = e_acos.S e_asin.S e_atan2.S e_exp.S e_fmod.S e_log.S e_log10.S \ @@ -88,7 +91,8 @@ COMMON_SRCS= b_exp.c b_log.c b_tgamma.c \ s_ceil.c s_ceilf.c s_ceill.c \ s_copysign.c s_copysignf.c s_cos.c s_cosf.c s_erf.c s_erff.c \ s_expm1.c s_expm1f.c s_fabsf.c s_fdim.c s_finite.c s_finitef.c \ - s_floor.c s_floorf.c s_floorl.c s_fmax.c s_fmaxf.c s_fmaxl.c s_fmin.c \ + s_floor.c s_floorf.c s_floorl.c s_fma.c s_fmaf.c \ + s_fmax.c s_fmaxf.c s_fmaxl.c s_fmin.c \ s_fminf.c s_fminl.c s_frexp.c s_frexpf.c s_ilogb.c s_ilogbf.c \ s_ilogbl.c s_isfinite.c s_isnan.c s_isnormal.c s_ldexpf.c \ s_lib_version.c s_llrint.c s_llrintf.c s_llround.c s_llroundf.c \ @@ -137,7 +141,8 @@ INCS= fenv.h math.h MAN= acos.3 acosh.3 asin.3 asinh.3 atan.3 atan2.3 atanh.3 ceil.3 \ cimag.3 cos.3 cosh.3 erf.3 exp.3 fabs.3 fdim.3 feclearexcept.3 \ - fegetenv.3 fegetround.3 fenv.3 floor.3 fmax.3 fmod.3 hypot.3 ieee.3 \ + fegetenv.3 fegetround.3 fenv.3 floor.3 \ + fma.3 fmax.3 fmod.3 hypot.3 ieee.3 \ ieee_test.3 j0.3 lgamma.3 lrint.3 lround.3 math.3 rint.3 round.3 \ signbit.3 sin.3 sinh.3 sqrt.3 tan.3 tanh.3 trunc.3 @@ -167,6 +172,7 @@ MLINKS+=fegetenv.3 feholdexcept.3 fegetenv.3 fesetenv.3 \ fegetenv.3 feupdateenv.3 MLINKS+=fegetround.3 fesetround.3 MLINKS+=floor.3 floorf.3 floor.3 floorl.3 +MLINKS+=fma.3 fmaf.3 MLINKS+=fmax.3 fmaxf.3 fmax.3 fmaxl.3 \ fmax.3 fmin.3 fmax.3 fminf.3 fmax.3 fminl.3 MLINKS+=fmod.3 fmodf.3 diff --git a/lib/msun/ia64/s_fma.S b/lib/msun/ia64/s_fma.S new file mode 100644 index 000000000000..3e693592fce8 --- /dev/null +++ b/lib/msun/ia64/s_fma.S @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2005 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$") + +ENTRY(fma, 3) +{ + fma.d f8 = f8, f9, f10 + br.ret.sptk b0 +} diff --git a/lib/msun/ia64/s_fmaf.S b/lib/msun/ia64/s_fmaf.S new file mode 100644 index 000000000000..1e122bb525eb --- /dev/null +++ b/lib/msun/ia64/s_fmaf.S @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2005 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$") + +ENTRY(fmaf, 3) +{ + fma.s f8 = f8, f9, f10 + br.ret.sptk b0 +} diff --git a/lib/msun/man/fma.3 b/lib/msun/man/fma.3 new file mode 100644 index 000000000000..1eb5b604df66 --- /dev/null +++ b/lib/msun/man/fma.3 @@ -0,0 +1,105 @@ +.\" Copyright (c) 2005 David Schultz +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd January 22, 2005 +.Dt FMA 3 +.Os +.Sh NAME +.Nm fma , +.Nm fmaf +.Nd fused multiply-add +.Sh LIBRARY +.Lb libm +.Sh SYNOPSIS +.In math.h +.Ft double +.Fn fma "double x" "double y" "double z" +.Ft float +.Fn fmaf "float x" "float y" "float z" +.Sh DESCRIPTION +The +.Fn fma +and +.Fn fmaf +functions return +.No "(x * y) + z" , +computed with only one rounding error. +Using the ordinary multiplication and addition operators, by contrast, +results in two roundings: one for the intermediate product and one for +the final result. +.Pp +For instance, the expression +.No "1.2e100 * 2.0e208 - 1.4e308" +produces \*(If due to overflow in the intermediate product, whereas +.No "fma(1.2e100, 2.0e208, -1.4e308)" +returns approximately 1.0e308. +.Pp +The fused multiply-add operation is often used to improve the +accuracy of calculations such as dot products. +It may also be used to improve performance on machines that implement +it natively. +The macros +.Dv FP_FAST_FMA +and +.Dv FP_FAST_FMAF +may be defined in +.In math.h +to indicate that +.Fn fma +and +.Fn fmaf +(respectively) have comparable or faster speed than a multiply +operation followed by an add operation. +.Sh IMPLEMENTATION NOTES +In general, +.Fn fma +and +.Fn fmaf +will behave as one would expect if +.No "x * y + z" +were computed with unbounded precision and range, +then rounded to the precision of the return type. +However, on some platforms, if +.Fa z +is \*(Na, these functions may not raise an exception even +when the computation of +.No "x * y" +would have otherwise generated an invalid exception. +.Sh SEE ALSO +.Xr fenv 3 , +.Xr math 3 +.Sh STANDARDS +The +.Fn fma +and +.Fn fmaf +functions conform to +.St -isoC-99 . +A fused multiply-add operation with virtually identical +characteristics appears in IEEE draft standard 754R. +.Sh HISTORY +These routines first appeared in +.Fx 5.4 . diff --git a/lib/msun/man/math.3 b/lib/msun/man/math.3 index 1a35db17cfff..4e3ec8dd1776 100644 --- a/lib/msun/man/math.3 +++ b/lib/msun/man/math.3 @@ -107,7 +107,7 @@ expm1 exp(x)\-1 1 fabs absolute value 0 fdim positive difference 1 floor integer no greater than 0 -.\" fma multiply-add ??? +fma multiply-add 1 fmax maximum function 0 fmin minimum function 0 fmod remainder function ??? diff --git a/lib/msun/src/math.h b/lib/msun/src/math.h index 58e845a00041..e250905ed3c4 100644 --- a/lib/msun/src/math.h +++ b/lib/msun/src/math.h @@ -68,6 +68,11 @@ extern const union __nan_un { #define MATH_ERREXCEPT 2 #define math_errhandling MATH_ERREXCEPT +#ifdef __ia64__ +#define FP_FAST_FMA +#endif +#define FP_FAST_FMAF + /* Symbolic constants to classify floating point numbers. */ #define FP_INFINITE 0x01 #define FP_NAN 0x02 diff --git a/lib/msun/src/s_fma.c b/lib/msun/src/s_fma.c new file mode 100644 index 000000000000..401c2c79460b --- /dev/null +++ b/lib/msun/src/s_fma.c @@ -0,0 +1,178 @@ +/*- + * Copyright (c) 2005 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +/* + * Fused multiply-add: Compute x * y + z with a single rounding error. + * + * We use scaling to avoid overflow/underflow, along with the + * canonical precision-doubling technique adapted from: + * + * Dekker, T. A Floating-Point Technique for Extending the + * Available Precision. Numer. Math. 18, 224-242 (1971). + * + * This algorithm is sensitive to the rounding precision. FPUs such + * as the i387 must be set in double-precision mode if variables are + * to be stored in FP registers in order to avoid incorrect results. + * This is the default on FreeBSD, but not on many other systems. + * + * Tests on an Itanium 2 indicate that the hardware's FMA instruction + * is almost twice as fast as this implementation. The hardware + * instruction should be used on platforms that support it. + * + * XXX May incur an absolute error of 0x1p-1074 for subnormal results + * due to double rounding induced by the final scaling operation. + * + * XXX On machines supporting quad precision, we should use that, but + * see the caveat in s_fmaf.c. + */ +double +fma(double x, double y, double z) +{ + static const double split = 0x1p27 + 1.0; + double xs, ys, zs; + double c, cc, hx, hy, p, q, tx, ty; + double r, rr, s; + int oround; + int ex, ey, ez; + int spread; + + if (x == 0.0 || y == 0.0) + return (z); + if (z == 0.0) + return (x * y); + + /* Results of frexp() are undefined for these cases. */ + if (!isfinite(x) || !isfinite(y) || !isfinite(z)) + return (x * y + z); + + xs = frexp(x, &ex); + ys = frexp(y, &ey); + zs = frexp(z, &ez); + oround = fegetround(); + spread = ex + ey - ez; + + /* + * If x * y and z are many orders of magnitude apart, the scaling + * will overflow, so we handle these cases specially. Rounding + * modes other than FE_TONEAREST are painful. + */ + if (spread > DBL_MANT_DIG * 2) { + fenv_t env; + feraiseexcept(FE_INEXACT); + switch(oround) { + case FE_TONEAREST: + return (x * y); + case FE_TOWARDZERO: + if (x > 0.0 ^ y < 0.0 ^ z < 0.0) + return (x * y); + feholdexcept(&env); + r = x * y; + if (!fetestexcept(FE_INEXACT)) + r = nextafter(r, 0); + feupdateenv(&env); + return (r); + case FE_DOWNWARD: + if (z > 0.0) + return (x * y); + feholdexcept(&env); + r = x * y; + if (!fetestexcept(FE_INEXACT)) + r = nextafter(r, -INFINITY); + feupdateenv(&env); + return (r); + default: /* FE_UPWARD */ + if (z < 0.0) + return (x * y); + feholdexcept(&env); + r = x * y; + if (!fetestexcept(FE_INEXACT)) + r = nextafter(r, INFINITY); + feupdateenv(&env); + return (r); + } + } + if (spread < -DBL_MANT_DIG) { + feraiseexcept(FE_INEXACT); + if (!isnormal(z)) + feraiseexcept(FE_UNDERFLOW); + switch (oround) { + case FE_TONEAREST: + return (z); + case FE_TOWARDZERO: + if (x > 0.0 ^ y < 0.0 ^ z < 0.0) + return (z); + else + return (nextafter(z, 0)); + case FE_DOWNWARD: + if (x > 0.0 ^ y < 0.0) + return (z); + else + return (nextafter(z, -INFINITY)); + default: /* FE_UPWARD */ + if (x > 0.0 ^ y < 0.0) + return (nextafter(z, INFINITY)); + else + return (z); + } + } + + /* + * Use Dekker's algorithm to perform the multiplication and + * subsequent addition in twice the machine precision. + * Arrange so that x * y = c + cc, and x * y + z = r + rr. + */ + fesetround(FE_TONEAREST); + + p = xs * split; + hx = xs - p; + hx += p; + tx = xs - hx; + + p = ys * split; + hy = ys - p; + hy += p; + ty = ys - hy; + + p = hx * hy; + q = hx * ty + tx * hy; + c = p + q; + cc = p - c + q + tx * ty; + + zs = ldexp(zs, -spread); + r = c + zs; + s = r - c; + rr = (c - (r - s)) + (zs - s) + cc; + + fesetround(oround); + return (ldexp(r + rr, ex + ey)); +} diff --git a/lib/msun/src/s_fmaf.c b/lib/msun/src/s_fmaf.c new file mode 100644 index 000000000000..1b483b740679 --- /dev/null +++ b/lib/msun/src/s_fmaf.c @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2005 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Fused multiply-add: Compute x * y + z with a single rounding error. + * + * A double has more than twice as much precision than a float, so + * direct double-precision arithmetic suffices. + * + * XXX We are relying on the compiler to convert from double to float + * using the current rounding mode and with the appropriate + * side-effects. But on at least one platform (gcc 3.4.2/sparc64), + * this appears to be too much to ask for. The precision + * reduction should be done manually. + */ +float +fmaf(float x, float y, float z) +{ + + return ((double)x * y + z); +}