f4b01a9edf
k_tanf.c but with different details. The polynomial is odd with degree 13 for tanf() and odd with degree 9 for sinf(), so the details are not very different for sinf() -- the term with the x**11 and x**13 coefficients goes away, and (mysteriously) it helps to do the evaluation of w = z*z early although moving it later was a key optimization for tanf(). The details are different but simpler for cosf() because the polynomial is even and of lower degree. On Athlons, for uniformly distributed args in [-2pi, 2pi], this gives an optimization of about 4 cycles (10%) in most cases (13% for sinf() on AXP, but 0% for cosf() with gcc-3.3 -O1 on AXP). The best case (sinf() with gcc-3.4 -O1 -fcaller-saves on A64) now takes 33-39 cycles (was 37-45 cycles). Hardware sinf takes 74-129 cycles. Despite being fine-tuned for Athlons, the optimization is even larger on some other arches (about 15% on ia64 (pluto2) and 20% on alpha (beast) with gcc -O2 -fomit-frame-pointer). |
||
---|---|---|
.. | ||
bind | ||
csu | ||
libalias | ||
libarchive | ||
libatm | ||
libautofs | ||
libbegemot | ||
libbluetooth | ||
libbsnmp | ||
libbz2 | ||
libc | ||
libc_r | ||
libcalendar | ||
libcam | ||
libcom_err | ||
libcompat | ||
libcrypt | ||
libdevinfo | ||
libdevstat | ||
libdisk | ||
libedit | ||
libexpat | ||
libfetch | ||
libform | ||
libftpio | ||
libgeom | ||
libgpib | ||
libio | ||
libipsec | ||
libipx | ||
libkiconv | ||
libkse | ||
libkvm | ||
libmagic | ||
libmd | ||
libmemstat | ||
libmenu | ||
libmilter | ||
libmp | ||
libncp | ||
libncurses | ||
libnetgraph | ||
libngatm | ||
libopie | ||
libpam | ||
libpanel | ||
libpcap | ||
libpmc | ||
libpthread | ||
libradius | ||
librpcsvc | ||
libsbuf | ||
libsdp | ||
libsm | ||
libsmb | ||
libsmdb | ||
libsmutil | ||
libstand | ||
libtacplus | ||
libtelnet | ||
libthr | ||
libthread_db | ||
libufs | ||
libugidfw | ||
libusbhid | ||
libutil | ||
libvgl | ||
libwrap | ||
liby | ||
libypclnt | ||
libz | ||
msun | ||
ncurses | ||
Makefile | ||
Makefile.inc |