Fix fpset*() to not trap if there is a currently unmasked exception.

Unmasked exceptions (which can be fixed up using fpset*() before they
trap) are very rare, especially on amd64 since SSE exceptions trap
synchronously, but I want to merge the faster amd64 implementations of
fpset*() back to i386 without introducing the bug on i386.

The i386 implementation has always avoided the trap automatically by
changing things using load/store of the FP environment, but this is
very slow.  Most changes only affect the control word, so they can
usually be done much more efficiently, and amd64 has always done this,
but loading the control word can trap.

This version use the fast method only in the usual case where it will
not trap.  This only costs a couple of integer instructions (including
one branch which I haven't optimized carefully yet) in the usual case,
but bloats the inlines a lot.  The inlines were already a bit too large
to handle both the FPU and SSE.
This commit is contained in:
Bruce Evans 2008-01-11 17:11:32 +00:00
parent 89a828fde9
commit 31e30d75d5

View File

@ -125,11 +125,41 @@ typedef enum {
#ifdef __GNUCLIKE_ASM #ifdef __GNUCLIKE_ASM
#define __fldcw(addr) __asm __volatile("fldcw %0" : : "m" (*(addr))) #define __fldcw(addr) __asm __volatile("fldcw %0" : : "m" (*(addr)))
#define __fldenv(addr) __asm __volatile("fldenv %0" : : "m" (*(addr)))
#define __fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr))) #define __fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr)))
#define __fnstenv(addr) __asm __volatile("fnstenv %0" : "=m" (*(addr)))
#define __fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) #define __fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr)))
#define __ldmxcsr(addr) __asm __volatile("ldmxcsr %0" : : "m" (*(addr))) #define __ldmxcsr(addr) __asm __volatile("ldmxcsr %0" : : "m" (*(addr)))
#define __stmxcsr(addr) __asm __volatile("stmxcsr %0" : "=m" (*(addr))) #define __stmxcsr(addr) __asm __volatile("stmxcsr %0" : "=m" (*(addr)))
/*
* Load the control word. Be careful not to trap if there is a currently
* unmasked exception (ones that will become freshly unmasked are not a
* problem). This case must be handled by a save/restore of the
* environment or even of the full x87 state. Accessing the environment
* is very inefficient, so only do it when necessary.
*/
static __inline void
__fnldcw(unsigned short _cw, unsigned short _newcw)
{
struct {
unsigned _cw;
unsigned _other[6];
} _env;
unsigned short _sw;
if ((_cw & FP_MSKS_FLD) != FP_MSKS_FLD) {
__fnstsw(&_sw);
if (((_sw & ~_cw) & FP_STKY_FLD) != 0) {
__fnstenv(&_env);
_env._cw = _newcw;
__fldenv(&_env);
return;
}
}
__fldcw(&_newcw);
}
/* /*
* General notes about conflicting SSE vs FP status bits. * General notes about conflicting SSE vs FP status bits.
* This code assumes that software will not fiddle with the control * This code assumes that software will not fiddle with the control
@ -155,13 +185,13 @@ __fpsetround(fp_rnd_t _m)
{ {
fp_rnd_t _p; fp_rnd_t _p;
unsigned _mxcsr; unsigned _mxcsr;
unsigned short _cw; unsigned short _cw, _newcw;
__fnstcw(&_cw); __fnstcw(&_cw);
_p = (fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF); _p = (fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF);
_cw &= ~FP_RND_FLD; _newcw = _cw & ~FP_RND_FLD;
_cw |= (_m << FP_RND_OFF) & FP_RND_FLD; _newcw |= (_m << FP_RND_OFF) & FP_RND_FLD;
__fldcw(&_cw); __fnldcw(_cw, _newcw);
__stmxcsr(&_mxcsr); __stmxcsr(&_mxcsr);
_mxcsr &= ~SSE_RND_FLD; _mxcsr &= ~SSE_RND_FLD;
_mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD; _mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD;
@ -187,13 +217,13 @@ static __inline fp_prec_t
__fpsetprec(fp_prec_t _m) __fpsetprec(fp_prec_t _m)
{ {
fp_prec_t _p; fp_prec_t _p;
unsigned short _cw; unsigned short _cw, _newcw;
__fnstcw(&_cw); __fnstcw(&_cw);
_p = (fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF); _p = (fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF);
_cw &= ~FP_PRC_FLD; _newcw = _cw & ~FP_PRC_FLD;
_cw |= (_m << FP_PRC_OFF) & FP_PRC_FLD; _newcw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
__fldcw(&_cw); __fnldcw(_cw, _newcw);
return (_p); return (_p);
} }
@ -217,13 +247,13 @@ __fpsetmask(fp_except_t _m)
{ {
fp_except_t _p; fp_except_t _p;
unsigned _mxcsr; unsigned _mxcsr;
unsigned short _cw; unsigned short _cw, _newcw;
__fnstcw(&_cw); __fnstcw(&_cw);
_p = (~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF; _p = (~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF;
_cw &= ~FP_MSKS_FLD; _newcw = _cw & ~FP_MSKS_FLD;
_cw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD; _newcw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD;
__fldcw(&_cw); __fnldcw(_cw, _newcw);
__stmxcsr(&_mxcsr); __stmxcsr(&_mxcsr);
/* XXX should we clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD ? */ /* XXX should we clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD ? */
_mxcsr &= ~SSE_MSKS_FLD; _mxcsr &= ~SSE_MSKS_FLD;