freebsd-dev/sys/i386/i386/math_emulate.c

/*
 * linux/kernel/math/math_emulate.c
 *
 * (C) 1991 Linus Torvalds
 *
 * [expediant "port" of linux 8087 emulator to 386BSD, with apologies -wfj]
 *
 *	from: 386BSD 0.1
 *	$Id: math_emulate.c,v 1.33 1999/01/28 01:59:50 dillon Exp $
 */

/*
 * Limited emulation 27.12.91 - mostly loads/stores, which gcc wants
 * even for soft-float, unless you use bruce evans' patches. The patches
 * are great, but they have to be re-applied for every version, and the
 * library is different for soft-float and 80387. So emulation is more
 * practical, even though it's slower.
 *
 * 28.12.91 - loads/stores work, even BCD. I'll have to start thinking
 * about add/sub/mul/div. Urgel. I should find some good source, but I'll
 * just fake up something.
 *
 * 30.12.91 - add/sub/mul/div/com seem to work mostly. I should really
 * test every possible combination.
 */

/*
 * This file is full of ugly macros etc: one problem was that gcc simply
 * didn't want to make the structures as they should be: it has to try to
 * align them. Sickening code, but at least I've hidden the ugly things
 * in this one file: the other files don't need to know about these things.
 *
 * The other files also don't care about ST(x) etc - they just get addresses
 * to 80-bit temporary reals, and do with them as they please. I wanted to
 * hide most of the 387-specific things here.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <machine/frame.h>
#include <machine/reg.h>

#include <sys/proc.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/user.h>

#define __ALIGNED_TEMP_REAL 1
#include <i386/i386/math_emu.h>

#define bswapw(x) __asm__("xchgb %%al,%%ah":"=a" (x):"0" ((short)x))
#define ST(x) (*__st((x)))
#define PST(x) ((const temp_real *) __st((x)))
#define	math_abort(tfp, signo) tfp->tf_eip = oldeip; return (signo);

/*
 * We don't want these inlined - it gets too messy in the machine-code.
 */
static void fpop(void);
static void fpush(void);
static void fxchg(temp_real_unaligned * a, temp_real_unaligned * b);
static temp_real_unaligned * __st(int i);

static unsigned char
get_fs_byte(char *adr)
	{ return(fubyte(adr)); }

static unsigned short
get_fs_word(unsigned short *adr)
	{ return(fuword(adr)); }

static u_int32_t
get_fs_long(u_int32_t *adr)
	{ return(fuword(adr)); }

static void
put_fs_byte(unsigned char val, char *adr)
	{ (void)subyte(adr,val); }

static void
put_fs_word(unsigned short val, short *adr)
	{ (void)susword(adr,val); }

static void
put_fs_long(u_long val, u_int32_t *adr)
	{ (void)suword(adr,val); }

static int
math_emulate(struct trapframe * info)
{
	unsigned short code;
	temp_real tmp;
	char * address;
	u_int32_t oldeip;

	/* ever used fp? */
	if ((((struct pcb *)curproc->p_addr)->pcb_flags & FP_SOFTFP) == 0) {
		((struct pcb *)curproc->p_addr)->pcb_flags |= FP_SOFTFP;
		I387.cwd = 0x037f;
		I387.swd = 0x0000;
		I387.twd = 0x0000;
	}

	if (I387.cwd & I387.swd & 0x3f)
		I387.swd |= 0x8000;
	else
		I387.swd &= 0x7fff;
	oldeip = info->tf_eip;
/* 0x001f means user code space */
	if ((u_short)info->tf_cs != 0x001F) {
		printf("math_emulate: %04x:%08lx\n", (u_short)info->tf_cs,
			(u_long)oldeip);
		panic("?Math emulation needed in kernel?");
	}
	/* completely ignore an operand-size prefix */
	if (get_fs_byte((char *) info->tf_eip) == 0x66)
		info->tf_eip++;
	code = get_fs_word((unsigned short *) info->tf_eip);
	bswapw(code);
	code &= 0x7ff;
	I387.fip = oldeip;
	*(unsigned short *) &I387.fcs = (u_short) info->tf_cs;
	*(1+(unsigned short *) &I387.fcs) = code;
	info->tf_eip += 2;
	switch (code) {
		case 0x1d0: /* fnop */
			return(0);
		case 0x1d1: case 0x1d2: case 0x1d3:  /* fst to 32-bit mem */
		case 0x1d4: case 0x1d5: case 0x1d6: case 0x1d7:
			math_abort(info,SIGILL);
		case 0x1e0: /* fchs */
			ST(0).exponent ^= 0x8000;
			return(0);
		case 0x1e1: /* fabs */
			ST(0).exponent &= 0x7fff;
			return(0);
		case 0x1e2: case 0x1e3:
			math_abort(info,SIGILL);
		case 0x1e4: /* ftst */
			ftst(PST(0));
			return(0);
		case 0x1e5: /* fxam */
			printf("fxam not implemented\n");
			math_abort(info,SIGILL);
		case 0x1e6: case 0x1e7: /* fldenv */
			math_abort(info,SIGILL);
		case 0x1e8: /* fld1 */
			fpush();
			ST(0) = CONST1;
			return(0);
		case 0x1e9: /* fld2t */
			fpush();
			ST(0) = CONSTL2T;
			return(0);
		case 0x1ea: /* fld2e */
			fpush();
			ST(0) = CONSTL2E;
			return(0);
		case 0x1eb: /* fldpi */
			fpush();
			ST(0) = CONSTPI;
			return(0);
		case 0x1ec: /* fldlg2 */
			fpush();
			ST(0) = CONSTLG2;
			return(0);
		case 0x1ed: /* fldln2 */
			fpush();
			ST(0) = CONSTLN2;
			return(0);
		case 0x1ee: /* fldz */
			fpush();
			ST(0) = CONSTZ;
			return(0);
		case 0x1ef:
			math_abort(info,SIGILL);
		case 0x1f0: /* f2xm1 */
		case 0x1f1: /* fyl2x */
		case 0x1f2: /* fptan */
		case 0x1f3: /* fpatan */
		case 0x1f4: /* fxtract */
		case 0x1f5: /* fprem1 */
		case 0x1f6: /* fdecstp */
		case 0x1f7: /* fincstp */
		case 0x1f8: /* fprem */
		case 0x1f9: /* fyl2xp1 */
		case 0x1fa: /* fsqrt */
		case 0x1fb: /* fsincos */
		case 0x1fe: /* fsin */
		case 0x1ff: /* fcos */
			uprintf(
			 "math_emulate: instruction %04x not implemented\n",
			  code + 0xd800);
			math_abort(info,SIGILL);
		case 0x1fc: /* frndint */
			frndint(PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x1fd: /* fscale */
			/* incomplete and totally inadequate -wfj */
			Fscale(PST(0), PST(1), &tmp);
			real_to_real(&tmp,&ST(0));
			return(0);			/* 19 Sep 92*/
		case 0x2e9: /* ????? */
/* if this should be a fucomp ST(0),ST(1) , it must be a 0x3e9  ATS */
			fucom(PST(1),PST(0));
			fpop(); fpop();
			return(0);
		case 0x3d0: case 0x3d1: /* fist ?? */
			return(0);
		case 0x3e2: /* fclex */
			I387.swd &= 0x7f00;
			return(0);
		case 0x3e3: /* fninit */
			I387.cwd = 0x037f;
			I387.swd = 0x0000;
			I387.twd = 0x0000;
			return(0);
		case 0x3e4:
			return(0);
		case 0x6d9: /* fcompp */
			fcom(PST(1),PST(0));
			fpop(); fpop();
			return(0);
		case 0x7e0: /* fstsw ax */
			*(short *) &info->tf_eax = I387.swd;
			return(0);
	}
	switch (code >> 3) {
		case 0x18: /* fadd */
			fadd(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x19: /* fmul */
			fmul(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x1a: /* fcom */
			fcom(PST(code & 7),PST(0));
			return(0);
		case 0x1b: /* fcomp */
			fcom(PST(code & 7),PST(0));
			fpop();
			return(0);
		case 0x1c: /* fsubr */
			real_to_real(&ST(code & 7),&tmp);
			tmp.exponent ^= 0x8000;
			fadd(PST(0),&tmp,&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x1d: /* fsub */
			ST(0).exponent ^= 0x8000;
			fadd(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x1e: /* fdivr */
			fdiv(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x1f: /* fdiv */
			fdiv(PST(code & 7),PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x38: /* fld */
			fpush();
			ST(0) = ST((code & 7)+1);  /* why plus 1 ????? ATS */
			return(0);
		case 0x39: /* fxch */
			fxchg(&ST(0),&ST(code & 7));
			return(0);
		case 0x3b: /*  ??? ??? wrong ???? ATS */
			ST(code & 7) = ST(0);
			fpop();
			return(0);
		case 0x98: /* fadd */
			fadd(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0x99: /* fmul */
			fmul(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0x9a: /* ???? , my manual don't list a direction bit
for fcom , ??? ATS */
			fcom(PST(code & 7),PST(0));
			return(0);
		case 0x9b: /* same as above , ATS */
			fcom(PST(code & 7),PST(0));
			fpop();
			return(0);
		case 0x9c: /* fsubr */
			ST(code & 7).exponent ^= 0x8000;
			fadd(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0x9d: /* fsub */
			real_to_real(&ST(0),&tmp);
			tmp.exponent ^= 0x8000;
			fadd(PST(code & 7),&tmp,&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0x9e: /* fdivr */
			fdiv(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0x9f: /* fdiv */
			fdiv(PST(code & 7),PST(0),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			return(0);
		case 0xb8: /* ffree */
			printf("ffree not implemented\n");
			math_abort(info,SIGILL);
		case 0xb9: /* fstp ???? where is the pop ? ATS */
			fxchg(&ST(0),&ST(code & 7));
			return(0);
		case 0xba: /* fst */
			ST(code & 7) = ST(0);
			return(0);
		case 0xbb: /* ????? encoding of fstp to mem ? ATS */
			ST(code & 7) = ST(0);
			fpop();
			return(0);
		case 0xbc: /* fucom */
			fucom(PST(code & 7),PST(0));
			return(0);
		case 0xbd: /* fucomp */
			fucom(PST(code & 7),PST(0));
			fpop();
			return(0);
		case 0xd8: /* faddp */
			fadd(PST(code & 7),PST(0),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xd9: /* fmulp */
			fmul(PST(code & 7),PST(0),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xda: /* ??? encoding of ficom with 16 bit mem ? ATS */
			fcom(PST(code & 7),PST(0));
			fpop();
			return(0);
		case 0xdc: /* fsubrp */
			ST(code & 7).exponent ^= 0x8000;
			fadd(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xdd: /* fsubp */
			real_to_real(&ST(0),&tmp);
			tmp.exponent ^= 0x8000;
			fadd(PST(code & 7),&tmp,&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xde: /* fdivrp */
			fdiv(PST(0),PST(code & 7),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xdf: /* fdivp */
			fdiv(PST(code & 7),PST(0),&tmp);
			real_to_real(&tmp,&ST(code & 7));
			fpop();
			return(0);
		case 0xf8: /* fild 16-bit mem ???? ATS */
			printf("ffree not implemented\n");
			math_abort(info,SIGILL);
			fpop();
			return(0);
		case 0xf9: /*  ????? ATS */
			fxchg(&ST(0),&ST(code & 7));
			return(0);
		case 0xfa: /* fist 16-bit mem ? ATS */
		case 0xfb: /* fistp 16-bit mem ? ATS */
			ST(code & 7) = ST(0);
			fpop();
			return(0);
	}
	switch ((code>>3) & 0xe7) {
		case 0x22:
			put_short_real(PST(0),info,code);
			return(0);
		case 0x23:
			put_short_real(PST(0),info,code);
			fpop();
			return(0);
		case 0x24:
			address = ea(info,code);
			for (code = 0 ; code < 7 ; code++) {
				((int32_t *) & I387)[code] =
				   get_fs_long((u_int32_t *) address);
				address += 4;
			}
			return(0);
		case 0x25:
			address = ea(info,code);
			*(unsigned short *) &I387.cwd =
				get_fs_word((unsigned short *) address);
			return(0);
		case 0x26:
			address = ea(info,code);
			/*verify_area(address,28);*/
			for (code = 0 ; code < 7 ; code++) {
				put_fs_long( ((int32_t *) & I387)[code],
					(u_int32_t *) address);
				address += 4;
			}
			return(0);
		case 0x27:
			address = ea(info,code);
			/*verify_area(address,2);*/
			put_fs_word(I387.cwd,(short *) address);
			return(0);
		case 0x62:
			put_long_int(PST(0),info,code);
			return(0);
		case 0x63:
			put_long_int(PST(0),info,code);
			fpop();
			return(0);
		case 0x65:
			fpush();
			get_temp_real(&tmp,info,code);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0x67:
			put_temp_real(PST(0),info,code);
			fpop();
			return(0);
		case 0xa2:
			put_long_real(PST(0),info,code);
			return(0);
		case 0xa3:
			put_long_real(PST(0),info,code);
			fpop();
			return(0);
		case 0xa4:
			address = ea(info,code);
			for (code = 0 ; code < 27 ; code++) {
				((int32_t *) & I387)[code] =
				   get_fs_long((u_int32_t *) address);
				address += 4;
			}
			return(0);
		case 0xa6:
			address = ea(info,code);
			/*verify_area(address,108);*/
			for (code = 0 ; code < 27 ; code++) {
				put_fs_long( ((int32_t *) & I387)[code],
					(u_int32_t *) address);
				address += 4;
			}
			I387.cwd = 0x037f;
			I387.swd = 0x0000;
			I387.twd = 0x0000;
			return(0);
		case 0xa7:
			address = ea(info,code);
			/*verify_area(address,2);*/
			put_fs_word(I387.swd,(short *) address);
			return(0);
		case 0xe2:
			put_short_int(PST(0),info,code);
			return(0);
		case 0xe3:
			put_short_int(PST(0),info,code);
			fpop();
			return(0);
		case 0xe4:
			fpush();
			get_BCD(&tmp,info,code);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0xe5:
			fpush();
			get_longlong_int(&tmp,info,code);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 0xe6:
			put_BCD(PST(0),info,code);
			fpop();
			return(0);
		case 0xe7:
			put_longlong_int(PST(0),info,code);
			fpop();
			return(0);
	}
	switch (code >> 9) {
		case 0:
			get_short_real(&tmp,info,code);
			break;
		case 1:
			get_long_int(&tmp,info,code);
			break;
		case 2:
			get_long_real(&tmp,info,code);
			break;
		case 4:
			get_short_int(&tmp,info,code);
	}
	switch ((code>>3) & 0x27) {
		case 0:
			fadd(&tmp,PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 1:
			fmul(&tmp,PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 2:
			fcom(&tmp,PST(0));
			return(0);
		case 3:
			fcom(&tmp,PST(0));
			fpop();
			return(0);
		case 4:
			tmp.exponent ^= 0x8000;
			fadd(&tmp,PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 5:
			ST(0).exponent ^= 0x8000;
			fadd(&tmp,PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 6:
			fdiv(PST(0),&tmp,&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
		case 7:
			fdiv(&tmp,PST(0),&tmp);
			real_to_real(&tmp,&ST(0));
			return(0);
	}
	if ((code & 0x138) == 0x100) {
			fpush();
			real_to_real(&tmp,&ST(0));
			return(0);
	}
	printf("Unknown math-insns: %04x:%08x %04x\n",(u_short)info->tf_cs,
		info->tf_eip,code);
	math_abort(info,SIGFPE);
}

static void
fpop(void)
{
	u_int32_t tmp;

	tmp = I387.swd & 0xffffc7ffUL;
	I387.swd += 0x00000800;
	I387.swd &= 0x00003800;
	I387.swd |= tmp;
}

static void
fpush(void)
{
	u_int32_t tmp;

	tmp = I387.swd & 0xffffc7ffUL;
	I387.swd += 0x00003800;
	I387.swd &= 0x00003800;
	I387.swd |= tmp;
}

static void
fxchg(temp_real_unaligned * a, temp_real_unaligned * b)
{
	temp_real_unaligned c;

	c = *a;
	*a = *b;
	*b = c;
}

static temp_real_unaligned *
__st(int i)
{
	i += I387.swd >> 11;
	i &= 7;
	return (temp_real_unaligned *) (i*10 + (char *)(I387.st_space));
}

/*
 * linux/kernel/math/ea.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * Calculate the effective address.
 */


static int __regoffset[] = {
	tEAX, tECX, tEDX, tEBX, tESP, tEBP, tESI, tEDI
};

#define REG(x) (((int *)curproc->p_md.md_regs)[__regoffset[(x)]])

static char *
sib(struct trapframe * info, int mod)
{
	unsigned char ss,index,base;
	int32_t offset = 0;

	base = get_fs_byte((char *) info->tf_eip);
	info->tf_eip++;
	ss = base >> 6;
	index = (base >> 3) & 7;
	base &= 7;
	if (index == 4)
		offset = 0;
	else
		offset = REG(index);
	offset <<= ss;
	if (mod || base != 5)
		offset += REG(base);
	if (mod == 1) {
		offset += (signed char) get_fs_byte((char *) info->tf_eip);
		info->tf_eip++;
	} else if (mod == 2 || base == 5) {
		offset += (signed) get_fs_long((u_int32_t *) info->tf_eip);
		info->tf_eip += 4;
	}
	I387.foo = offset;
	I387.fos = 0x17;
	return (char *) offset;
}

static char *
ea(struct trapframe * info, unsigned short code)
{
	unsigned char mod,rm;
	int32_t * tmp;
	int offset = 0;

	mod = (code >> 6) & 3;
	rm = code & 7;
	if (rm == 4 && mod != 3)
		return sib(info,mod);
	if (rm == 5 && !mod) {
		offset = get_fs_long((u_int32_t *) info->tf_eip);
		info->tf_eip += 4;
		I387.foo = offset;
		I387.fos = 0x17;
		return (char *) offset;
	}
	tmp = (int32_t *) &REG(rm);
	switch (mod) {
		case 0: offset = 0; break;
		case 1:
			offset = (signed char) get_fs_byte((char *) info->tf_eip);
			info->tf_eip++;
			break;
		case 2:
			offset = (signed) get_fs_long((u_int32_t *) info->tf_eip);
			info->tf_eip += 4;
			break;
#ifdef notyet
		case 3:
			math_abort(info,1<<(SIGILL-1));
#endif
	}
	I387.foo = offset;
	I387.fos = 0x17;
	return offset + (char *) *tmp;
}
/*
 * linux/kernel/math/get_put.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * This file handles all accesses to user memory: getting and putting
 * ints/reals/BCD etc. This is the only part that concerns itself with
 * other than temporary real format. All other cals are strictly temp_real.
 */

static void
get_short_real(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;
	short_real sr;

	addr = ea(info,code);
	sr = get_fs_long((u_int32_t *) addr);
	short_to_temp(&sr,tmp);
}

static void
get_long_real(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;
	long_real lr;

	addr = ea(info,code);
	lr.a = get_fs_long((u_int32_t *) addr);
	lr.b = get_fs_long(1 + (u_int32_t *) addr);
	long_to_temp(&lr,tmp);
}

static void
get_temp_real(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;

	addr = ea(info,code);
	tmp->a = get_fs_long((u_int32_t *) addr);
	tmp->b = get_fs_long(1 + (u_int32_t *) addr);
	tmp->exponent = get_fs_word(4 + (unsigned short *) addr);
}

static void
get_short_int(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	ti.a = (signed short) get_fs_word((unsigned short *) addr);
	ti.b = 0;
	if ((ti.sign = (ti.a < 0)) != 0)
		ti.a = - ti.a;
	int_to_real(&ti,tmp);
}

static void
get_long_int(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	ti.a = get_fs_long((u_int32_t *) addr);
	ti.b = 0;
	if ((ti.sign = (ti.a < 0)) != 0)
		ti.a = - ti.a;
	int_to_real(&ti,tmp);
}

static void
get_longlong_int(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	ti.a = get_fs_long((u_int32_t *) addr);
	ti.b = get_fs_long(1 + (u_int32_t *) addr);
	if ((ti.sign = (ti.b < 0)) != 0)
		__asm__("notl %0 ; notl %1\n\t"
			"addl $1,%0 ; adcl $0,%1"
			:"=r" (ti.a),"=r" (ti.b)
			:"0" (ti.a),"1" (ti.b));
	int_to_real(&ti,tmp);
}

#define MUL10(low,high) \
__asm__("addl %0,%0 ; adcl %1,%1\n\t" \
"movl %0,%%ecx ; movl %1,%%ebx\n\t" \
"addl %0,%0 ; adcl %1,%1\n\t" \
"addl %0,%0 ; adcl %1,%1\n\t" \
"addl %%ecx,%0 ; adcl %%ebx,%1" \
:"=a" (low),"=d" (high) \
:"0" (low),"1" (high):"cx","bx")

#define ADD64(val,low,high) \
__asm__("addl %4,%0 ; adcl $0,%1":"=r" (low),"=r" (high) \
:"0" (low),"1" (high),"r" ((u_int32_t) (val)))

static void
get_BCD(temp_real * tmp, struct trapframe * info, unsigned short code)
{
	int k;
	char * addr;
	temp_int i;
	unsigned char c;

	addr = ea(info,code);
	addr += 9;
	i.sign = 0x80 & get_fs_byte(addr--);
	i.a = i.b = 0;
	for (k = 0; k < 9; k++) {
		c = get_fs_byte(addr--);
		MUL10(i.a, i.b);
		ADD64((c>>4), i.a, i.b);
		MUL10(i.a, i.b);
		ADD64((c&0xf), i.a, i.b);
	}
	int_to_real(&i,tmp);
}

static void
put_short_real(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;
	short_real sr;

	addr = ea(info,code);
	/*verify_area(addr,4);*/
	temp_to_short(tmp,&sr);
	put_fs_long(sr,(u_int32_t *) addr);
}

static void
put_long_real(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;
	long_real lr;

	addr = ea(info,code);
	/*verify_area(addr,8);*/
	temp_to_long(tmp,&lr);
	put_fs_long(lr.a, (u_int32_t *) addr);
	put_fs_long(lr.b, 1 + (u_int32_t *) addr);
}

static void
put_temp_real(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;

	addr = ea(info,code);
	/*verify_area(addr,10);*/
	put_fs_long(tmp->a, (u_int32_t *) addr);
	put_fs_long(tmp->b, 1 + (u_int32_t *) addr);
	put_fs_word(tmp->exponent, 4 + (short *) addr);
}

static void
put_short_int(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	real_to_int(tmp,&ti);
	/*verify_area(addr,2);*/
	if (ti.sign)
		ti.a = -ti.a;
	put_fs_word(ti.a,(short *) addr);
}

static void
put_long_int(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	real_to_int(tmp,&ti);
	/*verify_area(addr,4);*/
	if (ti.sign)
		ti.a = -ti.a;
	put_fs_long(ti.a,(u_int32_t *) addr);
}

static void
put_longlong_int(const temp_real * tmp,
	struct trapframe * info, unsigned short code)
{
	char * addr;
	temp_int ti;

	addr = ea(info,code);
	real_to_int(tmp,&ti);
	/*verify_area(addr,8);*/
	if (ti.sign)
		__asm__("notl %0 ; notl %1\n\t"
			"addl $1,%0 ; adcl $0,%1"
			:"=r" (ti.a),"=r" (ti.b)
			:"0" (ti.a),"1" (ti.b));
	put_fs_long(ti.a,(u_int32_t *) addr);
	put_fs_long(ti.b,1 + (u_int32_t *) addr);
}

#define DIV10(low,high,rem) \
__asm__("divl %6 ; xchgl %1,%2 ; divl %6" \
	:"=d" (rem),"=a" (low),"=r" (high) \
	:"0" (0),"1" (high),"2" (low),"c" (10))

static void
put_BCD(const temp_real * tmp,struct trapframe * info, unsigned short code)
{
	int k,rem;
	char * addr;
	temp_int i;
	unsigned char c;

	addr = ea(info,code);
	/*verify_area(addr,10);*/
	real_to_int(tmp,&i);
	if (i.sign)
		put_fs_byte(0x80, addr+9);
	else
		put_fs_byte(0, addr+9);
	for (k = 0; k < 9; k++) {
		DIV10(i.a,i.b,rem);
		c = rem;
		DIV10(i.a,i.b,rem);
		c += rem<<4;
		put_fs_byte(c,addr++);
	}
}

/*
 * linux/kernel/math/mul.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * temporary real multiplication routine.
 */


static void
shift(int * c)
{
	__asm__("movl (%0),%%eax ; addl %%eax,(%0)\n\t"
		"movl 4(%0),%%eax ; adcl %%eax,4(%0)\n\t"
		"movl 8(%0),%%eax ; adcl %%eax,8(%0)\n\t"
		"movl 12(%0),%%eax ; adcl %%eax,12(%0)"
		::"r" (c):"ax");
}

static void
mul64(const temp_real * a, const temp_real * b, int * c)
{
	__asm__("movl (%0),%%eax\n\t"
		"mull (%1)\n\t"
		"movl %%eax,(%2)\n\t"
		"movl %%edx,4(%2)\n\t"
		"movl 4(%0),%%eax\n\t"
		"mull 4(%1)\n\t"
		"movl %%eax,8(%2)\n\t"
		"movl %%edx,12(%2)\n\t"
		"movl (%0),%%eax\n\t"
		"mull 4(%1)\n\t"
		"addl %%eax,4(%2)\n\t"
		"adcl %%edx,8(%2)\n\t"
		"adcl $0,12(%2)\n\t"
		"movl 4(%0),%%eax\n\t"
		"mull (%1)\n\t"
		"addl %%eax,4(%2)\n\t"
		"adcl %%edx,8(%2)\n\t"
		"adcl $0,12(%2)"
		::"S" (a),"c" (b),"D" (c)
		:"ax","dx");
}

static void
fmul(const temp_real * src1, const temp_real * src2, temp_real * result)
{
	int i,sign;
	int tmp[4] = {0,0,0,0};

	sign = (src1->exponent ^ src2->exponent) & 0x8000;
	i = (src1->exponent & 0x7fff) + (src2->exponent & 0x7fff) - 16383 + 1;
	if (i<0) {
		result->exponent = sign;
		result->a = result->b = 0;
		return;
	}
	if (i>0x7fff) {
		set_OE();
		return;
	}
	mul64(src1,src2,tmp);
	if (tmp[0] || tmp[1] || tmp[2] || tmp[3])
		while (i && tmp[3] >= 0) {
			i--;
			shift(tmp);
		}
	else
		i = 0;
	result->exponent = i | sign;
	result->a = tmp[2];
	result->b = tmp[3];
}

/*
 * linux/kernel/math/div.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * temporary real division routine.
 */

static void
shift_left(int * c)
{
	__asm__ __volatile__("movl (%0),%%eax ; addl %%eax,(%0)\n\t"
		"movl 4(%0),%%eax ; adcl %%eax,4(%0)\n\t"
		"movl 8(%0),%%eax ; adcl %%eax,8(%0)\n\t"
		"movl 12(%0),%%eax ; adcl %%eax,12(%0)"
		::"r" (c):"ax");
}

static void
shift_right(int * c)
{
	__asm__("shrl $1,12(%0) ; rcrl $1,8(%0) ; rcrl $1,4(%0) ; rcrl $1,(%0)"
		::"r" (c));
}

static int
try_sub(int * a, int * b)
{
	char ok;

	__asm__ __volatile__("movl (%1),%%eax ; subl %%eax,(%2)\n\t"
		"movl 4(%1),%%eax ; sbbl %%eax,4(%2)\n\t"
		"movl 8(%1),%%eax ; sbbl %%eax,8(%2)\n\t"
		"movl 12(%1),%%eax ; sbbl %%eax,12(%2)\n\t"
		"setae %%al":"=a" (ok):"c" (a),"d" (b));
	return ok;
}

static void
div64(int * a, int * b, int * c)
{
	int tmp[4];
	int i;
	unsigned int mask = 0;

	c += 4;
	for (i = 0 ; i<64 ; i++) {
		if (!(mask >>= 1)) {
			c--;
			mask = 0x80000000UL;
		}
		tmp[0] = a[0]; tmp[1] = a[1];
		tmp[2] = a[2]; tmp[3] = a[3];
		if (try_sub(b,tmp)) {
			*c |= mask;
			a[0] = tmp[0]; a[1] = tmp[1];
			a[2] = tmp[2]; a[3] = tmp[3];
		}
		shift_right(b);
	}
}

static void
fdiv(const temp_real * src1, const temp_real * src2, temp_real * result)
{
	int i,sign;
	int a[4],b[4],tmp[4] = {0,0,0,0};

	sign = (src1->exponent ^ src2->exponent) & 0x8000;
	if (!(src2->a || src2->b)) {
		set_ZE();
		return;
	}
	i = (src1->exponent & 0x7fff) - (src2->exponent & 0x7fff) + 16383;
	if (i<0) {
		set_UE();
		result->exponent = sign;
		result->a = result->b = 0;
		return;
	}
	a[0] = a[1] = 0;
	a[2] = src1->a;
	a[3] = src1->b;
	b[0] = b[1] = 0;
	b[2] = src2->a;
	b[3] = src2->b;
	while (b[3] >= 0) {
		i++;
		shift_left(b);
	}
	div64(a,b,tmp);
	if (tmp[0] || tmp[1] || tmp[2] || tmp[3]) {
		while (i && tmp[3] >= 0) {
			i--;
			shift_left(tmp);
		}
		if (tmp[3] >= 0)
			set_DE();
	} else
		i = 0;
	if (i>0x7fff) {
		set_OE();
		return;
	}
	if (tmp[0] || tmp[1])
		set_PE();
	result->exponent = i | sign;
	result->a = tmp[2];
	result->b = tmp[3];
}

/*
 * linux/kernel/math/add.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * temporary real addition routine.
 *
 * NOTE! These aren't exact: they are only 62 bits wide, and don't do
 * correct rounding. Fast hack. The reason is that we shift right the
 * values by two, in order not to have overflow (1 bit), and to be able
 * to move the sign into the mantissa (1 bit). Much simpler algorithms,
 * and 62 bits (61 really - no rounding) accuracy is usually enough. The
 * only time you should notice anything weird is when adding 64-bit
 * integers together. When using doubles (52 bits accuracy), the
 * 61-bit accuracy never shows at all.
 */

#define NEGINT(a) \
__asm__("notl %0 ; notl %1 ; addl $1,%0 ; adcl $0,%1" \
	:"=r" (a->a),"=r" (a->b) \
	:"0" (a->a),"1" (a->b))

static void signify(temp_real * a)
{
	a->exponent += 2;
	__asm__("shrdl $2,%1,%0 ; shrl $2,%1"
		:"=r" (a->a),"=r" (a->b)
		:"0" (a->a),"1" (a->b));
	if (a->exponent < 0)
		NEGINT(a);
	a->exponent &= 0x7fff;
}

static void unsignify(temp_real * a)
{
	if (!(a->a || a->b)) {
		a->exponent = 0;
		return;
	}
	a->exponent &= 0x7fff;
	if (a->b < 0) {
		NEGINT(a);
		a->exponent |= 0x8000;
	}
	while (a->b >= 0) {
		a->exponent--;
		__asm__("addl %0,%0 ; adcl %1,%1"
			:"=r" (a->a),"=r" (a->b)
			:"0" (a->a),"1" (a->b));
	}
}

static void
fadd(const temp_real * src1, const temp_real * src2, temp_real * result)
{
	temp_real a,b;
	int x1,x2,shift;

	x1 = src1->exponent & 0x7fff;
	x2 = src2->exponent & 0x7fff;
	if (x1 > x2) {
		a = *src1;
		b = *src2;
		shift = x1-x2;
	} else {
		a = *src2;
		b = *src1;
		shift = x2-x1;
	}
	if (shift >= 64) {
		*result = a;
		return;
	}
	if (shift >= 32) {
		b.a = b.b;
		b.b = 0;
		shift -= 32;
	}
	__asm__("shrdl %4,%1,%0 ; shrl %4,%1"
		:"=r" (b.a),"=r" (b.b)
		:"0" (b.a),"1" (b.b),"c" ((char) shift));
	signify(&a);
	signify(&b);
	__asm__("addl %4,%0 ; adcl %5,%1"
		:"=r" (a.a),"=r" (a.b)
		:"0" (a.a),"1" (a.b),"g" (b.a),"g" (b.b));
	unsignify(&a);
	*result = a;
}

/*
 * linux/kernel/math/compare.c
 *
 * (C) 1991 Linus Torvalds
 */

/*
 * temporary real comparison routines
 */


#define clear_Cx() (I387.swd &= ~0x4500)

static void
normalize(temp_real * a)
{
	int i = a->exponent & 0x7fff;
	int sign = a->exponent & 0x8000;

	if (!(a->a || a->b)) {
		a->exponent = 0;
		return;
	}
	while (i && a->b >= 0) {
		i--;
		__asm__("addl %0,%0 ; adcl %1,%1"
			:"=r" (a->a),"=r" (a->b)
			:"0" (a->a),"1" (a->b));
	}
	a->exponent = i | sign;
}

static void
ftst(const temp_real * a)
{
	temp_real b;

	clear_Cx();
	b = *a;
	normalize(&b);
	if (b.a || b.b || b.exponent) {
		if (b.exponent < 0)
			set_C0();
	} else
		set_C3();
}

static void
fcom(const temp_real * src1, const temp_real * src2)
{
	temp_real a;

	a = *src1;
	a.exponent ^= 0x8000;
	fadd(&a,src2,&a);
	ftst(&a);
}

static void
fucom(const temp_real * src1, const temp_real * src2)
{
	fcom(src1,src2);
}

/*
 * linux/kernel/math/convert.c
 *
 * (C) 1991 Linus Torvalds
 */


/*
 * NOTE!!! There is some "non-obvious" optimisations in the temp_to_long
 * and temp_to_short conversion routines: don't touch them if you don't
 * know what's going on. They are the adding of one in the rounding: the
 * overflow bit is also used for adding one into the exponent. Thus it
 * looks like the overflow would be incorrectly handled, but due to the
 * way the IEEE numbers work, things are correct.
 *
 * There is no checking for total overflow in the conversions, though (ie
 * if the temp-real number simply won't fit in a short- or long-real.)
 */

static void
short_to_temp(const short_real * a, temp_real * b)
{
	if (!(*a & 0x7fffffff)) {
		b->a = b->b = 0;
		if (*a)
			b->exponent = 0x8000;
		else
			b->exponent = 0;
		return;
	}
	b->exponent = ((*a>>23) & 0xff)-127+16383;
	if (*a<0)
		b->exponent |= 0x8000;
	b->b = (*a<<8) | 0x80000000UL;
	b->a = 0;
}

static void
long_to_temp(const long_real * a, temp_real * b)
{
	if (!a->a && !(a->b & 0x7fffffff)) {
		b->a = b->b = 0;
		if (a->b)
			b->exponent = 0x8000;
		else
			b->exponent = 0;
		return;
	}
	b->exponent = ((a->b >> 20) & 0x7ff)-1023+16383;
	if (a->b<0)
		b->exponent |= 0x8000;
	b->b = 0x80000000UL | (a->b<<11) | (((u_int32_t)a->a)>>21);
	b->a = a->a<<11;
}

static void
temp_to_short(const temp_real * a, short_real * b)
{
	if (!(a->exponent & 0x7fff)) {
		*b = (a->exponent)?0x80000000UL:0;
		return;
	}
	*b = ((((int32_t) a->exponent)-16383+127) << 23) & 0x7f800000;
	if (a->exponent < 0)
		*b |= 0x80000000UL;
	*b |= (a->b >> 8) & 0x007fffff;
	switch ((int)ROUNDING) {
		case ROUND_NEAREST:
			if ((a->b & 0xff) > 0x80)
				++*b;
			break;
		case ROUND_DOWN:
			if ((a->exponent & 0x8000) && (a->b & 0xff))
				++*b;
			break;
		case ROUND_UP:
			if (!(a->exponent & 0x8000) && (a->b & 0xff))
				++*b;
			break;
	}
}

static void
temp_to_long(const temp_real * a, long_real * b)
{
	if (!(a->exponent & 0x7fff)) {
		b->a = 0;
		b->b = (a->exponent)?0x80000000UL:0;
		return;
	}
	b->b = (((0x7fff & (int32_t) a->exponent)-16383+1023) << 20) &
	    0x7ff00000;
	if (a->exponent < 0)
		b->b |= 0x80000000UL;
	b->b |= (a->b >> 11) & 0x000fffff;
	b->a = a->b << 21;
	b->a |= (a->a >> 11) & 0x001fffff;
	switch ((int)ROUNDING) {
		case ROUND_NEAREST:
			if ((a->a & 0x7ff) > 0x400)
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
		case ROUND_DOWN:
			if ((a->exponent & 0x8000) && (a->b & 0xff))
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
		case ROUND_UP:
			if (!(a->exponent & 0x8000) && (a->b & 0xff))
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
	}
}

static void
frndint(const temp_real * a, temp_real * b)
{
	int shift =  16383 + 63 - (a->exponent & 0x7fff);
	u_int32_t underflow;

	if ((shift < 0) || (shift == 16383+63)) {
		*b = *a;
		return;
	}
	b->a = b->b = underflow = 0;
	b->exponent = a->exponent;
	if (shift < 32) {
		b->b = a->b; b->a = a->a;
	} else if (shift < 64) {
		b->a = a->b; underflow = a->a;
		shift -= 32;
		b->exponent += 32;
	} else if (shift < 96) {
		underflow = a->b;
		shift -= 64;
		b->exponent += 64;
	} else {
		underflow = 1;
		shift = 0;
	}
	b->exponent += shift;
	__asm__("shrdl %2,%1,%0"
		:"=r" (underflow),"=r" (b->a)
		:"c" ((char) shift),"0" (underflow),"1" (b->a));
	__asm__("shrdl %2,%1,%0"
		:"=r" (b->a),"=r" (b->b)
		:"c" ((char) shift),"0" (b->a),"1" (b->b));
	__asm__("shrl %1,%0"
		:"=r" (b->b)
		:"c" ((char) shift),"0" (b->b));
	switch ((int)ROUNDING) {
		case ROUND_NEAREST:
			__asm__("addl %4,%5 ; adcl $0,%0 ; adcl $0,%1"
				:"=r" (b->a),"=r" (b->b)
				:"0" (b->a),"1" (b->b)
				,"r" (0x7fffffff + (b->a & 1))
				,"m" (*&underflow));
			break;
		case ROUND_UP:
			if ((b->exponent >= 0) && underflow)
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
		case ROUND_DOWN:
			if ((b->exponent < 0) && underflow)
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
	}
	if (b->a || b->b)
		while (b->b >= 0) {
			b->exponent--;
			__asm__("addl %0,%0 ; adcl %1,%1"
				:"=r" (b->a),"=r" (b->b)
				:"0" (b->a),"1" (b->b));
		}
	else
		b->exponent = 0;
}

static void
Fscale(const temp_real *a, const temp_real *b, temp_real *c)
{
	temp_int ti;

	*c = *a;
	if(!c->a && !c->b) {				/* 19 Sep 92*/
		c->exponent = 0;
		return;
	}
	real_to_int(b, &ti);
	if(ti.sign)
		c->exponent -= ti.a;
	else
		c->exponent += ti.a;
}

static void
real_to_int(const temp_real * a, temp_int * b)
{
	int shift =  16383 + 63 - (a->exponent & 0x7fff);
	u_int32_t underflow;

	b->a = b->b = underflow = 0;
	b->sign = (a->exponent < 0);
	if (shift < 0) {
		set_OE();
		return;
	}
	if (shift < 32) {
		b->b = a->b; b->a = a->a;
	} else if (shift < 64) {
		b->a = a->b; underflow = a->a;
		shift -= 32;
	} else if (shift < 96) {
		underflow = a->b;
		shift -= 64;
	} else {
		underflow = 1;
		shift = 0;
	}
	__asm__("shrdl %2,%1,%0"
		:"=r" (underflow),"=r" (b->a)
		:"c" ((char) shift),"0" (underflow),"1" (b->a));
	__asm__("shrdl %2,%1,%0"
		:"=r" (b->a),"=r" (b->b)
		:"c" ((char) shift),"0" (b->a),"1" (b->b));
	__asm__("shrl %1,%0"
		:"=r" (b->b)
		:"c" ((char) shift),"0" (b->b));
	switch ((int)ROUNDING) {
		case ROUND_NEAREST:
			__asm__("addl %4,%5 ; adcl $0,%0 ; adcl $0,%1"
				:"=r" (b->a),"=r" (b->b)
				:"0" (b->a),"1" (b->b)
				,"r" (0x7fffffff + (b->a & 1))
				,"m" (*&underflow));
			break;
		case ROUND_UP:
			if (!b->sign && underflow)
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
		case ROUND_DOWN:
			if (b->sign && underflow)
				__asm__("addl $1,%0 ; adcl $0,%1"
					:"=r" (b->a),"=r" (b->b)
					:"0" (b->a),"1" (b->b));
			break;
	}
}

static void
int_to_real(const temp_int * a, temp_real * b)
{
	b->a = a->a;
	b->b = a->b;
	if (b->a || b->b)
		b->exponent = 16383 + 63 + (a->sign? 0x8000:0);
	else {
		b->exponent = 0;
		return;
	}
	while (b->b >= 0) {
		b->exponent--;
		__asm__("addl %0,%0 ; adcl %1,%1"
			:"=r" (b->a),"=r" (b->b)
			:"0" (b->a),"1" (b->b));
	}
}

static int
fpu_modevent(module_t mod, int type, void *unused)
{
	switch (type) {
	case MOD_LOAD:
		if (pmath_emulate) {
			printf("Another Math emulator already present\n");
			return EBUSY;
		}
		pmath_emulate = math_emulate;
		if (bootverbose)
			printf("Math emulator present\n");
		break;
	case MOD_UNLOAD:
		if (pmath_emulate != math_emulate) {
			printf("Cannot unload another math emulator\n");
			return EACCES;
		}
		pmath_emulate = 0;
		if (bootverbose)
			printf("Math emulator unloaded\n");
		break;
	default:
		break;
	}
	return 0;
}
static moduledata_t fpumod = {
	"fpu",
	fpu_modevent,
	0
};
DECLARE_MODULE(fpu, fpumod, SI_SUB_DRIVERS, SI_ORDER_ANY);