From adbfd03a09e1604f626cfd4b65e3225e11b4822b Mon Sep 17 00:00:00 2001 From: Arvind Date: Tue, 28 Sep 2021 23:02:32 -0700 Subject: [PATCH] fpu: Backported helpers for fyl2x and fyl2xp1 from qemu 6.1.0 --- fpu/softfloat-specialize.h | 35 +++- fpu/softfloat.c | 46 +---- include/fpu/softfloat.h | 124 ++++++++++- target-i386/fpu_helper.c | 408 +++++++++++++++++++++++++++++++++++-- 4 files changed, 547 insertions(+), 66 deletions(-) diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h index fa1214a4..93ec6e73 100644 --- a/fpu/softfloat-specialize.h +++ b/fpu/softfloat-specialize.h @@ -145,9 +145,25 @@ const float64 float64_default_nan = const_float64(LIT64( 0xFFF8000000000000 )); #define floatx80_default_nan_low LIT64( 0xC000000000000000 ) #endif -const floatx80 floatx80_default_nan - = make_floatx80_init(floatx80_default_nan_high, floatx80_default_nan_low); +floatx80 floatx80_default_nan(float_status *status) +{ + floatx80 r; + + /* None of the targets that have snan_bit_is_one use floatx80. */ + #if SNAN_BIT_IS_ONE + # error SNAN_BIT_IS_ONE should not be set + #endif +#if defined(TARGET_M68K) + r.low = UINT64_C(0xFFFFFFFFFFFFFFFF); + r.high = 0x7FFF; +#else + /* X86 */ + r.low = UINT64_C(0xC000000000000000); + r.high = 0xFFFF; +#endif + return r; +} /*---------------------------------------------------------------------------- | The pattern for a default generated quadruple-precision NaN. The `high' and | `low' values hold the most- and least-significant bits, respectively. @@ -985,6 +1001,21 @@ floatx80 floatx80_maybe_silence_nan( floatx80 a ) return a; } +/*---------------------------------------------------------------------------- +| Returns a quiet NaN from a signalling NaN for the extended double-precision +| floating point value `a'. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_silence_nan(floatx80 a) +{ + /* None of the targets that have snan_bit_is_one use floatx80. */ + #if SNAN_BIT_IS_ONE + # error SNAN_BIT_IS_ONE should not be set + #endif + a.low |= UINT64_C(0xC000000000000000); + return a; +} + /*---------------------------------------------------------------------------- | Returns the result of converting the extended double-precision floating- | point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the diff --git a/fpu/softfloat.c b/fpu/softfloat.c index d039bda4..8d6b20c7 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -685,42 +685,6 @@ static float64 } -/*---------------------------------------------------------------------------- -| Returns the fraction bits of the extended double-precision floating-point -| value `a'. -*----------------------------------------------------------------------------*/ - -static inline uint64_t extractFloatx80Frac( floatx80 a ) -{ - - return a.low; - -} - -/*---------------------------------------------------------------------------- -| Returns the exponent bits of the extended double-precision floating-point -| value `a'. -*----------------------------------------------------------------------------*/ - -static inline int32 extractFloatx80Exp( floatx80 a ) -{ - - return a.high & 0x7FFF; - -} - -/*---------------------------------------------------------------------------- -| Returns the sign bit of the extended double-precision floating-point value -| `a'. -*----------------------------------------------------------------------------*/ - -static inline flag extractFloatx80Sign( floatx80 a ) -{ - - return a.high>>15; - -} - /*---------------------------------------------------------------------------- | Normalizes the subnormal extended double-precision floating-point value | represented by the denormalized significand `aSig'. The normalized exponent @@ -728,7 +692,7 @@ static inline flag extractFloatx80Sign( floatx80 a ) | `zSigPtr', respectively. *----------------------------------------------------------------------------*/ -static void +void normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) { int8 shiftCount; @@ -974,10 +938,10 @@ static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign, | normalized. *----------------------------------------------------------------------------*/ -static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision, - flag zSign, int32 zExp, - uint64_t zSig0, uint64_t zSig1, - float_status *status) +floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision, + flag zSign, int32 zExp, + uint64_t zSig0, uint64_t zSig1, + float_status *status) { int8 shiftCount; diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index ded34eb0..2ab0aa9b 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -111,12 +111,12 @@ typedef int64_t int64; /*---------------------------------------------------------------------------- | Software IEC/IEEE floating-point ordering relations *----------------------------------------------------------------------------*/ -enum { +typedef enum { float_relation_less = -1, float_relation_equal = 0, float_relation_greater = 1, float_relation_unordered = 2 -}; +} FloatRelation; /*---------------------------------------------------------------------------- | Software IEC/IEEE floating-point types. @@ -187,13 +187,18 @@ enum { /*---------------------------------------------------------------------------- | Software IEC/IEEE floating-point rounding mode. *----------------------------------------------------------------------------*/ -enum { + +typedef enum __attribute__((__packed__)) { float_round_nearest_even = 0, float_round_down = 1, float_round_up = 2, float_round_to_zero = 3, float_round_ties_away = 4, -}; + /* Not an IEEE rounding mode: round to closest odd, overflow to max */ + float_round_to_odd = 5, + /* Not an IEEE rounding mode: round to closest odd, overflow to inf */ + float_round_to_odd_inf = 6, +} FloatRoundMode; /*---------------------------------------------------------------------------- | Software IEC/IEEE floating-point exception flags. @@ -208,6 +213,15 @@ enum { float_flag_output_denormal = 128 }; +/* + * Rounding precision for floatx80. + */ +typedef enum __attribute__((__packed__)) { + floatx80_precision_x, + floatx80_precision_d, + floatx80_precision_s, +} FloatX80RoundPrec; + typedef struct float_status { signed char float_detect_tininess; signed char float_rounding_mode; @@ -628,6 +642,7 @@ int floatx80_is_quiet_nan( floatx80 ); int floatx80_is_signaling_nan( floatx80 ); floatx80 floatx80_maybe_silence_nan( floatx80 ); floatx80 floatx80_scalbn(floatx80, int, float_status *status); +floatx80 floatx80_silence_nan( floatx80 ); static inline floatx80 floatx80_abs(floatx80 a) { @@ -666,6 +681,45 @@ static inline int floatx80_is_any_nan(floatx80 a) return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1); } +/*---------------------------------------------------------------------------- +| Return whether the given value is an invalid floatx80 encoding. +| Invalid floatx80 encodings arise when the integer bit is not set, but +| the exponent is not zero. The only times the integer bit is permitted to +| be zero is in subnormal numbers and the value zero. +| This includes what the Intel software developer's manual calls pseudo-NaNs, +| pseudo-infinities and un-normal numbers. It does not include +| pseudo-denormals, which must still be correctly handled as inputs even +| if they are never generated as outputs. +*----------------------------------------------------------------------------*/ +static inline bool floatx80_invalid_encoding(floatx80 a) +{ +#if defined(TARGET_M68K) + /*------------------------------------------------------------------------- + | With m68k, the explicit integer bit can be zero in the case of: + | - zeros (exp == 0, mantissa == 0) + | - denormalized numbers (exp == 0, mantissa != 0) + | - unnormalized numbers (exp != 0, exp < 0x7FFF) + | - infinities (exp == 0x7FFF, mantissa == 0) + | - not-a-numbers (exp == 0x7FFF, mantissa != 0) + | + | For infinities and NaNs, the explicit integer bit can be either one or + | zero. + | + | The IEEE 754 standard does not define a zero integer bit. Such a number + | is an unnormalized number. Hardware does not directly support + | denormalized and unnormalized numbers, but implicitly supports them by + | trapping them as unimplemented data types, allowing efficient conversion + | in software. + | + | See "M68000 FAMILY PROGRAMMER’S REFERENCE MANUAL", + | "1.6 FLOATING-POINT DATA TYPES" + *------------------------------------------------------------------------*/ + return false; +#else + return (a.low & (1ULL << 63)) == 0 && (a.high & 0x7FFF) != 0; +#endif +} + #define floatx80_zero make_floatx80(0x0000, 0x0000000000000000LL) #define floatx80_one make_floatx80(0x3fff, 0x8000000000000000LL) #define floatx80_ln2 make_floatx80(0x3ffe, 0xb17217f7d1cf79acLL) @@ -673,10 +727,70 @@ static inline int floatx80_is_any_nan(floatx80 a) #define floatx80_half make_floatx80(0x3ffe, 0x8000000000000000LL) #define floatx80_infinity make_floatx80(0x7fff, 0x8000000000000000LL) +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +inline uint64_t extractFloatx80Frac( floatx80 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +inline int32 extractFloatx80Exp( floatx80 a ) +{ + + return a.high & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +inline flag extractFloatx80Sign( floatx80 a ) +{ + + return a.high>>15; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent +| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', +| and returns the proper extended double-precision floating-point value +| corresponding to the abstract input. This routine is just like +| `roundAndPackFloatx80' except that the input significand does not have to be +| normalized. +*----------------------------------------------------------------------------*/ + +floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision, + flag zSign, int32 zExp, + uint64_t zSig0, uint64_t zSig1, + float_status *status); + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, + uint64_t *zSigPtr); + /*---------------------------------------------------------------------------- | The pattern for a default generated extended double-precision NaN. *----------------------------------------------------------------------------*/ -extern const floatx80 floatx80_default_nan; +floatx80 floatx80_default_nan(float_status *status); /*---------------------------------------------------------------------------- | Software IEC/IEEE quadruple-precision conversion routines. diff --git a/target-i386/fpu_helper.c b/target-i386/fpu_helper.c index 30d34d5a..62626291 100644 --- a/target-i386/fpu_helper.c +++ b/target-i386/fpu_helper.c @@ -20,6 +20,8 @@ #include #include "cpu.h" #include "exec/helper-proto.h" +#include "fpu/softfloat.h" +#include "fpu/softfloat-macros.h" #include "qemu/aes.h" #include "qemu/host-utils.h" #include "exec/cpu_ldst.h" @@ -118,6 +120,26 @@ static void fpu_set_exception(CPUX86State *env, int mask) } } +static inline uint8_t save_exception_flags(CPUX86State *env) +{ + uint8_t old_flags = get_float_exception_flags(&env->fp_status); + set_float_exception_flags(0, &env->fp_status); + return old_flags; +} + +static void merge_exception_flags(CPUX86State *env, uint8_t old_flags) +{ + uint8_t new_flags = get_float_exception_flags(&env->fp_status); + float_raise(old_flags, &env->fp_status); + fpu_set_exception(env, + ((new_flags & float_flag_invalid ? FPUS_IE : 0) | + (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) | + (new_flags & float_flag_overflow ? FPUS_OE : 0) | + (new_flags & float_flag_underflow ? FPUS_UE : 0) | + (new_flags & float_flag_inexact ? FPUS_PE : 0) | + (new_flags & float_flag_input_denormal ? FPUS_DE : 0))); +} + static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b) { if (floatx80_is_zero(b)) { @@ -682,19 +704,286 @@ void helper_f2xm1(CPUX86State *env) ST0 = double_to_floatx80(env, val); } +/* 128-bit significand of log2(e). */ +#define log2_e_sig_high 0xb8aa3b295c17f0bbULL +#define log2_e_sig_low 0xbe87fed0691d3e89ULL + +/* + * Polynomial coefficients for an approximation to log2((1+x)/(1-x)), + * with only odd powers of x used, for x in the interval [2*sqrt(2)-3, + * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the + * interval [sqrt(2)/2, sqrt(2)]. + */ +#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL) +#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL) +#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL) +#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL) +#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL) +#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL) +#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL) +#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL) +#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL) +#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL) +#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL) + +/* + * Compute an approximation of log2(1+arg), where 1+arg is in the + * interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this + * function is called, rounding precision is set to 80 and the + * round-to-nearest mode is in effect. arg must not be exactly zero, + * and must not be so close to zero that underflow might occur. + */ +static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp, + uint64_t *sig0, uint64_t *sig1) +{ + uint64_t arg0_sig = extractFloatx80Frac(arg); + int32_t arg0_exp = extractFloatx80Exp(arg); + bool arg0_sign = extractFloatx80Sign(arg); + bool asign; + int32_t dexp, texp, aexp; + uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2; + uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3; + uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1; + floatx80 t2, accum; + + /* + * Compute an approximation of arg/(2+arg), with extra precision, + * as the argument to a polynomial approximation. The extra + * precision is only needed for the first term of the + * approximation, with subsequent terms being significantly + * smaller; the approximation only uses odd exponents, and the + * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029.... + */ + if (arg0_sign) { + dexp = 0x3fff; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1); + } else { + dexp = 0x4000; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + dsig0 |= 0x8000000000000000ULL; + } + texp = arg0_exp - dexp + 0x3ffe; + rsig0 = arg0_sig; + rsig1 = 0; + rsig2 = 0; + if (dsig0 <= rsig0) { + shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1); + ++texp; + } + tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0); + mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2); + sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2, + &rsig0, &rsig1, &rsig2); + while ((int64_t) rsig0 < 0) { + --tsig0; + add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1, + &rsig0, &rsig1, &rsig2); + } + tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0); + /* + * No need to correct any estimation error in tsig1; even with + * such error, it is accurate enough. Now compute the square of + * that approximation. + */ + mul128To256(tsig0, tsig1, tsig0, tsig1, + &t2sig0, &t2sig1, &t2sig2, &t2sig3); + t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false, + texp + texp - 0x3ffe, + t2sig0, t2sig1, &env->fp_status); + + /* Compute the lower parts of the polynomial expansion. */ + accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status); + + /* + * The full polynomial expansion is fyl2x_coeff_0 + accum (where + * accum has much lower magnitude, and so, in particular, carry + * out of the addition is not possible), multiplied by t. (This + * expansion is only accurate to about 70 bits, not 128 bits.) + */ + aexp = extractFloatx80Exp(fyl2x_coeff_0); + asign = extractFloatx80Sign(fyl2x_coeff_0); + shift128RightJamming(extractFloatx80Frac(accum), 0, + aexp - extractFloatx80Exp(accum), + &asig0, &asig1); + bsig0 = extractFloatx80Frac(fyl2x_coeff_0); + bsig1 = 0; + if (asign == extractFloatx80Sign(accum)) { + add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } else { + sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } + /* Multiply by t to compute the required result. */ + mul128To256(asig0, asig1, tsig0, tsig1, + &asig0, &asig1, &asig2, &asig3); + aexp += texp - 0x3ffe; + *exp = aexp; + *sig0 = asig0; + *sig1 = asig1; +} + void helper_fyl2x(CPUX86State *env) { - double fptemp = floatx80_to_double(env, ST0); - - if (fptemp > 0.0) { - fptemp = log(fptemp) / log(2.0); /* log2(ST) */ - fptemp *= floatx80_to_double(env, ST1); - ST1 = double_to_floatx80(env, fptemp); - fpop(env); + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0); + } else if (floatx80_is_signaling_nan(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (arg0_sign && !floatx80_is_zero(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_infinity(ST1)) { + FloatRelation cmp = floatx80_compare(ST0, floatx80_one, + &env->fp_status); + switch (cmp) { + case float_relation_less: + ST1 = floatx80_chs(ST1); + break; + case float_relation_greater: + /* Result is infinity of the same sign as ST1. */ + break; + default: + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + break; + } + } else if (floatx80_is_infinity(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (arg1_sign) { + ST1 = floatx80_chs(ST0); + } else { + ST1 = ST0; + } + } else if (floatx80_is_zero(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else { + /* Result is infinity with opposite sign to ST1. */ + float_raise(float_flag_divbyzero, &env->fp_status); + ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff, + 0x8000000000000000ULL); + } + } else if (floatx80_is_zero(ST1)) { + if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) { + ST1 = floatx80_chs(ST1); + } + /* Otherwise, ST1 is already the correct result. */ + } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) { + if (arg1_sign) { + ST1 = floatx80_chs(floatx80_zero); + } else { + ST1 = floatx80_zero; + } } else { - env->fpus &= ~0x4700; - env->fpus |= 0x400; + int32_t int_exp; + floatx80 arg0_m1; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + FloatX80RoundPrec save_prec = + env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = floatx80_precision_x; + + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); + } + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + int_exp = arg0_exp - 0x3fff; + if (arg0_sig > 0xb504f333f9de6484ULL) { + ++int_exp; + } + arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp, + &env->fp_status), + floatx80_one, &env->fp_status); + if (floatx80_is_zero(arg0_m1)) { + /* Exact power of 2; multiply by ST1. */ + env->fp_status.float_rounding_mode = save_mode; + ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status), + ST1, &env->fp_status); + } else { + bool asign = extractFloatx80Sign(arg0_m1); + int32_t aexp; + uint64_t asig0, asig1, asig2; + helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1); + if (int_exp != 0) { + bool isign = (int_exp < 0); + int32_t iexp; + uint64_t isig; + int shift; + int_exp = isign ? -int_exp : int_exp; + shift = clz32(int_exp) + 32; + isig = int_exp; + isig <<= shift; + iexp = 0x403e - shift; + shift128RightJamming(asig0, asig1, iexp - aexp, + &asig0, &asig1); + if (asign == isign) { + add128(isig, 0, asig0, asig1, &asig0, &asig1); + } else { + sub128(isig, 0, asig0, asig1, &asig0, &asig1); + } + aexp = iexp; + asign = isign; + } + /* + * Multiply by the second argument to compute the required + * result. + */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2); + aexp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + asig1 |= 1; + env->fp_status.float_rounding_mode = save_mode; + ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, + asign ^ arg1_sign, aexp, + asig0, asig1, &env->fp_status); + } + + env->fp_status.floatx80_rounding_precision = save_prec; } + fpop(env); + merge_exception_flags(env, old_flags); } void helper_fptan(CPUX86State *env) @@ -869,17 +1158,100 @@ void helper_fprem(CPUX86State *env) void helper_fyl2xp1(CPUX86State *env) { - double fptemp = floatx80_to_double(env, ST0); - - if ((fptemp + 1.0) > 0.0) { - fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */ - fptemp *= floatx80_to_double(env, ST1); - ST1 = double_to_floatx80(env, fptemp); - fpop(env); + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0); + } else if (floatx80_is_signaling_nan(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (arg0_exp > 0x3ffd || + (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ? + 0x95f619980c4336f7ULL : + 0xd413cccfe7799211ULL))) { + /* + * Out of range for the instruction (ST0 must have absolute + * value less than 1 - sqrt(2)/2 = 0.292..., according to + * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1 + * to sqrt(2) - 1, which we allow here), treat as invalid. + */ + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) || + arg1_exp == 0x7fff) { + /* + * One argument is zero, or multiplying by infinity; correct + * result is exact and can be obtained by multiplying the + * arguments. + */ + ST1 = floatx80_mul(ST0, ST1, &env->fp_status); + } else if (arg0_exp < 0x3fb0) { + /* + * Multiplying both arguments and an extra-precision version + * of log2(e) is sufficiently precise. + */ + uint64_t sig0, sig1, sig2; + int32_t exp; + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); + } + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig, + &sig0, &sig1, &sig2); + exp = arg0_exp + 1; + mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2); + exp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + sig1 |= 1; + ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, + arg0_sign ^ arg1_sign, exp, + sig0, sig1, &env->fp_status); } else { - env->fpus &= ~0x4700; - env->fpus |= 0x400; + int32_t aexp; + uint64_t asig0, asig1, asig2; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + FloatX80RoundPrec save_prec = + env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = floatx80_precision_x; + + helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1); + /* + * Multiply by the second argument to compute the required + * result. + */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2); + aexp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + asig1 |= 1; + env->fp_status.float_rounding_mode = save_mode; + ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, + arg0_sign ^ arg1_sign, aexp, + asig0, asig1, &env->fp_status); + env->fp_status.floatx80_rounding_precision = save_prec; } + fpop(env); + merge_exception_flags(env, old_flags); } void helper_fsqrt(CPUX86State *env)