From adbfd03a09e1604f626cfd4b65e3225e11b4822b Mon Sep 17 00:00:00 2001
From: Arvind <dnivra@users.noreply.github.com>
Date: Tue, 28 Sep 2021 23:02:32 -0700
Subject: [PATCH] fpu: Backported helpers for fyl2x and fyl2xp1 from qemu 6.1.0

---
 fpu/softfloat-specialize.h |  35 +++-
 fpu/softfloat.c            |  46 +----
 include/fpu/softfloat.h    | 124 ++++++++++-
 target-i386/fpu_helper.c   | 408 +++++++++++++++++++++++++++++++++++--
 4 files changed, 547 insertions(+), 66 deletions(-)

diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index fa1214a4..93ec6e73 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -145,9 +145,25 @@ const float64 float64_default_nan = const_float64(LIT64( 0xFFF8000000000000 ));
 #define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
 #endif
 
-const floatx80 floatx80_default_nan
-    = make_floatx80_init(floatx80_default_nan_high, floatx80_default_nan_low);
+floatx80 floatx80_default_nan(float_status *status)
+{
+    floatx80 r;
+
+    /* None of the targets that have snan_bit_is_one use floatx80.  */
+    #if SNAN_BIT_IS_ONE
+    # error SNAN_BIT_IS_ONE should not be set
+    #endif
 
+#if defined(TARGET_M68K)
+    r.low = UINT64_C(0xFFFFFFFFFFFFFFFF);
+    r.high = 0x7FFF;
+#else
+    /* X86 */
+    r.low = UINT64_C(0xC000000000000000);
+    r.high = 0xFFFF;
+#endif
+    return r;
+}
 /*----------------------------------------------------------------------------
 | The pattern for a default generated quadruple-precision NaN.  The `high' and
 | `low' values hold the most- and least-significant bits, respectively.
@@ -985,6 +1001,21 @@ floatx80 floatx80_maybe_silence_nan( floatx80 a )
     return a;
 }
 
+/*----------------------------------------------------------------------------
+| Returns a quiet NaN from a signalling NaN for the extended double-precision
+| floating point value `a'.
+*----------------------------------------------------------------------------*/
+
+floatx80 floatx80_silence_nan(floatx80 a)
+{
+    /* None of the targets that have snan_bit_is_one use floatx80.  */
+    #if SNAN_BIT_IS_ONE
+    # error SNAN_BIT_IS_ONE should not be set
+    #endif
+    a.low |= UINT64_C(0xC000000000000000);
+    return a;
+}
+
 /*----------------------------------------------------------------------------
 | Returns the result of converting the extended double-precision floating-
 | point NaN `a' to the canonical NaN format.  If `a' is a signaling NaN, the
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index d039bda4..8d6b20c7 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -685,42 +685,6 @@ static float64
 
 }
 
-/*----------------------------------------------------------------------------
-| Returns the fraction bits of the extended double-precision floating-point
-| value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline uint64_t extractFloatx80Frac( floatx80 a )
-{
-
-    return a.low;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the exponent bits of the extended double-precision floating-point
-| value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline int32 extractFloatx80Exp( floatx80 a )
-{
-
-    return a.high & 0x7FFF;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the sign bit of the extended double-precision floating-point value
-| `a'.
-*----------------------------------------------------------------------------*/
-
-static inline flag extractFloatx80Sign( floatx80 a )
-{
-
-    return a.high>>15;
-
-}
-
 /*----------------------------------------------------------------------------
 | Normalizes the subnormal extended double-precision floating-point value
 | represented by the denormalized significand `aSig'.  The normalized exponent
@@ -728,7 +692,7 @@ static inline flag extractFloatx80Sign( floatx80 a )
 | `zSigPtr', respectively.
 *----------------------------------------------------------------------------*/
 
-static void
+void
  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 {
     int8 shiftCount;
@@ -974,10 +938,10 @@ static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
 | normalized.
 *----------------------------------------------------------------------------*/
 
-static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
-                                              flag zSign, int32 zExp,
-                                              uint64_t zSig0, uint64_t zSig1,
-                                              float_status *status)
+floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
+                                       flag zSign, int32 zExp,
+                                       uint64_t zSig0, uint64_t zSig1,
+                                       float_status *status)
 {
     int8 shiftCount;
 
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index ded34eb0..2ab0aa9b 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -111,12 +111,12 @@ typedef int64_t int64;
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE floating-point ordering relations
 *----------------------------------------------------------------------------*/
-enum {
+typedef enum {
     float_relation_less      = -1,
     float_relation_equal     =  0,
     float_relation_greater   =  1,
     float_relation_unordered =  2
-};
+} FloatRelation;
 
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE floating-point types.
@@ -187,13 +187,18 @@ enum {
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE floating-point rounding mode.
 *----------------------------------------------------------------------------*/
-enum {
+
+typedef enum __attribute__((__packed__)) {
     float_round_nearest_even = 0,
     float_round_down         = 1,
     float_round_up           = 2,
     float_round_to_zero      = 3,
     float_round_ties_away    = 4,
-};
+    /* Not an IEEE rounding mode: round to closest odd, overflow to max */
+    float_round_to_odd       = 5,
+    /* Not an IEEE rounding mode: round to closest odd, overflow to inf */
+    float_round_to_odd_inf   = 6,
+} FloatRoundMode;
 
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE floating-point exception flags.
@@ -208,6 +213,15 @@ enum {
     float_flag_output_denormal = 128
 };
 
+/*
+ * Rounding precision for floatx80.
+ */
+typedef enum __attribute__((__packed__)) {
+    floatx80_precision_x,
+    floatx80_precision_d,
+    floatx80_precision_s,
+} FloatX80RoundPrec;
+
 typedef struct float_status {
     signed char float_detect_tininess;
     signed char float_rounding_mode;
@@ -628,6 +642,7 @@ int floatx80_is_quiet_nan( floatx80 );
 int floatx80_is_signaling_nan( floatx80 );
 floatx80 floatx80_maybe_silence_nan( floatx80 );
 floatx80 floatx80_scalbn(floatx80, int, float_status *status);
+floatx80 floatx80_silence_nan( floatx80 );
 
 static inline floatx80 floatx80_abs(floatx80 a)
 {
@@ -666,6 +681,45 @@ static inline int floatx80_is_any_nan(floatx80 a)
     return ((a.high & 0x7fff) == 0x7fff) && (a.low<<1);
 }
 
+/*----------------------------------------------------------------------------
+| Return whether the given value is an invalid floatx80 encoding.
+| Invalid floatx80 encodings arise when the integer bit is not set, but
+| the exponent is not zero. The only times the integer bit is permitted to
+| be zero is in subnormal numbers and the value zero.
+| This includes what the Intel software developer's manual calls pseudo-NaNs,
+| pseudo-infinities and un-normal numbers. It does not include
+| pseudo-denormals, which must still be correctly handled as inputs even
+| if they are never generated as outputs.
+*----------------------------------------------------------------------------*/
+static inline bool floatx80_invalid_encoding(floatx80 a)
+{
+#if defined(TARGET_M68K)
+    /*-------------------------------------------------------------------------
+    | With m68k, the explicit integer bit can be zero in the case of:
+    | - zeros                (exp == 0, mantissa == 0)
+    | - denormalized numbers (exp == 0, mantissa != 0)
+    | - unnormalized numbers (exp != 0, exp < 0x7FFF)
+    | - infinities           (exp == 0x7FFF, mantissa == 0)
+    | - not-a-numbers        (exp == 0x7FFF, mantissa != 0)
+    |
+    | For infinities and NaNs, the explicit integer bit can be either one or
+    | zero.
+    |
+    | The IEEE 754 standard does not define a zero integer bit. Such a number
+    | is an unnormalized number. Hardware does not directly support
+    | denormalized and unnormalized numbers, but implicitly supports them by
+    | trapping them as unimplemented data types, allowing efficient conversion
+    | in software.
+    |
+    | See "M68000 FAMILY PROGRAMMER’S REFERENCE MANUAL",
+    |     "1.6 FLOATING-POINT DATA TYPES"
+    *------------------------------------------------------------------------*/
+    return false;
+#else
+    return (a.low & (1ULL << 63)) == 0 && (a.high & 0x7FFF) != 0;
+#endif
+}
+
 #define floatx80_zero make_floatx80(0x0000, 0x0000000000000000LL)
 #define floatx80_one make_floatx80(0x3fff, 0x8000000000000000LL)
 #define floatx80_ln2 make_floatx80(0x3ffe, 0xb17217f7d1cf79acLL)
@@ -673,10 +727,70 @@ static inline int floatx80_is_any_nan(floatx80 a)
 #define floatx80_half make_floatx80(0x3ffe, 0x8000000000000000LL)
 #define floatx80_infinity make_floatx80(0x7fff, 0x8000000000000000LL)
 
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the extended double-precision floating-point
+| value `a'.
+*----------------------------------------------------------------------------*/
+
+inline uint64_t extractFloatx80Frac( floatx80 a )
+{
+
+    return a.low;
+
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the extended double-precision floating-point
+| value `a'.
+*----------------------------------------------------------------------------*/
+
+inline int32 extractFloatx80Exp( floatx80 a )
+{
+
+    return a.high & 0x7FFF;
+
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the extended double-precision floating-point value
+| `a'.
+*----------------------------------------------------------------------------*/
+
+inline flag extractFloatx80Sign( floatx80 a )
+{
+
+    return a.high>>15;
+
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent
+| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
+| and returns the proper extended double-precision floating-point value
+| corresponding to the abstract input.  This routine is just like
+| `roundAndPackFloatx80' except that the input significand does not have to be
+| normalized.
+*----------------------------------------------------------------------------*/
+
+floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
+                                              flag zSign, int32 zExp,
+                                              uint64_t zSig0, uint64_t zSig1,
+                                              float_status *status);
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal extended double-precision floating-point value
+| represented by the denormalized significand `aSig'.  The normalized exponent
+| and significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
+                                uint64_t *zSigPtr);
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated extended double-precision NaN.
 *----------------------------------------------------------------------------*/
-extern const floatx80 floatx80_default_nan;
+floatx80 floatx80_default_nan(float_status *status);
 
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE quadruple-precision conversion routines.
diff --git a/target-i386/fpu_helper.c b/target-i386/fpu_helper.c
index 30d34d5a..62626291 100644
--- a/target-i386/fpu_helper.c
+++ b/target-i386/fpu_helper.c
@@ -20,6 +20,8 @@
 #include <math.h>
 #include "cpu.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
+#include "fpu/softfloat-macros.h"
 #include "qemu/aes.h"
 #include "qemu/host-utils.h"
 #include "exec/cpu_ldst.h"
@@ -118,6 +120,26 @@ static void fpu_set_exception(CPUX86State *env, int mask)
     }
 }
 
+static inline uint8_t save_exception_flags(CPUX86State *env)
+{
+    uint8_t old_flags = get_float_exception_flags(&env->fp_status);
+    set_float_exception_flags(0, &env->fp_status);
+    return old_flags;
+}
+
+static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
+{
+    uint8_t new_flags = get_float_exception_flags(&env->fp_status);
+    float_raise(old_flags, &env->fp_status);
+    fpu_set_exception(env,
+                      ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
+                       (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
+                       (new_flags & float_flag_overflow ? FPUS_OE : 0) |
+                       (new_flags & float_flag_underflow ? FPUS_UE : 0) |
+                       (new_flags & float_flag_inexact ? FPUS_PE : 0) |
+                       (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
+}
+
 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
@@ -682,19 +704,286 @@ void helper_f2xm1(CPUX86State *env)
     ST0 = double_to_floatx80(env, val);
 }
 
+/* 128-bit significand of log2(e).  */
+#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
+#define log2_e_sig_low 0xbe87fed0691d3e89ULL
+
+/*
+ * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
+ * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
+ * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
+ * interval [sqrt(2)/2, sqrt(2)].
+ */
+#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
+#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
+#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
+#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
+#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
+#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
+#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
+#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
+#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
+#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
+#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
+
+/*
+ * Compute an approximation of log2(1+arg), where 1+arg is in the
+ * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
+ * function is called, rounding precision is set to 80 and the
+ * round-to-nearest mode is in effect.  arg must not be exactly zero,
+ * and must not be so close to zero that underflow might occur.
+ */
+static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
+                                uint64_t *sig0, uint64_t *sig1)
+{
+    uint64_t arg0_sig = extractFloatx80Frac(arg);
+    int32_t arg0_exp = extractFloatx80Exp(arg);
+    bool arg0_sign = extractFloatx80Sign(arg);
+    bool asign;
+    int32_t dexp, texp, aexp;
+    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
+    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
+    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
+    floatx80 t2, accum;
+
+    /*
+     * Compute an approximation of arg/(2+arg), with extra precision,
+     * as the argument to a polynomial approximation.  The extra
+     * precision is only needed for the first term of the
+     * approximation, with subsequent terms being significantly
+     * smaller; the approximation only uses odd exponents, and the
+     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
+     */
+    if (arg0_sign) {
+        dexp = 0x3fff;
+        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
+    } else {
+        dexp = 0x4000;
+        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+        dsig0 |= 0x8000000000000000ULL;
+    }
+    texp = arg0_exp - dexp + 0x3ffe;
+    rsig0 = arg0_sig;
+    rsig1 = 0;
+    rsig2 = 0;
+    if (dsig0 <= rsig0) {
+        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
+        ++texp;
+    }
+    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
+    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
+    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
+           &rsig0, &rsig1, &rsig2);
+    while ((int64_t) rsig0 < 0) {
+        --tsig0;
+        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
+               &rsig0, &rsig1, &rsig2);
+    }
+    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
+    /*
+     * No need to correct any estimation error in tsig1; even with
+     * such error, it is accurate enough.  Now compute the square of
+     * that approximation.
+     */
+    mul128To256(tsig0, tsig1, tsig0, tsig1,
+                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
+    t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
+                                       texp + texp - 0x3ffe,
+                                       t2sig0, t2sig1, &env->fp_status);
+
+    /* Compute the lower parts of the polynomial expansion.  */
+    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
+    accum = floatx80_mul(accum, t2, &env->fp_status);
+    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
+
+    /*
+     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
+     * accum has much lower magnitude, and so, in particular, carry
+     * out of the addition is not possible), multiplied by t.  (This
+     * expansion is only accurate to about 70 bits, not 128 bits.)
+     */
+    aexp = extractFloatx80Exp(fyl2x_coeff_0);
+    asign = extractFloatx80Sign(fyl2x_coeff_0);
+    shift128RightJamming(extractFloatx80Frac(accum), 0,
+                         aexp - extractFloatx80Exp(accum),
+                         &asig0, &asig1);
+    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
+    bsig1 = 0;
+    if (asign == extractFloatx80Sign(accum)) {
+        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+    } else {
+        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+    }
+    /* Multiply by t to compute the required result.  */
+    mul128To256(asig0, asig1, tsig0, tsig1,
+                &asig0, &asig1, &asig2, &asig3);
+    aexp += texp - 0x3ffe;
+    *exp = aexp;
+    *sig0 = asig0;
+    *sig1 = asig1;
+}
+
 void helper_fyl2x(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(env, ST0);
-
-    if (fptemp > 0.0) {
-        fptemp = log(fptemp) / log(2.0); /* log2(ST) */
-        fptemp *= floatx80_to_double(env, ST1);
-        ST1 = double_to_floatx80(env, fptemp);
-        fpop(env);
+    uint8_t old_flags = save_exception_flags(env);
+    uint64_t arg0_sig = extractFloatx80Frac(ST0);
+    int32_t arg0_exp = extractFloatx80Exp(ST0);
+    bool arg0_sign = extractFloatx80Sign(ST0);
+    uint64_t arg1_sig = extractFloatx80Frac(ST1);
+    int32_t arg1_exp = extractFloatx80Exp(ST1);
+    bool arg1_sign = extractFloatx80Sign(ST1);
+
+    if (floatx80_is_signaling_nan(ST0)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST0);
+    } else if (floatx80_is_signaling_nan(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST1);
+    } else if (floatx80_invalid_encoding(ST0) ||
+               floatx80_invalid_encoding(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_any_nan(ST0)) {
+        ST1 = ST0;
+    } else if (floatx80_is_any_nan(ST1)) {
+        /* Pass this NaN through.  */
+    } else if (arg0_sign && !floatx80_is_zero(ST0)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_infinity(ST1)) {
+        FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
+                                             &env->fp_status);
+        switch (cmp) {
+        case float_relation_less:
+            ST1 = floatx80_chs(ST1);
+            break;
+        case float_relation_greater:
+            /* Result is infinity of the same sign as ST1.  */
+            break;
+        default:
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+            break;
+        }
+    } else if (floatx80_is_infinity(ST0)) {
+        if (floatx80_is_zero(ST1)) {
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+        } else if (arg1_sign) {
+            ST1 = floatx80_chs(ST0);
+        } else {
+            ST1 = ST0;
+        }
+    } else if (floatx80_is_zero(ST0)) {
+        if (floatx80_is_zero(ST1)) {
+            float_raise(float_flag_invalid, &env->fp_status);
+            ST1 = floatx80_default_nan(&env->fp_status);
+        } else {
+            /* Result is infinity with opposite sign to ST1.  */
+            float_raise(float_flag_divbyzero, &env->fp_status);
+            ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
+                                0x8000000000000000ULL);
+        }
+    } else if (floatx80_is_zero(ST1)) {
+        if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
+            ST1 = floatx80_chs(ST1);
+        }
+        /* Otherwise, ST1 is already the correct result.  */
+    } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
+        if (arg1_sign) {
+            ST1 = floatx80_chs(floatx80_zero);
+        } else {
+            ST1 = floatx80_zero;
+        }
     } else {
-        env->fpus &= ~0x4700;
-        env->fpus |= 0x400;
+        int32_t int_exp;
+        floatx80 arg0_m1;
+        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
+        FloatX80RoundPrec save_prec =
+            env->fp_status.floatx80_rounding_precision;
+        env->fp_status.float_rounding_mode = float_round_nearest_even;
+        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
+
+        if (arg0_exp == 0) {
+            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
+        }
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        int_exp = arg0_exp - 0x3fff;
+        if (arg0_sig > 0xb504f333f9de6484ULL) {
+            ++int_exp;
+        }
+        arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
+                                               &env->fp_status),
+                               floatx80_one, &env->fp_status);
+        if (floatx80_is_zero(arg0_m1)) {
+            /* Exact power of 2; multiply by ST1.  */
+            env->fp_status.float_rounding_mode = save_mode;
+            ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
+                               ST1, &env->fp_status);
+        } else {
+            bool asign = extractFloatx80Sign(arg0_m1);
+            int32_t aexp;
+            uint64_t asig0, asig1, asig2;
+            helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
+            if (int_exp != 0) {
+                bool isign = (int_exp < 0);
+                int32_t iexp;
+                uint64_t isig;
+                int shift;
+                int_exp = isign ? -int_exp : int_exp;
+                shift = clz32(int_exp) + 32;
+                isig = int_exp;
+                isig <<= shift;
+                iexp = 0x403e - shift;
+                shift128RightJamming(asig0, asig1, iexp - aexp,
+                                     &asig0, &asig1);
+                if (asign == isign) {
+                    add128(isig, 0, asig0, asig1, &asig0, &asig1);
+                } else {
+                    sub128(isig, 0, asig0, asig1, &asig0, &asig1);
+                }
+                aexp = iexp;
+                asign = isign;
+            }
+            /*
+             * Multiply by the second argument to compute the required
+             * result.
+             */
+            if (arg1_exp == 0) {
+                normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+            }
+            mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
+            aexp += arg1_exp - 0x3ffe;
+            /* This result is inexact.  */
+            asig1 |= 1;
+            env->fp_status.float_rounding_mode = save_mode;
+            ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
+                                                asign ^ arg1_sign, aexp,
+                                                asig0, asig1, &env->fp_status);
+        }
+
+        env->fp_status.floatx80_rounding_precision = save_prec;
     }
+    fpop(env);
+    merge_exception_flags(env, old_flags);
 }
 
 void helper_fptan(CPUX86State *env)
@@ -869,17 +1158,100 @@ void helper_fprem(CPUX86State *env)
 
 void helper_fyl2xp1(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(env, ST0);
-
-    if ((fptemp + 1.0) > 0.0) {
-        fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */
-        fptemp *= floatx80_to_double(env, ST1);
-        ST1 = double_to_floatx80(env, fptemp);
-        fpop(env);
+    uint8_t old_flags = save_exception_flags(env);
+    uint64_t arg0_sig = extractFloatx80Frac(ST0);
+    int32_t arg0_exp = extractFloatx80Exp(ST0);
+    bool arg0_sign = extractFloatx80Sign(ST0);
+    uint64_t arg1_sig = extractFloatx80Frac(ST1);
+    int32_t arg1_exp = extractFloatx80Exp(ST1);
+    bool arg1_sign = extractFloatx80Sign(ST1);
+
+    if (floatx80_is_signaling_nan(ST0)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST0);
+    } else if (floatx80_is_signaling_nan(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST1);
+    } else if (floatx80_invalid_encoding(ST0) ||
+               floatx80_invalid_encoding(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_any_nan(ST0)) {
+        ST1 = ST0;
+    } else if (floatx80_is_any_nan(ST1)) {
+        /* Pass this NaN through.  */
+    } else if (arg0_exp > 0x3ffd ||
+               (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
+                                                  0x95f619980c4336f7ULL :
+                                                  0xd413cccfe7799211ULL))) {
+        /*
+         * Out of range for the instruction (ST0 must have absolute
+         * value less than 1 - sqrt(2)/2 = 0.292..., according to
+         * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
+         * to sqrt(2) - 1, which we allow here), treat as invalid.
+         */
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
+               arg1_exp == 0x7fff) {
+        /*
+         * One argument is zero, or multiplying by infinity; correct
+         * result is exact and can be obtained by multiplying the
+         * arguments.
+         */
+        ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
+    } else if (arg0_exp < 0x3fb0) {
+        /*
+         * Multiplying both arguments and an extra-precision version
+         * of log2(e) is sufficiently precise.
+         */
+        uint64_t sig0, sig1, sig2;
+        int32_t exp;
+        if (arg0_exp == 0) {
+            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
+        }
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
+                        &sig0, &sig1, &sig2);
+        exp = arg0_exp + 1;
+        mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
+        exp += arg1_exp - 0x3ffe;
+        /* This result is inexact.  */
+        sig1 |= 1;
+        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
+                                            arg0_sign ^ arg1_sign, exp,
+                                            sig0, sig1, &env->fp_status);
     } else {
-        env->fpus &= ~0x4700;
-        env->fpus |= 0x400;
+        int32_t aexp;
+        uint64_t asig0, asig1, asig2;
+        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
+        FloatX80RoundPrec save_prec =
+            env->fp_status.floatx80_rounding_precision;
+        env->fp_status.float_rounding_mode = float_round_nearest_even;
+        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
+
+        helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
+        /*
+         * Multiply by the second argument to compute the required
+         * result.
+         */
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
+        aexp += arg1_exp - 0x3ffe;
+        /* This result is inexact.  */
+        asig1 |= 1;
+        env->fp_status.float_rounding_mode = save_mode;
+        ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
+                                            arg0_sign ^ arg1_sign, aexp,
+                                            asig0, asig1, &env->fp_status);
+        env->fp_status.floatx80_rounding_precision = save_prec;
     }
+    fpop(env);
+    merge_exception_flags(env, old_flags);
 }
 
 void helper_fsqrt(CPUX86State *env)