Merge pull request #13868 from Geotale/master

Fix Single-Precision-Only Inputs to FMAs Instructions in Interpreter
2025-09-11 10:40:58 +02:00 · 2025-08-18 18:25:38 -04:00
parent 8701944dd1 07443e2d41
commit 37f5ae4209
3 changed files with 283 additions and 95 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -280,12 +280,223 @@ inline FPResult NI_sub(PowerPC::PowerPCState& ppc_state, double a, double b)
  return result;
 }
-// FMA instructions on PowerPC are weird:
+// The bulk work for fm(add/sub)(s)x operations
-// They calculate (a * c) + b, but the order in which
+template <bool sub, bool single>
-// inputs are checked for NaN is still a, b, c.
+inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
 inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
 {
-  FPResult result{std::fma(a, c, b)};
+  // The FMA instructions on PowerPC are incredibly weird, and the single precision variations are
  // the only ones with the unfortunate side effect that emulating them completely accurately
  // requires either software floats or manual checking of the individual float
  // operations performed, down to a precision greater than that of subnormals.
  // The first oddity to be found is that they calculate (a * c) + b, but in the case of NaNs,
  // they're still checked in the order a, b, c.
  // The rest generally come from the single precision variation.
  // 1. The arguments are *not* forced to 32-bit precision in all ways, meaning you can end
  //    up with results that rely on a 64-bit precision mantissa.
  // 2. The only argument which *is* forced to 32-bit precision in a way is frC, and even that
  //    is only forced to have a 32-bit mantissa.
  // 3. frC is forced to have a 32-bit mantissa (rounding in the same way no matter what the
  //    rounding mode is set to), while keeping the exponent at any 64-bit value.
  //    But rather than the highest values rounding to infinity, because PowerPC internally uses
  //    a higher precision exponent, it rounds up to a value normally unreachable with even
  //    double precision floats.
  // 4. CPUs, unsurprisingly, don't tend to support 64-bit float inputs to an operation with a
  //    32-bit result.
  //    One quirk of PowerPC is that instead of just not caring about handling the precision
  //    in the registers of the operands of single precision instructions, it instead
  //    takes into account that extra precision and *only rounds once* to 32-bit.
  //    This means that you can have double precision inputs such that the result rounds
  //    differently than if you did a double precision FMA and rounded the result to 32-bit.
  //    - What makes FMA so special here is that it's the only basic operation which, upon being
  //      converted to a 64-bit operation then rounded back to a 32-bit result, does *not* give
  //      the same result when rounding to nearest!
  //      Addition, subtraction, multiplication, and division do not have this issue and will give
  //      the correct result for all inputs!
  //    - In fact, rounding to nearest is the *only* rounding mode where it ends up having issues!
  //      The reason being, if the result was to round in one direction, it would
  //      always direct the double precision output in a way such that rounding to single precision
  //      would end up having that same transformation (or it would already be exactly
  //      representable as a single precision value).
  //
  // It is relatively easy to find 32-bit values which do not round properly if one performs
  // f32(fma(f64(a), f64(c), f64(b))), despite the rarity.
  // The requirements can be shown fairly easily as well:
  // - Final Result = sign * (1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exponent
  //                          + c * 2^(exponent - 52))
  // What we need is some form of discrepancy which occurs from rounding twice,
  // such that rounding from the perspective of `d` just being in front of `c` (like in the actual
  // operation which only rounds once) will give a different result than rounding `d` then
  // rounding again to single precision.
  // There are a few ways which this discrepancy from rounding twice can be caused, with all
  // of them relating to rounding to nearest ties even:
  // 1. Tying down to even because `c` is too small
  //    a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
  //    b. The lowest bit of `f` is 0 (this means it ties to even downwards)
  //    c. `c` is positive (nonzero) and does not round `d` upwards
  //    -  This means while a single round would round up,
  //       instead this rounds down because of tying to even.
  // 2. Tying up because `d` rounded up
  //    a. The highest bit of `d` is 0, the rest of the bits of `d` are 1
  //    b. The lowest bit of `f` is 1 (this means it ties to even upwards)
  //    c. `c` is positive, and the highest bit of c is 1
  //    - This will cause `d` to round to 100...00, meaning it will tie then round upwards.
  // 3. Tying up to even because `c` is too small
  //    a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
  //    b. The lowest bit of `f` is 1 (this means it ties to even upwards)
  //    c. `c` is negative and does not round `d` downwards
  //    -  This is similar to the first one but in reverse, rounding up instead of down.
  // 4. Tying down because `d` rounded down
  //    a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0
  //    b. The lowest bit of `f` is 0 (this means it ties to even downwards)
  //    c. `c` is negative, and the highest bit of c is 1,
  //       and at least one other bit of c is nonzero
  //    - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00,
  //      where the tie down will cause it to round down instead of up.
  //
  // The first values found which were shown to definitively cause issues appeared
  // in Mario Strikers Charged, where:
  // a = 0x42480000 (50.0)
  // c = 0xbc88cc38 (-0.01669894158840179443359375)
  // b = 0x1b1c72a0
  //     (1.294105489087172032066277841712287344222431784146465361118316650390625 * 10^-22)
  //
  // Performing the FMADDS we get:
  //            1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exp
  //              +/- c
  // Result = -(1.1010101101111110001011110000000000000000000000000000 * 2^-1
  //               -  1.001110001110010101 * 2^-73)
  // This exactly matches case 3 as shown above, so while the result should be 0xbf55bf17,
  // Dolphin was returning 0xbf55bf18!
  // Due to being able to choose any value of `c` easily to counter the value of `d`,
  // it's not particularly difficult to make your own examples as well,
  // but of course these happening in practice is going to be absurdly uncommon most of the time.
  //
  // Currently Dolphin supports:
  // - Correct ordering of NaN checking (for both double and single precision)
  // - Rounding frC up
  // - Rounding only once for single precision inputs (this will be the large majority of cases!)
  //   - Currently this is interpreter-only.
  //     This can be implemented in the JIT just as easily, though.
  //     Eventually the JITs should hopefully support detecting back to back
  //     single-precision operations, which will lead to no overhead at all.
  //     In the cases where JITs can't do this, an alternative method is used, as
  //     is done in the interpreter as well.
  // - Rounding only once for double precision inputs
  //   - This is a side effect of how we handle single-precision inputs: By doing
  //     error calculations rather than checking if every input is a float, we ensure that we know
  //     at the very least the rounding direction that was taken and that would need to be taken.
  //
  // Currently it does not support:
  // - Handling frC overflowing to an unreachable value
  //   - This is simple enough to check for and handle properly, but the likelihood of it occurring
  //     is so low that it's not worth it to check for it for the rare accuracy improvement.
  // - Dealing with every 64-bit subnormal possibility correctly
  //   - Double precision subnormals are what cause requiring more precision than double precision
  //     to be a thing if you want a correct implementation for every possible input.
  //     If a case where this was necessary came up it'd just be more worth it to fall back to
  //     a software floating point implementation instead.
  //
  // All of these can be resolved in a software float emulation method, or by using things such as
  // error-free float algorithms, but the nature of both of these lead to incredible speed costs,
  // and with the extreme rarity of anything beyond what's currently handled mattering, the other
  // cases don't seem to have any reason to be implemented as of now.
  FPResult result;
  // In double precision, just doing the normal operation will be exact with no issues.
  if (!single)
  {
    result.value = std::fma(a, c, sub ? -b : b);
  }
  else
  {
    // For single precision inputs, we never actually cast to a float -- we instead compute the
    // result using a 64-bit FMA, and if the bits end up being an even tie when converting to
    // a float, we approximate (for single-precision-only inputs this will be exact) the
    // amount rounded by the FMA, and use that to manually fix which direction we round!
    // We of course still properly round `c` first, though.
    const double c_round = Force25Bit(c);
    // First, we compute the 64-bit FMA forwards
    const double b_sign = sub ? -b : b;
    result.value = std::fma(a, c_round, b_sign);
    // We then check if we're currently tying in rounding direction
    const u64 result_bits = std::bit_cast<u64>(result.value);
    // The mask of the `d` bits as shown in the above comments
    const u64 D_MASK = 0x000000001fffffff;
    // The mask of `d` which would force a tie to even, which is the only case where there
    // can potentially be differences compared to just casting to an f32 directly.
    const u64 EVEN_TIE = 0x0000000010000000;
    // Because we check this entire mask which includes a 1 bit, we can be sure that
    // if this result passes, the input is not an infinity that would become a NaN.
    // This means that, for the JITs, if they only wanted to check for a subset of these
    // bits (e.g. only checking if the last one was 0), then using the zero flag for a branch,
    // they would have to check if the result was NaN before here.
    if ((result_bits & D_MASK) == EVEN_TIE)
    {
      // Because we have a tie, we now compute any error in the FMA calculation
      // via an error-free transformation (Ole Møller's 2Sum algorithm)
      // s  := a  + b
      // a' := s  - b
      // b' := s  - a'
      // da := a  - a'
      // db := b  - b'
      // t  := da + db
      // But for these calculations, we assume "a" := a * c_round, allowing the usage of FMA,
      // both being shorter and allowing for likely necessary increased precision!
      // We also switch up the signs a bit so we don't introduce an instruction simply to
      // negate one of the operands of an FMA
      const double a_prime = b_sign - result.value;
      const double b_prime = result.value + a_prime;
      const double delta_a = std::fma(a, c_round, a_prime);
      const double delta_b = b_sign - b_prime;
      const double error = delta_a + delta_b;
      // `error` will properly match the direction for rounding *even for 64-bit inputs*.
      // Thoroughly proving that this works for even all normal values isn't entirely trivial,
      // nor are the exact details really important, but the basic logic is:
      // result.value = roundf64(a * c_round + b_sign) = a * c_round + b_sign - e0
      // a_prime = roundf64(b_sign - a * c_round - b_sign + e0)
      //         = -a * c_round + e0 - e1
      // b_prime = roundf64(a * c_round + b_sign - e0 - a * c_round + e0 - e1)
      //         = b_sign - e1 - e2
      // delta_a = roundf64(a * c_round - a * c_round + e0 - e1)
      //         = e0 - e1 - e3
      // delta_b = roundf64(b_sign - b_sign + e1 + e2)
      //         = e1 + e2 - e4
      // error   = roundf64(delta_a + delta_b)
      //         = roundf64(e0 + e2 - e3 - e4)
      // Then showing that e2 - e3 - e4 is tiny enough to not change the sign of
      // e0 (the true error value, as `error` can't capture all of the possible precision),
      // including that if the true e0 = 0 then e1 = e2 = e3 = e4 = error = 0.
      // This "error" value represents the number such that `result.value + error == exact_result`.
      if (error != 0.0)
      {
        // Because the error is nonzero here, we actually do need to round a specific direction
        // and don't want to just tie to even!
        // Note that it should never be possible for the error to be NaN if the result isn't either
        // infinite or NaN itself. It would require:
        // da == inf, db == -inf
        // Which expanded out is:
        // a - ((a + b) - b) == inf, b - ((a + b) - ((a + b) - b)) == -inf, where
        // a + b isn't infinite. This means (a + b) - b must be infinite on the left,
        // but this will end up giving the right hand side the same sign of infinity.
        // All this to say we don't check for `if (!std::isnan(error))` for the `else` statement.
        // Also note that we do not cast to a float here,
        // as individual instructions using this function will on their own afterwards.
        if ((error > 0.0) == (result.value > 0.0))
          result.value = std::bit_cast<double>(result_bits + 1);  // Tie is too small, round up.
        else
          result.value = std::bit_cast<double>(result_bits - 1);  // Tie is too large, round down.
      }
    }
  }
  if (std::isnan(result.value))
  {
@@ -321,42 +532,16 @@ inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, do
  return result;
 }
 template <bool single>
 inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
 {
  return NI_madd_msub<false, single>(ppc_state, a, c, b);
 }
 template <bool single>
 inline FPResult NI_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
 {
-  FPResult result{std::fma(a, c, -b)};
+  return NI_madd_msub<true, single>(ppc_state, a, c, b);
  if (std::isnan(result.value))
  {
    if (Common::IsSNAN(a) || Common::IsSNAN(b) || Common::IsSNAN(c))
      result.SetException(ppc_state, FPSCR_VXSNAN);
    ppc_state.fpscr.ClearFIFR();
    if (std::isnan(a))
    {
      result.value = Common::MakeQuiet(a);
      return result;
    }
    if (std::isnan(b))
    {
      result.value = Common::MakeQuiet(b);  // !
      return result;
    }
    if (std::isnan(c))
    {
      result.value = Common::MakeQuiet(c);
      return result;
    }
    result.SetException(ppc_state, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
  if (std::isinf(a) || std::isinf(b) || std::isinf(c))
    ppc_state.fpscr.ClearFIFR();
  return result;
 }
 // used by stfsXX instructions and ps_rsqrte
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp
@@ -370,11 +370,11 @@ void Interpreter::fmulsx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& c = ppc_state.ps[inst.FC];
  const double c_value = Force25Bit(c.PS0AsDouble());
-  const FPResult d_value = NI_mul(ppc_state, a.PS0AsDouble(), c_value);
+  const FPResult product = NI_mul(ppc_state, a.PS0AsDouble(), c_value);
-  if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions())
+  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
-    const float result = ForceSingle(ppc_state.fpscr, d_value.value);
+    const float result = ForceSingle(ppc_state.fpscr, product.value);
    ppc_state.ps[inst.FD].Fill(result);
    ppc_state.fpscr.FI = 0;
@@ -392,7 +392,9 @@ void Interpreter::fmaddx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& a = ppc_state.ps[inst.FA];
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
+
  const FPResult product =
      NI_madd<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -412,15 +414,15 @@ void Interpreter::fmaddsx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c_value = Force25Bit(c.PS0AsDouble());
+  const FPResult product =
-  const FPResult d_value = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
-  if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions())
+  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
-    const float result = ForceSingle(ppc_state.fpscr, d_value.value);
+    const float result = ForceSingle(ppc_state.fpscr, product.value);
    ppc_state.ps[inst.FD].Fill(result);
-    ppc_state.fpscr.FI = d_value.value != result;
+    ppc_state.fpscr.FI = product.value != result;
    ppc_state.fpscr.FR = 0;
    ppc_state.UpdateFPRFSingle(result);
  }
@@ -602,7 +604,8 @@ void Interpreter::fmsubx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
+  const FPResult product =
      NI_msub<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -622,8 +625,8 @@ void Interpreter::fmsubsx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c_value = Force25Bit(c.PS0AsDouble());
+  const FPResult product =
-  const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
+      NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -643,7 +646,8 @@ void Interpreter::fnmaddx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
+  const FPResult product =
      NI_madd<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -665,8 +669,8 @@ void Interpreter::fnmaddsx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c_value = Force25Bit(c.PS0AsDouble());
+  const FPResult product =
-  const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -688,7 +692,8 @@ void Interpreter::fnmsubx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
+  const FPResult product =
      NI_msub<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
@@ -710,8 +715,8 @@ void Interpreter::fnmsubsx(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c_value = Force25Bit(c.PS0AsDouble());
+  const FPResult product =
-  const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
+      NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
  if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
  {
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp
@@ -263,13 +263,12 @@ void Interpreter::ps_msub(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c0 = Force25Bit(c.PS0AsDouble());
+  const float ps0 = ForceSingle(
-  const double c1 = Force25Bit(c.PS1AsDouble());
+      ppc_state.fpscr,
-
+      NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
-  const float ps0 =
+  const float ps1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
+      ppc_state.fpscr,
-  const float ps1 =
+      NI_msub<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
      ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
  ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
  ppc_state.UpdateFPRFSingle(ps0);
@@ -285,13 +284,12 @@ void Interpreter::ps_madd(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c0 = Force25Bit(c.PS0AsDouble());
+  const float ps0 = ForceSingle(
-  const double c1 = Force25Bit(c.PS1AsDouble());
+      ppc_state.fpscr,
-
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
-  const float ps0 =
+  const float ps1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
+      ppc_state.fpscr,
-  const float ps1 =
+      NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
  ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
  ppc_state.UpdateFPRFSingle(ps0);
@@ -307,13 +305,12 @@ void Interpreter::ps_nmsub(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c0 = Force25Bit(c.PS0AsDouble());
+  const float tmp0 = ForceSingle(
-  const double c1 = Force25Bit(c.PS1AsDouble());
+      ppc_state.fpscr,
-
+      NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
-  const float tmp0 =
+  const float tmp1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
+      ppc_state.fpscr,
-  const float tmp1 =
+      NI_msub<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
      ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
  const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
  const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
@@ -332,13 +329,12 @@ void Interpreter::ps_nmadd(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c0 = Force25Bit(c.PS0AsDouble());
+  const float tmp0 = ForceSingle(
-  const double c1 = Force25Bit(c.PS1AsDouble());
+      ppc_state.fpscr,
-
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
-  const float tmp0 =
+  const float tmp1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
+      ppc_state.fpscr,
-  const float tmp1 =
+      NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
  const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
  const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
@@ -427,11 +423,12 @@ void Interpreter::ps_madds0(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c0 = Force25Bit(c.PS0AsDouble());
+  const float ps0 = ForceSingle(
-  const float ps0 =
+      ppc_state.fpscr,
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
-  const float ps1 =
+  const float ps1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c0, b.PS1AsDouble()).value);
+      ppc_state.fpscr,
      NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS0AsDouble(), b.PS1AsDouble()).value);
  ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
  ppc_state.UpdateFPRFSingle(ps0);
@@ -447,11 +444,12 @@ void Interpreter::ps_madds1(Interpreter& interpreter, UGeckoInstruction inst)
  const auto& b = ppc_state.ps[inst.FB];
  const auto& c = ppc_state.ps[inst.FC];
-  const double c1 = Force25Bit(c.PS1AsDouble());
+  const float ps0 = ForceSingle(
-  const float ps0 =
+      ppc_state.fpscr,
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c1, b.PS0AsDouble()).value);
+      NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS1AsDouble(), b.PS0AsDouble()).value);
-  const float ps1 =
+  const float ps1 = ForceSingle(
-      ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
+      ppc_state.fpscr,
      NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
  ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
  ppc_state.UpdateFPRFSingle(ps0);