diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 7f74ed9a01..0727df26ae 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -280,12 +280,223 @@ inline FPResult NI_sub(PowerPC::PowerPCState& ppc_state, double a, double b) return result; } -// FMA instructions on PowerPC are weird: -// They calculate (a * c) + b, but the order in which -// inputs are checked for NaN is still a, b, c. -inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b) +// The bulk work for fm(add/sub)(s)x operations +template +inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b) { - FPResult result{std::fma(a, c, b)}; + // The FMA instructions on PowerPC are incredibly weird, and the single precision variations are + // the only ones with the unfortunate side effect of completely accurately emulating them + // requiring either software floats or the manual checking of the individual float + // operations performed, down to a precision greater than that of subnormals. + // The first oddity to be found is that they calculate (a * c) + b, but in the case of NaNs, + // they're still checked in the order a, b, c. + // The rest generally come from the single precision variation. + // 1. The arguments are *not* forced to 32-bit precision in all ways, meaning you can end + // up with results that rely on a 64-bit precision mantissa. + // 2. The only argument which *is* forced to 32-bit precision in a way is frC, and even that + // is only forced to have a 32-bit mantissa. + // 3. fC is forced to have a 32-bit mantissa (rounding in the same way no matter what the + // rounding mode is set to), while keeping the exponent at any 64-bit value. 
+ // But rather than the highest values rounding to infinity, because PowerPC internally uses + // a higher precision exponent, it rounds up to a value normally unreachable with even + // double precision floats. + // 4. CPUs, unsurprisingly, don't tend to support 64-bit float inputs to an operation with a + // 32-bit result. + // One quirk of PowerPC is that instead of just not caring about handling the precision + // in the registers of the operands of single precision instructions, it instead + // takes into account that extra precision and *only rounds once* to 32-bit. + // This means that you can have double precision inputs such that the result rounds + // differently than if you did a double precision FMA and rounded the result to 32-bit. + // - What makes FMA so special here is that it's the only basic operation which, upon being + // converted to a 64-bit operation then rounded back to a 32-bit result, does *not* give + // the same result when rounding to nearest! + // Addition, subtraction, multiplication, and division do not have this issue and will give + // the correct result for all inputs! + // - In fact, rounding to nearest is the *only* rounding mode where it ends up having issues! + // The reason being, if the result was to round in one direction, it would + // always direct the double precision output in a way such that rounding to single precision + // would end up having that same transformation (or it would already be exactly + // representable as a single precision value). + // + // It is relatively easy to find 32-bit values which do not round properly if one performs + // f32(fma(f64(a), f64(c), f64(b))), despite the rarity. 
+ // The requirements can be shown fairly easily as well: + // - Final Result = sign * (1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exponent + // + c * 2^(exponent - 52)) + // What we need is some form of discrepancy which occurs from rounding twice, + // such that rounding from the perspective of `d` just being in front of `c` (like in the actual + // operation which only rounds once) will give a different result than rounding `d` then + // rounding again to single precision. + // There are a few ways which this discrepancy from rounding twice can be caused, with all + // of them relating to rounding to nearest ties even: + // 1. Tying down to even because `c` is too small + // a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties) + // b. The lowest bit of `f` is 0 (this means it ties to even downwards) + // c. `c` is positive (nonzero) and does not round `d` upwards + // - This means while a single round would round up, + // instead this rounds down because of tying to even. + // 2. Tying up because `d` rounded up + // a. The highest bit of `d` is 0, the rest of the bits of `d` are 1 + // b. The lowest bit of `f` is 1 (this means it ties to even upwards) + // c. `c` is positive, and the highest bit of c is 1 + // - This will cause `d` to round to 100...00, meaning it will tie then round upwards. + // 3. Tying up to even because `c` is too small + // a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties) + // b. The lowest bit of `f` is 1 (this means it ties to even upwards) + // c. `c` is negative and does not round `d` downwards + // - This is similar to the first one but in reverse, rounding up instead of down. + // 4. Tying down because `d` rounded down + // a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0 + // b. The lowest bit of `f` is 0 (this means it ties to even downwards) + // c. 
`c` is negative, and the highest bit of c is 1, + // and at least one other bit of c is nonzero + // - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00, + // where the tie down will cause it to round down instead of up. + // + // The first values found which were shown to definitively cause issues appeared + // in Mario Strikers Charged, where: + // a = 0x42480000 (50.0) + // c = 0xbc88cc38 (-0.01669894158840179443359375) + // b = 0x1b1c72a0 + // (1.294105489087172032066277841712287344222431784146465361118316650390625 * 10^-22) + // + // Performing the FMADDS we get: + // 1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exp + // +/- c + // Result = -(1.1010101101111110001011110000000000000000000000000000 * 2^-1 + // - 1.001110001110010101 * 2^-73) + // This exactly matches case 3 as shown above, so while the result should be 0xbf55bf17, + // Dolphin was returning 0xbf55bf18! + // Due to being able to choose any value of `c` easily to counter the value of `d`, + // it's not particularly difficult to make your own examples as well, + // but of course these happening in practice is going to be absurdly uncommon most of the time. + // + // Currently Dolphin supports: + // - Correct ordering of NaN checking (for both double and single precision) + // - Rounding frC up + // - Rounding only once for single precision inputs (this will be the large majority of cases!) + // - Currently this is interpreter-only. + // This can be implemented in the JIT just as easily, though. + // Eventually the JITs should hopefully support detecting back to back + // single-precision operations, which will lead to no overhead at all. + // In the cases where JITs can't do this, an alternative method is used, as + // is done in the interpreter as well. 
+ // - Rounding only once for double precision inputs + // - This is a side effect of how we handle single-precision inputs: By doing + // error calculations rather than checking if every input is a float, we ensure that we know + // at the very least the rounding direction that was taken and that would need to be taken. + // + // Currently it does not support: + // - Handling frC overflowing to an unreachable value + // - This is simple enough to check for and handle properly, but the likelihood of it occurring + // is so low that it's not worth it to check for it for the rare accuracy improvement. + // - Dealing with every 64-bit subnormal possibility correctly + // - Double precision subnormals are what cause requiring more precision than double precision + // to be a thing if you want a correct implementation for every possible input. + // If a case where this was necessary came up it'd just be more worth it to fall back to + // a software floating point implementation instead. + // + // All of these can be resolved in a software float emulation method, or by using things such as + // error-free float algorithms, but the nature of both of these leads to incredible speed costs, + // and with the extreme rarity of anything beyond what's currently handled mattering, the other + // cases don't seem to have any reason to be implemented as of now. + + FPResult result; + + // In double precision, just doing the normal operation will be exact with no issues. + if (!single) + { + result.value = std::fma(a, c, sub ? -b : b); + } + else + { + // For single precision inputs, we never actually cast to a float -- we instead compute the + // result using a 64-bit FMA, and if the bits end up being an even tie when converting to + // a float, we approximate (for single-precision-only inputs this will be exact) the + // amount rounded by the FMA, and use that to manually fix which direction we round! + // We of course still properly round `c` first, though. 
+ const double c_round = Force25Bit(c); + + // First, we compute the 64-bit FMA forwards + const double b_sign = sub ? -b : b; + result.value = std::fma(a, c_round, b_sign); + + // We then check if we're currently tying in rounding direction + const u64 result_bits = std::bit_cast(result.value); + + // The mask of the `d` bits as shown in the above comments + const u64 D_MASK = 0x000000001fffffff; + // The mask of `d` which would force a tie to even, which is the only case where there + // can potentially be differences compared to just casting to an f32 directly. + const u64 EVEN_TIE = 0x0000000010000000; + + // Because we check this entire mask which includes a 1 bit, we can be sure that + // if this result passes, the input is not an infinity that would become a NaN. + // This means that, for the JITs, if they only wanted to check for a subset of these + // bits (e.g. only checking if the last one was 0), then using the zero flag for a branch, + // they would have to check if the result was NaN before here. + if ((result_bits & D_MASK) == EVEN_TIE) + { + // Because we have a tie, we now compute any error in the FMA calculation + // via an error-free transformation (Ole Møller's 2Sum algorithm) + // s := a + b + // a' := s - b + // b' := s - a' + // da := a - a' + // db := b - b' + // t := da + db + // But for these calculations, we assume "a" := a * c_round, allowing the usage of FMA, + // both being shorter and allowing for likely necessary increased precision! + // We also switch up the signs a bit so we don't introduce an instruction simply to + // negate one of the operands of an FMA + const double a_prime = b_sign - result.value; + const double b_prime = result.value + a_prime; + const double delta_a = std::fma(a, c_round, a_prime); + const double delta_b = b_sign - b_prime; + const double error = delta_a + delta_b; + + // `error` will properly match the direction for rounding *even for 64-bit inputs*. 
+ // Thoroughly proving that this works even for all normal values isn't entirely trivial, + // nor are the exact details really important, but the basic logic is: + // result.value = roundf64(a * c_round + b_sign) = a * c_round + b_sign - e0 + // a_prime = roundf64(b_sign - a * c_round - b_sign + e0) + // = -a * c_round + e0 - e1 + // b_prime = roundf64(a * c_round + b_sign - e0 - a * c_round + e0 - e1) + // = b_sign - e1 - e2 + // delta_a = roundf64(a * c_round - a * c_round + e0 - e1) + // = e0 - e1 - e3 + // delta_b = roundf64(b_sign - b_sign + e1 + e2) + // = e1 + e2 - e4 + // error = roundf64(delta_a + delta_b) + // = roundf64(e0 + e2 - e3 - e4) + // Then showing that e2 - e3 - e4 is tiny enough to not change the sign of + // e0 (the true error value, as `error` can't capture all of the possible precision), + // including that if the true e0 = 0 then e1 = e2 = e3 = e4 = error = 0. + + // This "error" value represents the number such that `result.value + error == exact_result`. + if (error != 0.0) + { + // Because the error is nonzero here, we actually do need to round a specific direction + // and don't want to just tie to even! + + // Note that it should never be possible for the error to be NaN if the result isn't either + // infinite or NaN itself. It would require: + // da == inf, db == -inf + // Which expanded out is: + // a - ((a + b) - b) == inf, b - ((a + b) - ((a + b) - b)) == -inf, where + // a + b isn't infinite. This means (a + b) - b must be infinite on the left, + // but this will end up giving the right hand side the same sign of infinity. + // All this to say we don't check for `if (!std::isnan(error))` for the `else` statement. + // Also note that we do not cast to a float here, + // as individual instructions using this function will on their own afterwards. + + if ((error > 0.0) == (result.value > 0.0)) + result.value = std::bit_cast(result_bits + 1); // Tie is too small, round up. 
+ else + result.value = std::bit_cast(result_bits - 1); // Tie is too large, round down. + } + } + } if (std::isnan(result.value)) { @@ -321,42 +532,16 @@ inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, do return result; } +template +inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b) +{ + return NI_madd_msub(ppc_state, a, c, b); +} + +template inline FPResult NI_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b) { - FPResult result{std::fma(a, c, -b)}; - - if (std::isnan(result.value)) - { - if (Common::IsSNAN(a) || Common::IsSNAN(b) || Common::IsSNAN(c)) - result.SetException(ppc_state, FPSCR_VXSNAN); - - ppc_state.fpscr.ClearFIFR(); - - if (std::isnan(a)) - { - result.value = Common::MakeQuiet(a); - return result; - } - if (std::isnan(b)) - { - result.value = Common::MakeQuiet(b); // ! - return result; - } - if (std::isnan(c)) - { - result.value = Common::MakeQuiet(c); - return result; - } - - result.SetException(ppc_state, std::isnan(a * c) ? 
FPSCR_VXIMZ : FPSCR_VXISI); - result.value = PPC_NAN; - return result; - } - - if (std::isinf(a) || std::isinf(b) || std::isinf(c)) - ppc_state.fpscr.ClearFIFR(); - - return result; + return NI_madd_msub(ppc_state, a, c, b); } // used by stfsXX instructions and ps_rsqrte diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp index b1dbbf0cc1..134cae495c 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp @@ -370,11 +370,11 @@ void Interpreter::fmulsx(Interpreter& interpreter, UGeckoInstruction inst) const auto& c = ppc_state.ps[inst.FC]; const double c_value = Force25Bit(c.PS0AsDouble()); - const FPResult d_value = NI_mul(ppc_state, a.PS0AsDouble(), c_value); + const FPResult product = NI_mul(ppc_state, a.PS0AsDouble(), c_value); - if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions()) + if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { - const float result = ForceSingle(ppc_state.fpscr, d_value.value); + const float result = ForceSingle(ppc_state.fpscr, product.value); ppc_state.ps[inst.FD].Fill(result); ppc_state.fpscr.FI = 0; @@ -392,7 +392,9 @@ void Interpreter::fmaddx(Interpreter& interpreter, UGeckoInstruction inst) const auto& a = ppc_state.ps[inst.FA]; const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); + + const FPResult product = + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { @@ -412,15 +414,15 @@ void Interpreter::fmaddsx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c_value = Force25Bit(c.PS0AsDouble()); - const FPResult 
d_value = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble()); + const FPResult product = + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); - if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions()) + if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { - const float result = ForceSingle(ppc_state.fpscr, d_value.value); + const float result = ForceSingle(ppc_state.fpscr, product.value); ppc_state.ps[inst.FD].Fill(result); - ppc_state.fpscr.FI = d_value.value != result; + ppc_state.fpscr.FI = product.value != result; ppc_state.fpscr.FR = 0; ppc_state.UpdateFPRFSingle(result); } @@ -602,7 +604,8 @@ void Interpreter::fmsubx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); + const FPResult product = + NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { @@ -622,8 +625,8 @@ void Interpreter::fmsubsx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c_value = Force25Bit(c.PS0AsDouble()); - const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble()); + const FPResult product = + NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { @@ -643,7 +646,8 @@ void Interpreter::fnmaddx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); + const FPResult product = + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) 
{ @@ -665,8 +669,8 @@ void Interpreter::fnmaddsx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c_value = Force25Bit(c.PS0AsDouble()); - const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble()); + const FPResult product = + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { @@ -688,7 +692,8 @@ void Interpreter::fnmsubx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); + const FPResult product = + NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { @@ -710,8 +715,8 @@ void Interpreter::fnmsubsx(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c_value = Force25Bit(c.PS0AsDouble()); - const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble()); + const FPResult product = + NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()); if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions()) { diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp index 69048d4530..85d919195b 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp @@ -263,13 +263,12 @@ void Interpreter::ps_msub(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c0 = Force25Bit(c.PS0AsDouble()); - const double c1 = Force25Bit(c.PS1AsDouble()); - - const 
float ps0 = - ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const float ps1 = - ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle( + ppc_state.fpscr, + NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle( + ppc_state.fpscr, + NI_msub(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value); ppc_state.ps[inst.FD].SetBoth(ps0, ps1); ppc_state.UpdateFPRFSingle(ps0); @@ -285,13 +284,12 @@ void Interpreter::ps_madd(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c0 = Force25Bit(c.PS0AsDouble()); - const double c1 = Force25Bit(c.PS1AsDouble()); - - const float ps0 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const float ps1 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value); ppc_state.ps[inst.FD].SetBoth(ps0, ps1); ppc_state.UpdateFPRFSingle(ps0); @@ -307,13 +305,12 @@ void Interpreter::ps_nmsub(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c0 = Force25Bit(c.PS0AsDouble()); - const double c1 = Force25Bit(c.PS1AsDouble()); - - const float tmp0 = - ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const float tmp1 = - ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float tmp0 = ForceSingle( + ppc_state.fpscr, + NI_msub(ppc_state, 
a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value); + const float tmp1 = ForceSingle( + ppc_state.fpscr, + NI_msub(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value); const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1; @@ -332,13 +329,12 @@ void Interpreter::ps_nmadd(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c0 = Force25Bit(c.PS0AsDouble()); - const double c1 = Force25Bit(c.PS1AsDouble()); - - const float tmp0 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const float tmp1 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float tmp0 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value); + const float tmp1 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value); const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; const float ps1 = std::isnan(tmp1) ? 
tmp1 : -tmp1; @@ -427,11 +423,12 @@ void Interpreter::ps_madds0(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c0 = Force25Bit(c.PS0AsDouble()); - const float ps0 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const float ps1 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c0, b.PS1AsDouble()).value); + const float ps0 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS1AsDouble(), c.PS0AsDouble(), b.PS1AsDouble()).value); ppc_state.ps[inst.FD].SetBoth(ps0, ps1); ppc_state.UpdateFPRFSingle(ps0); @@ -447,11 +444,12 @@ void Interpreter::ps_madds1(Interpreter& interpreter, UGeckoInstruction inst) const auto& b = ppc_state.ps[inst.FB]; const auto& c = ppc_state.ps[inst.FC]; - const double c1 = Force25Bit(c.PS1AsDouble()); - const float ps0 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c1, b.PS0AsDouble()).value); - const float ps1 = - ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS0AsDouble(), c.PS1AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle( + ppc_state.fpscr, + NI_madd(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value); ppc_state.ps[inst.FD].SetBoth(ps0, ps1); ppc_state.UpdateFPRFSingle(ps0);