mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-09-11 10:40:58 +02:00
Merge pull request #13868 from Geotale/master
Fix Single-Precision-Only Inputs to FMAs Instructions in Interpreter
This commit is contained in:
@@ -280,12 +280,223 @@ inline FPResult NI_sub(PowerPC::PowerPCState& ppc_state, double a, double b)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// FMA instructions on PowerPC are weird:
|
// The bulk work for fm(add/sub)(s)x operations
|
||||||
// They calculate (a * c) + b, but the order in which
|
template <bool sub, bool single>
|
||||||
// inputs are checked for NaN is still a, b, c.
|
inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
|
||||||
inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
|
|
||||||
{
|
{
|
||||||
FPResult result{std::fma(a, c, b)};
|
// The FMA instructions on PowerPC are incredibly weird, and the single precision variations are
|
||||||
|
// the only ones with the unfortunate side effect of completely accurately emulating them
|
||||||
|
// requiring either software floats or the manual checking of the individual float
|
||||||
|
// operations performed, down to a precision greater than that of subnormals.
|
||||||
|
// The first oddity to be found is that they calculate (a * c) + b, but in the case of NaNs,
|
||||||
|
// they're still checked in the order a, b, c.
|
||||||
|
// The rest generally come from the single precision variation.
|
||||||
|
// 1. The arguments are *not* forced to 32-bit precision in all ways, meaning you can end
|
||||||
|
// up with results that rely on a 64-bit precision mantissa.
|
||||||
|
// 2. The only argument which *is* forced to 32-bit precision in a way is frC, and even that
|
||||||
|
// is only forced to have a 32-bit mantissa.
|
||||||
|
// 3. frC is forced to have a 32-bit mantissa (rounding in the same way no matter what the
|
||||||
|
// rounding mode is set to), while keeping the exponent at any 64-bit value.
|
||||||
|
// But rather than the highest values rounding to infinity, because PowerPC internally uses
|
||||||
|
// a higher precision exponent, it rounds up to a value normally unreachable with even
|
||||||
|
// double precision floats.
|
||||||
|
// 4. CPUs, unsurprisingly, don't tend to support 64-bit float inputs to an operation with a
|
||||||
|
// 32-bit result.
|
||||||
|
// One quirk of PowerPC is that instead of just not caring about handling the precision
|
||||||
|
// in the registers of the operands of single precision instructions, it instead
|
||||||
|
// takes into account that extra precision and *only rounds once* to 32-bit.
|
||||||
|
// This means that you can have double precision inputs such that the result rounds
|
||||||
|
// differently than if you did a double precision FMA and rounded the result to 32-bit.
|
||||||
|
// - What makes FMA so special here is that it's the only basic operation which, upon being
|
||||||
|
// converted to a 64-bit operation then rounded back to a 32-bit result, does *not* give
|
||||||
|
// the same result when rounding to nearest!
|
||||||
|
// Addition, subtraction, multiplication, and division do not have this issue and will give
|
||||||
|
// the correct result for all inputs!
|
||||||
|
// - In fact, rounding to nearest is the *only* rounding mode where it ends up having issues!
|
||||||
|
// The reason being, if the result was to round in one direction, it would
|
||||||
|
// always direct the double precision output in a way such that rounding to single precision
|
||||||
|
// would end up having that same transformation (or it would already be exactly
|
||||||
|
// representable as a single precision value).
|
||||||
|
//
|
||||||
|
// It is relatively easy to find 32-bit values which do not round properly if one performs
|
||||||
|
// f32(fma(f64(a), f64(c), f64(b))), despite the rarity.
|
||||||
|
// The requirements can be shown fairly easily as well:
|
||||||
|
// - Final Result = sign * (1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exponent
|
||||||
|
// + c * 2^(exponent - 52))
|
||||||
|
// What we need is some form of discrepancy which occurs from rounding twice,
|
||||||
|
// such that rounding from the perspective of `d` just being in front of `c` (like in the actual
|
||||||
|
// operation which only rounds once) will give a different result than rounding `d` then
|
||||||
|
// rounding again to single precision.
|
||||||
|
// There are a few ways which this discrepancy from rounding twice can be caused, with all
|
||||||
|
// of them relating to rounding to nearest ties even:
|
||||||
|
// 1. Tying down to even because `c` is too small
|
||||||
|
// a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
|
||||||
|
// b. The lowest bit of `f` is 0 (this means it ties to even downwards)
|
||||||
|
// c. `c` is positive (nonzero) and does not round `d` upwards
|
||||||
|
// - This means while a single round would round up,
|
||||||
|
// instead this rounds down because of tying to even.
|
||||||
|
// 2. Tying up because `d` rounded up
|
||||||
|
// a. The highest bit of `d` is 0, the rest of the bits of `d` are 1
|
||||||
|
// b. The lowest bit of `f` is 1 (this means it ties to even upwards)
|
||||||
|
// c. `c` is positive, and the highest bit of c is 1
|
||||||
|
// - This will cause `d` to round to 100...00, meaning it will tie then round upwards.
|
||||||
|
// 3. Tying up to even because `c` is too small
|
||||||
|
// a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
|
||||||
|
// b. The lowest bit of `f` is 1 (this means it ties to even upwards)
|
||||||
|
// c. `c` is negative and does not round `d` downwards
|
||||||
|
// - This is similar to the first one but in reverse, rounding up instead of down.
|
||||||
|
// 4. Tying down because `d` rounded down
|
||||||
|
// a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0
|
||||||
|
// b. The lowest bit of `f` is 0 (this means it ties to even downwards)
|
||||||
|
// c. `c` is negative, and the highest bit of c is 1,
|
||||||
|
// and at least one other bit of c is nonzero
|
||||||
|
// - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00,
|
||||||
|
// where the tie down will cause it to round down instead of up.
|
||||||
|
//
|
||||||
|
// The first values found which were shown to definitively cause issues appeared
|
||||||
|
// in Mario Strikers Charged, where:
|
||||||
|
// a = 0x42480000 (50.0)
|
||||||
|
// c = 0xbc88cc38 (-0.01669894158840179443359375)
|
||||||
|
// b = 0x1b1c72a0
|
||||||
|
// (1.294105489087172032066277841712287344222431784146465361118316650390625 * 10^-22)
|
||||||
|
//
|
||||||
|
// Performing the FMADDS we get:
|
||||||
|
// 1.fffffffffffffffffffffffddddddddddddddddddddddddddddd * 2^exp
|
||||||
|
// +/- c
|
||||||
|
// Result = -(1.1010101101111110001011110000000000000000000000000000 * 2^-1
|
||||||
|
// - 1.001110001110010101 * 2^-73)
|
||||||
|
// This exactly matches case 3 as shown above, so while the result should be 0xbf55bf17,
|
||||||
|
// Dolphin was returning 0xbf55bf18!
|
||||||
|
// Due to being able to choose any value of `c` easily to counter the value of `d`,
|
||||||
|
// it's not particularly difficult to make your own examples as well,
|
||||||
|
// but of course these happening in practice is going to be absurdly uncommon most of the time.
|
||||||
|
//
|
||||||
|
// Currently Dolphin supports:
|
||||||
|
// - Correct ordering of NaN checking (for both double and single precision)
|
||||||
|
// - Rounding frC up
|
||||||
|
// - Rounding only once for single precision inputs (this will be the large majority of cases!)
|
||||||
|
// - Currently this is interpreter-only.
|
||||||
|
// This can be implemented in the JIT just as easily, though.
|
||||||
|
// Eventually the JITs should hopefully support detecting back to back
|
||||||
|
// single-precision operations, which will lead to no overhead at all.
|
||||||
|
// In the cases where JITs can't do this, an alternative method is used, as
|
||||||
|
// is done in the interpreter as well.
|
||||||
|
// - Rounding only once for double precision inputs
|
||||||
|
// - This is a side effect of how we handle single-precision inputs: By doing
|
||||||
|
// error calculations rather than checking if every input is a float, we ensure that we know
|
||||||
|
// at the very least the rounding direction that was taken and that would need to be taken.
|
||||||
|
//
|
||||||
|
// Currently it does not support:
|
||||||
|
// - Handling frC overflowing to an unreachable value
|
||||||
|
// - This is simple enough to check for and handle properly, but the likelihood of it occurring
|
||||||
|
// is so low that it's not worth it to check for it for the rare accuracy improvement.
|
||||||
|
// - Dealing with every 64-bit subnormal possibility correctly
|
||||||
|
// - Double precision subnormals are what cause requiring more precision than double precision
|
||||||
|
// to be a thing if you want a correct implementation for every possible input.
|
||||||
|
// If a case where this was necessary came up it'd just be more worth it to fall back to
|
||||||
|
// a software floating point implementation instead.
|
||||||
|
//
|
||||||
|
// All of these can be resolved in a software float emulation method, or by using things such as
|
||||||
|
// error-free float algorithms, but the nature of both of these lead to incredible speed costs,
|
||||||
|
// and with the extreme rarity of anything beyond what's currently handled mattering, the other
|
||||||
|
// cases don't seem to have any reason to be implemented as of now.
|
||||||
|
|
||||||
|
FPResult result;
|
||||||
|
|
||||||
|
// In double precision, just doing the normal operation will be exact with no issues.
|
||||||
|
if (!single)
|
||||||
|
{
|
||||||
|
result.value = std::fma(a, c, sub ? -b : b);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// For single precision inputs, we never actually cast to a float -- we instead compute the
|
||||||
|
// result using a 64-bit FMA, and if the bits end up being an even tie when converting to
|
||||||
|
// a float, we approximate (for single-precision-only inputs this will be exact) the
|
||||||
|
// amount rounded by the FMA, and use that to manually fix which direction we round!
|
||||||
|
// We of course still properly round `c` first, though.
|
||||||
|
const double c_round = Force25Bit(c);
|
||||||
|
|
||||||
|
// First, we compute the 64-bit FMA forwards
|
||||||
|
const double b_sign = sub ? -b : b;
|
||||||
|
result.value = std::fma(a, c_round, b_sign);
|
||||||
|
|
||||||
|
// We then check if we're currently tying in rounding direction
|
||||||
|
const u64 result_bits = std::bit_cast<u64>(result.value);
|
||||||
|
|
||||||
|
// The mask of the `d` bits as shown in the above comments
|
||||||
|
const u64 D_MASK = 0x000000001fffffff;
|
||||||
|
// The mask of `d` which would force a tie to even, which is the only case where there
|
||||||
|
// can potentially be differences compared to just casting to an f32 directly.
|
||||||
|
const u64 EVEN_TIE = 0x0000000010000000;
|
||||||
|
|
||||||
|
// Because we check this entire mask which includes a 1 bit, we can be sure that
|
||||||
|
// if this result passes, the input is not an infinity that would become a NaN.
|
||||||
|
// This means that, for the JITs, if they only wanted to check for a subset of these
|
||||||
|
// bits (e.g. only checking if the last one was 0), then using the zero flag for a branch,
|
||||||
|
// they would have to check if the result was NaN before here.
|
||||||
|
if ((result_bits & D_MASK) == EVEN_TIE)
|
||||||
|
{
|
||||||
|
// Because we have a tie, we now compute any error in the FMA calculation
|
||||||
|
// via an error-free transformation (Ole Møller's 2Sum algorithm)
|
||||||
|
// s := a + b
|
||||||
|
// a' := s - b
|
||||||
|
// b' := s - a'
|
||||||
|
// da := a - a'
|
||||||
|
// db := b - b'
|
||||||
|
// t := da + db
|
||||||
|
// But for these calculations, we assume "a" := a * c_round, allowing the usage of FMA,
|
||||||
|
// both being shorter and allowing for likely necessary increased precision!
|
||||||
|
// We also switch up the signs a bit so we don't introduce an instruction simply to
|
||||||
|
// negate one of the operands of an FMA
|
||||||
|
const double a_prime = b_sign - result.value;
|
||||||
|
const double b_prime = result.value + a_prime;
|
||||||
|
const double delta_a = std::fma(a, c_round, a_prime);
|
||||||
|
const double delta_b = b_sign - b_prime;
|
||||||
|
const double error = delta_a + delta_b;
|
||||||
|
|
||||||
|
// `error` will properly match the direction for rounding *even for 64-bit inputs*.
|
||||||
|
// Thoroughly proving that this works for even all normal values isn't entirely trivial,
|
||||||
|
// nor are the exact details really important, but the basic logic is:
|
||||||
|
// result.value = roundf64(a * c_round + b_sign) = a * c_round + b_sign - e0
|
||||||
|
// a_prime = roundf64(b_sign - a * c_round - b_sign + e0)
|
||||||
|
// = -a * c_round + e0 - e1
|
||||||
|
// b_prime = roundf64(a * c_round + b_sign - e0 - a * c_round + e0 - e1)
|
||||||
|
// = b_sign - e1 - e2
|
||||||
|
// delta_a = roundf64(a * c_round - a * c_round + e0 - e1)
|
||||||
|
// = e0 - e1 - e3
|
||||||
|
// delta_b = roundf64(b_sign - b_sign + e1 + e2)
|
||||||
|
// = e1 + e2 - e4
|
||||||
|
// error = roundf64(delta_a + delta_b)
|
||||||
|
// = roundf64(e0 + e2 - e3 - e4)
|
||||||
|
// Then showing that e2 - e3 - e4 is tiny enough to not change the sign of
|
||||||
|
// e0 (the true error value, as `error` can't capture all of the possible precision),
|
||||||
|
// including that if the true e0 = 0 then e1 = e2 = e3 = e4 = error = 0.
|
||||||
|
|
||||||
|
// This "error" value represents the number such that `result.value + error == exact_result`.
|
||||||
|
if (error != 0.0)
|
||||||
|
{
|
||||||
|
// Because the error is nonzero here, we actually do need to round a specific direction
|
||||||
|
// and don't want to just tie to even!
|
||||||
|
|
||||||
|
// Note that it should never be possible for the error to be NaN if the result isn't either
|
||||||
|
// infinite or NaN itself. It would require:
|
||||||
|
// da == inf, db == -inf
|
||||||
|
// Which expanded out is:
|
||||||
|
// a - ((a + b) - b) == inf, b - ((a + b) - ((a + b) - b)) == -inf, where
|
||||||
|
// a + b isn't infinite. This means (a + b) - b must be infinite on the left,
|
||||||
|
// but this will end up giving the right hand side the same sign of infinity.
|
||||||
|
// All this to say we don't check for `if (!std::isnan(error))` for the `else` statement.
|
||||||
|
// Also note that we do not cast to a float here,
|
||||||
|
// as individual instructions using this function will on their own afterwards.
|
||||||
|
|
||||||
|
if ((error > 0.0) == (result.value > 0.0))
|
||||||
|
result.value = std::bit_cast<double>(result_bits + 1); // Tie is too small, round up.
|
||||||
|
else
|
||||||
|
result.value = std::bit_cast<double>(result_bits - 1); // Tie is too large, round down.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
if (std::isnan(result.value))
|
||||||
{
|
{
|
||||||
@@ -321,42 +532,16 @@ inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, do
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool single>
|
||||||
|
inline FPResult NI_madd(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
|
||||||
|
{
|
||||||
|
return NI_madd_msub<false, single>(ppc_state, a, c, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool single>
|
||||||
inline FPResult NI_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
|
inline FPResult NI_msub(PowerPC::PowerPCState& ppc_state, double a, double c, double b)
|
||||||
{
|
{
|
||||||
FPResult result{std::fma(a, c, -b)};
|
return NI_madd_msub<true, single>(ppc_state, a, c, b);
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
|
||||||
{
|
|
||||||
if (Common::IsSNAN(a) || Common::IsSNAN(b) || Common::IsSNAN(c))
|
|
||||||
result.SetException(ppc_state, FPSCR_VXSNAN);
|
|
||||||
|
|
||||||
ppc_state.fpscr.ClearFIFR();
|
|
||||||
|
|
||||||
if (std::isnan(a))
|
|
||||||
{
|
|
||||||
result.value = Common::MakeQuiet(a);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
if (std::isnan(b))
|
|
||||||
{
|
|
||||||
result.value = Common::MakeQuiet(b); // !
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
if (std::isnan(c))
|
|
||||||
{
|
|
||||||
result.value = Common::MakeQuiet(c);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.SetException(ppc_state, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
|
|
||||||
result.value = PPC_NAN;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (std::isinf(a) || std::isinf(b) || std::isinf(c))
|
|
||||||
ppc_state.fpscr.ClearFIFR();
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// used by stfsXX instructions and ps_rsqrte
|
// used by stfsXX instructions and ps_rsqrte
|
||||||
|
@@ -370,11 +370,11 @@ void Interpreter::fmulsx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c_value = Force25Bit(c.PS0AsDouble());
|
const double c_value = Force25Bit(c.PS0AsDouble());
|
||||||
const FPResult d_value = NI_mul(ppc_state, a.PS0AsDouble(), c_value);
|
const FPResult product = NI_mul(ppc_state, a.PS0AsDouble(), c_value);
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
const float result = ForceSingle(ppc_state.fpscr, d_value.value);
|
const float result = ForceSingle(ppc_state.fpscr, product.value);
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].Fill(result);
|
ppc_state.ps[inst.FD].Fill(result);
|
||||||
ppc_state.fpscr.FI = 0;
|
ppc_state.fpscr.FI = 0;
|
||||||
@@ -392,7 +392,9 @@ void Interpreter::fmaddx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& a = ppc_state.ps[inst.FA];
|
const auto& a = ppc_state.ps[inst.FA];
|
||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
|
||||||
|
const FPResult product =
|
||||||
|
NI_madd<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -412,15 +414,15 @@ void Interpreter::fmaddsx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c_value = Force25Bit(c.PS0AsDouble());
|
const FPResult product =
|
||||||
const FPResult d_value = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || d_value.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
const float result = ForceSingle(ppc_state.fpscr, d_value.value);
|
const float result = ForceSingle(ppc_state.fpscr, product.value);
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].Fill(result);
|
ppc_state.ps[inst.FD].Fill(result);
|
||||||
ppc_state.fpscr.FI = d_value.value != result;
|
ppc_state.fpscr.FI = product.value != result;
|
||||||
ppc_state.fpscr.FR = 0;
|
ppc_state.fpscr.FR = 0;
|
||||||
ppc_state.UpdateFPRFSingle(result);
|
ppc_state.UpdateFPRFSingle(result);
|
||||||
}
|
}
|
||||||
@@ -602,7 +604,8 @@ void Interpreter::fmsubx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
const FPResult product =
|
||||||
|
NI_msub<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -622,8 +625,8 @@ void Interpreter::fmsubsx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c_value = Force25Bit(c.PS0AsDouble());
|
const FPResult product =
|
||||||
const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
|
NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -643,7 +646,8 @@ void Interpreter::fnmaddx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
const FPResult product =
|
||||||
|
NI_madd<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -665,8 +669,8 @@ void Interpreter::fnmaddsx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c_value = Force25Bit(c.PS0AsDouble());
|
const FPResult product =
|
||||||
const FPResult product = NI_madd(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -688,7 +692,8 @@ void Interpreter::fnmsubx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
const FPResult product =
|
||||||
|
NI_msub<false>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
@@ -710,8 +715,8 @@ void Interpreter::fnmsubsx(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c_value = Force25Bit(c.PS0AsDouble());
|
const FPResult product =
|
||||||
const FPResult product = NI_msub(ppc_state, a.PS0AsDouble(), c_value, b.PS0AsDouble());
|
NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble());
|
||||||
|
|
||||||
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
if (ppc_state.fpscr.VE == 0 || product.HasNoInvalidExceptions())
|
||||||
{
|
{
|
||||||
|
@@ -263,13 +263,12 @@ void Interpreter::ps_msub(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c0 = Force25Bit(c.PS0AsDouble());
|
const float ps0 = ForceSingle(
|
||||||
const double c1 = Force25Bit(c.PS1AsDouble());
|
ppc_state.fpscr,
|
||||||
|
NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float ps0 =
|
const float ps1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
const float ps1 =
|
NI_msub<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
|
||||||
ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
|
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
||||||
ppc_state.UpdateFPRFSingle(ps0);
|
ppc_state.UpdateFPRFSingle(ps0);
|
||||||
@@ -285,13 +284,12 @@ void Interpreter::ps_madd(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c0 = Force25Bit(c.PS0AsDouble());
|
const float ps0 = ForceSingle(
|
||||||
const double c1 = Force25Bit(c.PS1AsDouble());
|
ppc_state.fpscr,
|
||||||
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float ps0 =
|
const float ps1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
const float ps1 =
|
NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
|
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
||||||
ppc_state.UpdateFPRFSingle(ps0);
|
ppc_state.UpdateFPRFSingle(ps0);
|
||||||
@@ -307,13 +305,12 @@ void Interpreter::ps_nmsub(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c0 = Force25Bit(c.PS0AsDouble());
|
const float tmp0 = ForceSingle(
|
||||||
const double c1 = Force25Bit(c.PS1AsDouble());
|
ppc_state.fpscr,
|
||||||
|
NI_msub<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float tmp0 =
|
const float tmp1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
const float tmp1 =
|
NI_msub<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
|
||||||
ForceSingle(ppc_state.fpscr, NI_msub(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
|
|
||||||
|
|
||||||
const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
|
const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
|
||||||
const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
|
const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
|
||||||
@@ -332,13 +329,12 @@ void Interpreter::ps_nmadd(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c0 = Force25Bit(c.PS0AsDouble());
|
const float tmp0 = ForceSingle(
|
||||||
const double c1 = Force25Bit(c.PS1AsDouble());
|
ppc_state.fpscr,
|
||||||
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float tmp0 =
|
const float tmp1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
const float tmp1 =
|
NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
|
|
||||||
|
|
||||||
const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
|
const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0;
|
||||||
const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
|
const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1;
|
||||||
@@ -427,11 +423,12 @@ void Interpreter::ps_madds0(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c0 = Force25Bit(c.PS0AsDouble());
|
const float ps0 = ForceSingle(
|
||||||
const float ps0 =
|
ppc_state.fpscr,
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c0, b.PS0AsDouble()).value);
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS0AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float ps1 =
|
const float ps1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c0, b.PS1AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
|
NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS0AsDouble(), b.PS1AsDouble()).value);
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
||||||
ppc_state.UpdateFPRFSingle(ps0);
|
ppc_state.UpdateFPRFSingle(ps0);
|
||||||
@@ -447,11 +444,12 @@ void Interpreter::ps_madds1(Interpreter& interpreter, UGeckoInstruction inst)
|
|||||||
const auto& b = ppc_state.ps[inst.FB];
|
const auto& b = ppc_state.ps[inst.FB];
|
||||||
const auto& c = ppc_state.ps[inst.FC];
|
const auto& c = ppc_state.ps[inst.FC];
|
||||||
|
|
||||||
const double c1 = Force25Bit(c.PS1AsDouble());
|
const float ps0 = ForceSingle(
|
||||||
const float ps0 =
|
ppc_state.fpscr,
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS0AsDouble(), c1, b.PS0AsDouble()).value);
|
NI_madd<true>(ppc_state, a.PS0AsDouble(), c.PS1AsDouble(), b.PS0AsDouble()).value);
|
||||||
const float ps1 =
|
const float ps1 = ForceSingle(
|
||||||
ForceSingle(ppc_state.fpscr, NI_madd(ppc_state, a.PS1AsDouble(), c1, b.PS1AsDouble()).value);
|
ppc_state.fpscr,
|
||||||
|
NI_madd<true>(ppc_state, a.PS1AsDouble(), c.PS1AsDouble(), b.PS1AsDouble()).value);
|
||||||
|
|
||||||
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
ppc_state.ps[inst.FD].SetBoth(ps0, ps1);
|
||||||
ppc_state.UpdateFPRFSingle(ps0);
|
ppc_state.UpdateFPRFSingle(ps0);
|
||||||
|
Reference in New Issue
Block a user