From 2454bd5ba67acfa5df99a50528c29146455438ce Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 25 Feb 2021 23:22:17 +0100 Subject: [PATCH 01/10] Jit64: Add optional argument to GenerateOverflow This allows setting the overflow flag based on any condition code. Defaults to NO (no overflow). --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index c1237463d8..358e7a8ea6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -96,7 +96,7 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); - void GenerateOverflow(); + void GenerateOverflow(Gen::CCFlags cond = Gen::CCFlags::CC_NO); void FinalizeCarryOverflow(bool oe, bool inv = false); void FinalizeCarry(Gen::CCFlags cond); void FinalizeCarry(bool ca); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 35df5a06a5..5c6f2cc17d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -42,9 +42,9 @@ void Jit64::GenerateConstantOverflow(bool overflow) } // We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. -void Jit64::GenerateOverflow() +void Jit64::GenerateOverflow(Gen::CCFlags cond) { - FixupBranch jno = J_CC(CC_NO); + FixupBranch jno = J_CC(cond); // XER[OV/SO] = 1 MOV(8, PPCSTATE(xer_so_ov), Imm8(XER_OV_MASK | XER_SO_MASK)); FixupBranch exit = J(); From c081e3f2b35fc5e80ef6d14aa28fcaeb26d9e9af Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sat, 27 Feb 2021 11:30:59 +0100 Subject: [PATCH 02/10] Jit64: divwx - Optimize constant dividend When the dividend is known at compile time, we can eliminate some of the branching and precompute the result for the overflow case. 
Before: B8 54 D3 E6 02 mov eax,2E6D354h 85 FF test edi,edi 74 0C je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0C jne normal_path 83 FF FF cmp edi,0FFFFFFFFh 75 07 jne normal_path overflow: C1 F8 1F sar eax,1Fh 8B F8 mov edi,eax EB 05 jmp done normal_path: 99 cdq F7 FF idiv eax,edi 8B F8 mov edi,eax done: After: 85 FF test edi,edi 75 04 jne normal_path 33 FF xor edi,edi EB 0A jmp done normal_path: B8 54 D3 E6 02 mov eax,2E6D354h 99 cdq F7 FF idiv eax,edi 8B F8 mov edi,eax done: Fairly common with constant dividend of zero. Non-zero values occur frequently in Ocarina of Time Master Quest. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 5c6f2cc17d..fd6feace7d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1340,6 +1340,63 @@ void Jit64::divwx(UGeckoInstruction inst) GenerateConstantOverflow(false); } } + else if (gpr.IsImm(a)) + { + // Constant dividend + const u32 dividend = gpr.Imm32(a); + + RCX64Reg Rb = gpr.Bind(b, RCMode::Read); + RCX64Reg Rd = gpr.Bind(d, RCMode::Write); + // no register choice + RCX64Reg eax = gpr.Scratch(EAX); + RCX64Reg edx = gpr.Scratch(EDX); + RegCache::Realize(Rb, Rd, eax, edx); + + // Check for divisor == 0 + TEST(32, Rb, Rb); + + FixupBranch normal_path; + + if (dividend == 0x80000000) + { + // Divisor is 0, proceed to overflow case + const FixupBranch overflow = J_CC(CC_Z); + // Otherwise, check for divisor == -1 + CMP(32, Rb, Imm32(0xFFFFFFFF)); + normal_path = J_CC(CC_NE); + + SetJumpTarget(overflow); + } + else + { + // Divisor is not 0, take normal path + normal_path = J_CC(CC_NZ); + // Otherwise, proceed to overflow case + } + + // Set Rd to all ones or all zeroes + if (dividend & 0x80000000) + MOV(32, Rd, Imm32(0xFFFFFFFF)); + else + XOR(32, Rd, Rd); + + if (inst.OE) + 
GenerateConstantOverflow(true); + + const FixupBranch done = J(); + + SetJumpTarget(normal_path); + + MOV(32, eax, Imm32(dividend)); + CDQ(); + IDIV(32, Rb); + MOV(32, Rd, eax); + + if (inst.OE) + GenerateConstantOverflow(false); + + SetJumpTarget(done); + } else { RCOpArg Ra = gpr.Use(a, RCMode::Read); From c9adc60d730812c1cb09bcce9bf7b4ace91883c3 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sat, 27 Feb 2021 20:51:38 +0100 Subject: [PATCH 03/10] Jit64: divwx - Special case dividend == 0 Zero divided by any number is still zero. For whatever reason, this case shows up frequently too. Before: B8 00 00 00 00 mov eax,0 85 F6 test esi,esi 74 0C je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0C jne normal_path 83 FE FF cmp esi,0FFFFFFFFh 75 07 jne normal_path overflow: C1 F8 1F sar eax,1Fh 8B F8 mov edi,eax EB 05 jmp done normal_path: 99 cdq F7 FE idiv eax,esi 8B F8 mov edi,eax done: After: Nothing! --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 107 ++++++++++-------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index fd6feace7d..88e139abb2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1345,57 +1345,74 @@ void Jit64::divwx(UGeckoInstruction inst) // Constant dividend const u32 dividend = gpr.Imm32(a); - RCX64Reg Rb = gpr.Bind(b, RCMode::Read); - RCX64Reg Rd = gpr.Bind(d, RCMode::Write); - // no register choice - RCX64Reg eax = gpr.Scratch(EAX); - RCX64Reg edx = gpr.Scratch(EDX); - RegCache::Realize(Rb, Rd, eax, edx); - - // Check for divisor == 0 - TEST(32, Rb, Rb); - - FixupBranch normal_path; - - if (dividend == 0x80000000) + if (dividend == 0) { - // Divisor is 0, proceed to overflow case - const FixupBranch overflow = J_CC(CC_Z); - // Otherwise, check for divisor == -1 - CMP(32, Rb, Imm32(0xFFFFFFFF)); - normal_path = J_CC(CC_NE); + if (inst.OE) + { + RCOpArg Rb = gpr.Use(b, 
RCMode::Read); + RegCache::Realize(Rb); - SetJumpTarget(overflow); + CMP_or_TEST(32, Rb, Imm32(0)); + GenerateOverflow(CC_NZ); + } + + // Zero divided by anything is always zero + gpr.SetImmediate32(d, 0); } else { - // Divisor is not 0, take normal path - normal_path = J_CC(CC_NZ); - // Otherwise, proceed to overflow case + RCX64Reg Rb = gpr.Bind(b, RCMode::Read); + RCX64Reg Rd = gpr.Bind(d, RCMode::Write); + // no register choice + RCX64Reg eax = gpr.Scratch(EAX); + RCX64Reg edx = gpr.Scratch(EDX); + RegCache::Realize(Rb, Rd, eax, edx); + + // Check for divisor == 0 + TEST(32, Rb, Rb); + + FixupBranch normal_path; + + if (dividend == 0x80000000) + { + // Divisor is 0, proceed to overflow case + const FixupBranch overflow = J_CC(CC_Z); + // Otherwise, check for divisor == -1 + CMP(32, Rb, Imm32(0xFFFFFFFF)); + normal_path = J_CC(CC_NE); + + SetJumpTarget(overflow); + } + else + { + // Divisor is not 0, take normal path + normal_path = J_CC(CC_NZ); + // Otherwise, proceed to overflow case + } + + // Set Rd to all ones or all zeroes + if (dividend & 0x80000000) + MOV(32, Rd, Imm32(0xFFFFFFFF)); + else + XOR(32, Rd, Rd); + + if (inst.OE) + GenerateConstantOverflow(true); + + const FixupBranch done = J(); + + SetJumpTarget(normal_path); + + MOV(32, eax, Imm32(dividend)); + CDQ(); + IDIV(32, Rb); + MOV(32, Rd, eax); + + if (inst.OE) + GenerateConstantOverflow(false); + + SetJumpTarget(done); } - - // Set Rd to all ones or all zeroes - if (dividend & 0x80000000) - MOV(32, Rd, Imm32(0xFFFFFFFF)); - else - XOR(32, Rd, Rd); - - if (inst.OE) - GenerateConstantOverflow(true); - - const FixupBranch done = J(); - - SetJumpTarget(normal_path); - - MOV(32, eax, Imm32(dividend)); - CDQ(); - IDIV(32, Rb); - MOV(32, Rd, eax); - - if (inst.OE) - GenerateConstantOverflow(false); - - SetJumpTarget(done); } else { From 5bb8798df614fc1b03a69529a9c1b7b0cf6f49b7 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 3 Mar 2021 23:48:28 +0100 Subject: [PATCH 04/10] JitCommon: Signed 32-bit 
division magic constants Add a function to calculate the magic constants required to optimize signed 32-bit division. Since this optimization is not exclusive to any particular architecture, JitCommon seemed like a good place to put this. --- Source/Core/Core/CMakeLists.txt | 2 + .../Core/Core/PowerPC/JitCommon/DivUtils.cpp | 57 +++++++++++++++++++ Source/Core/Core/PowerPC/JitCommon/DivUtils.h | 22 +++++++ Source/Core/DolphinLib.vcxproj | 6 ++ Source/UnitTests/DivUtilsTest.cpp | 33 +++++++++++ Source/UnitTests/UnitTests.vcxproj | 1 + 6 files changed, 121 insertions(+) create mode 100644 Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp create mode 100644 Source/Core/Core/PowerPC/JitCommon/DivUtils.h create mode 100644 Source/UnitTests/DivUtilsTest.cpp diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index 4137b8e23f..d14d998462 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -430,6 +430,8 @@ add_library(core PowerPC/Interpreter/Interpreter_Tables.cpp PowerPC/Interpreter/Interpreter.cpp PowerPC/Interpreter/Interpreter.h + PowerPC/JitCommon/DivUtils.cpp + PowerPC/JitCommon/DivUtils.h PowerPC/JitCommon/JitAsmCommon.cpp PowerPC/JitCommon/JitAsmCommon.h PowerPC/JitCommon/JitBase.cpp diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp b/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp new file mode 100644 index 0000000000..c19b7c8091 --- /dev/null +++ b/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp @@ -0,0 +1,57 @@ +// Copyright 2021 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include <cstdlib> + +#include "Core/PowerPC/JitCommon/DivUtils.h" + +namespace JitCommon +{ +Magic SignedDivisionConstants(s32 d) +{ + const u32 two31 = 2147483648; + + const u32 ad = std::abs(d); + const u32 t = two31 - (d < 0); + const u32 anc = t - 1 - t % ad; + u32 q1 = two31 / anc; + u32 r1 = two31 - q1 * anc; + u32 q2 = two31 / ad; + u32 r2 = two31 - q2 * ad; + + s32 p = 31; + u32 delta; + + do + { + p++; + + q1 *= 2; + r1 *= 2; + if (r1 >= anc) + { + q1++; + r1 -= anc; + } + + q2 *= 2; + r2 *= 2; + if (r2 >= ad) + { + q2++; + r2 -= ad; + } + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + Magic mag; + mag.multiplier = q2 + 1; + if (d < 0) + mag.multiplier = -mag.multiplier; + mag.shift = p - 32; + + return mag; +} + +} // namespace JitCommon diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.h b/Source/Core/Core/PowerPC/JitCommon/DivUtils.h new file mode 100644 index 0000000000..b243e2654b --- /dev/null +++ b/Source/Core/Core/PowerPC/JitCommon/DivUtils.h @@ -0,0 +1,22 @@ +// Copyright 2021 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include "Common/CommonTypes.h" + +namespace JitCommon +{ +struct Magic +{ + s32 multiplier; + u8 shift; +}; + +// Calculate the constants required to optimize a signed 32-bit integer division. +// Taken from The PowerPC Compiler Writer's Guide and LLVM. +// Divisor must not be -1, 0, or 1. 
+Magic SignedDivisionConstants(s32 divisor); + +} // namespace JitCommon diff --git a/Source/Core/DolphinLib.vcxproj b/Source/Core/DolphinLib.vcxproj index 5485304bdd..211b0a5423 100644 --- a/Source/Core/DolphinLib.vcxproj +++ b/Source/Core/DolphinLib.vcxproj @@ -27,6 +27,12 @@ {41279555-f94f-4ebc-99de-af863c10c5c4} + + + + + + \ No newline at end of file diff --git a/Source/UnitTests/DivUtilsTest.cpp b/Source/UnitTests/DivUtilsTest.cpp new file mode 100644 index 0000000000..7802fc8496 --- /dev/null +++ b/Source/UnitTests/DivUtilsTest.cpp @@ -0,0 +1,33 @@ +// Copyright 2021 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include <gtest/gtest.h> + +#include "Core/PowerPC/JitCommon/DivUtils.h" + +using namespace JitCommon; + +TEST(DivUtils, Signed) +{ + Magic m3 = SignedDivisionConstants(3); + Magic m5 = SignedDivisionConstants(5); + Magic m7 = SignedDivisionConstants(7); + Magic minus3 = SignedDivisionConstants(-3); + Magic minus5 = SignedDivisionConstants(-5); + Magic minus7 = SignedDivisionConstants(-7); + + EXPECT_EQ(0x55555556, m3.multiplier); + EXPECT_EQ(0, m3.shift); + EXPECT_EQ(0x66666667, m5.multiplier); + EXPECT_EQ(1, m5.shift); + EXPECT_EQ(-0x6DB6DB6D, m7.multiplier); + EXPECT_EQ(2, m7.shift); + + EXPECT_EQ(-0x55555556, minus3.multiplier); + EXPECT_EQ(0, minus3.shift); + EXPECT_EQ(-0x66666667, minus5.multiplier); + EXPECT_EQ(1, minus5.shift); + EXPECT_EQ(0x6DB6DB6D, minus7.multiplier); + EXPECT_EQ(2, minus7.shift); +} diff --git a/Source/UnitTests/UnitTests.vcxproj b/Source/UnitTests/UnitTests.vcxproj index 230ac50412..5711a8614e 100644 --- a/Source/UnitTests/UnitTests.vcxproj +++ b/Source/UnitTests/UnitTests.vcxproj @@ -68,6 +68,7 @@ + From 95698c5ae1a100cf87fa5f939480bacdf62258c6 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 4 Mar 2021 20:17:50 +0100 Subject: [PATCH 05/10] Jit64: divwx - Optimize constant divisor Optimize division by a constant into multiplication. This method is also used by GCC and LLVM. 
We also add optimized paths for divisors 0, 1, and -1, because they don't work using this method. They don't occur very often, but are necessary for correctness. - Division by 1 Before: 41 BF 01 00 00 00 mov r15d,1 41 8B C5 mov eax,r13d 45 85 FF test r15d,r15d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FF FF cmp r15d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B F8 mov r15d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FF idiv eax,r15d 44 8B F8 mov r15d,eax done: After: 45 8B FD mov r15d,r13d - Division by 30307 Before: 41 BA 63 76 00 00 mov r10d,7663h 41 8B C5 mov eax,r13d 45 85 D2 test r10d,r10d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FA FF cmp r10d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B C0 mov r8d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FA idiv eax,r10d 44 8B C0 mov r8d,eax done: After: 49 63 C5 movsxd rax,r13d 48 69 C0 65 6B 32 45 imul rax,rax,45326B65h 4C 8B C0 mov r8,rax 48 C1 E8 3F shr rax,3Fh 49 C1 F8 2D sar r8,2Dh 44 03 C0 add r8d,eax - Division by 30323 Before: 41 BA 73 76 00 00 mov r10d,7673h 41 8B C5 mov eax,r13d 45 85 D2 test r10d,r10d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FA FF cmp r10d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B C0 mov r8d,eax EB 07 jmp 00000000161737E7 normal_path: 99 cdq 41 F7 FA idiv eax,r10d 44 8B C0 mov r8d,eax done: After: 49 63 C5 movsxd rax,r13d 4C 69 C0 19 25 52 8A imul r8,rax,0FFFFFFFF8A522519h 49 C1 E8 20 shr r8,20h 44 03 C0 add r8d,eax C1 E8 1F shr eax,1Fh 41 C1 F8 0E sar r8d,0Eh 44 03 C0 add r8d,eax --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 88e139abb2..e8a865c63b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -16,10 +16,12 @@ #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/RegCache/JitRegCache.h" #include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h" +#include "Core/PowerPC/JitCommon/DivUtils.h" #include "Core/PowerPC/PPCAnalyst.h" #include "Core/PowerPC/PowerPC.h" using namespace Gen; +using namespace JitCommon; void Jit64::GenerateConstantOverflow(s64 val) { @@ -1414,6 +1416,88 @@ void Jit64::divwx(UGeckoInstruction inst) SetJumpTarget(done); } } + else if (gpr.IsImm(b)) + { + // Constant divisor + const s32 divisor = gpr.SImm32(b); + RCOpArg Ra = gpr.Use(a, RCMode::Read); + RCX64Reg Rd = gpr.Bind(d, RCMode::Write); + RegCache::Realize(Ra, Rd); + + // Handle 0, 1, and -1 explicitly + if (divisor == 0) + { + if (d != a) + MOV(32, Rd, Ra); + SAR(32, Rd, Imm8(31)); + if (inst.OE) + GenerateConstantOverflow(true); + } + else if (divisor == 1) + { + if (d != a) + MOV(32, Rd, Ra); + if (inst.OE) + GenerateConstantOverflow(false); + } + else if (divisor == -1) + { + if (d != a) + MOV(32, Rd, Ra); + + CMP(32, Rd, Imm32(0x80000000)); + const FixupBranch normal = J_CC(CC_NE); + + MOV(32, Rd, Imm32(0xFFFFFFFF)); + if (inst.OE) + GenerateConstantOverflow(true); + const FixupBranch done = J(); + + SetJumpTarget(normal); + NEG(32, Rd); + if (inst.OE) + GenerateConstantOverflow(false); + + SetJumpTarget(done); + } + else + { + // Optimize signed 32-bit integer division by a constant + Magic m = SignedDivisionConstants(divisor); + + MOVSX(64, 32, RSCRATCH, Ra); + + if (divisor > 0 && m.multiplier < 0) + { + IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier)); + SHR(64, Rd, Imm8(32)); + ADD(32, Rd, R(RSCRATCH)); + SHR(32, R(RSCRATCH), Imm8(31)); + SAR(32, Rd, Imm8(m.shift)); + } + else if (divisor < 0 && m.multiplier > 0) + { + IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier)); + SHR(64, R(RSCRATCH), Imm8(32)); + SUB(32, R(RSCRATCH), Rd); + MOV(32, Rd, R(RSCRATCH)); + SHR(32, Rd, Imm8(31)); + SAR(32, R(RSCRATCH), 
Imm8(m.shift)); + } + else + { + IMUL(64, RSCRATCH, R(RSCRATCH), Imm32(m.multiplier)); + MOV(64, Rd, R(RSCRATCH)); + SHR(64, R(RSCRATCH), Imm8(63)); + SAR(64, R(Rd), Imm8(32 + m.shift)); + } + + ADD(32, Rd, R(RSCRATCH)); + + if (inst.OE) + GenerateConstantOverflow(false); + } + } else { RCOpArg Ra = gpr.Use(a, RCMode::Read); From 530475dce8d3c4e94e0737be0dc48757a78bc475 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 4 Mar 2021 21:43:21 +0100 Subject: [PATCH 06/10] Jit64: divwx - Micro-optimize certain divisors When the multiplier is positive (which is the most common case), we can generate slightly better code. - Division by 30307 Before: 49 63 C5 movsxd rax,r13d 48 69 C0 65 6B 32 45 imul rax,rax,45326B65h 4C 8B C0 mov r8,rax 48 C1 E8 3F shr rax,3Fh 49 C1 F8 2D sar r8,2Dh 44 03 C0 add r8d,eax After: 49 63 C5 movsxd rax,r13d 4C 69 C0 65 6B 32 45 imul r8,rax,45326B65h C1 E8 1F shr eax,1Fh 49 C1 F8 2D sar r8,2Dh 44 03 C0 add r8d,eax --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index e8a865c63b..41bd9d6ea7 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1484,6 +1484,12 @@ void Jit64::divwx(UGeckoInstruction inst) SHR(32, Rd, Imm8(31)); SAR(32, R(RSCRATCH), Imm8(m.shift)); } + else if (m.multiplier > 0) + { + IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier)); + SHR(32, R(RSCRATCH), Imm8(31)); + SAR(64, R(Rd), Imm8(32 + m.shift)); + } else { IMUL(64, RSCRATCH, R(RSCRATCH), Imm32(m.multiplier)); From 0637a7ec597d6b96ac9b6274f390bf2ee22807fd Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 4 Mar 2021 22:16:52 +0100 Subject: [PATCH 07/10] Jit64: divwx - Optimize power-of-two divisors Power-of-two divisors can be done more elegantly, so handle them separately. 
- Division by 4 Before: 41 BD 04 00 00 00 mov r13d,4 41 8B C0 mov eax,r8d 45 85 ED test r13d,r13d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FD FF cmp r13d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B E8 mov r13d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FD idiv eax,r13d 44 8B E8 mov r13d,eax done: After: 45 85 C0 test r8d,r8d 45 8D 68 03 lea r13d,[r8+3] 45 0F 49 E8 cmovns r13d,r8d 41 C1 FD 02 sar r13d,2 --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 41bd9d6ea7..e673a52b6d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1460,6 +1460,27 @@ void Jit64::divwx(UGeckoInstruction inst) SetJumpTarget(done); } + else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-divisor)) + { + u32 abs_val = std::abs(divisor); + + X64Reg tmp = RSCRATCH; + if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd) + tmp = Ra.GetSimpleReg(); + else + MOV(32, R(tmp), Ra); + + TEST(32, R(tmp), R(tmp)); + LEA(32, Rd, MDisp(tmp, abs_val - 1)); + CMOVcc(32, Rd, R(tmp), CC_NS); + SAR(32, Rd, Imm8(IntLog2(abs_val))); + + if (divisor < 0) + NEG(32, Rd); + + if (inst.OE) + GenerateConstantOverflow(false); + } else { // Optimize signed 32-bit integer division by a constant From 18650357988c54727bb9eb9f8de864b6b2e528a2 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 4 Mar 2021 22:29:15 +0100 Subject: [PATCH 08/10] Jit64: divwx - Optimize division by 2 ...and let's optimize a divisor of 2 ever so slightly for good measure. I wouldn't have bothered, but most GameCube games seem to hit this on launch. 
- Division by 2 Before: 41 BE 02 00 00 00 mov r14d,2 41 8B C2 mov eax,r10d 45 85 F6 test r14d,r14d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 FE FF cmp r14d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B F0 mov r14d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 FE idiv eax,r14d 44 8B F0 mov r14d,eax done: After: 45 8B F2 mov r14d,r10d 41 C1 EE 1F shr r14d,1Fh 45 03 F2 add r14d,r10d 41 D1 FE sar r14d,1 --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index e673a52b6d..464bcb4521 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1460,6 +1460,25 @@ void Jit64::divwx(UGeckoInstruction inst) SetJumpTarget(done); } + else if (divisor == 2 || divisor == -2) + { + X64Reg tmp = RSCRATCH; + if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd) + tmp = Ra.GetSimpleReg(); + else + MOV(32, R(tmp), Ra); + + MOV(32, Rd, R(tmp)); + SHR(32, Rd, Imm8(31)); + ADD(32, Rd, R(tmp)); + SAR(32, Rd, Imm8(1)); + + if (divisor < 0) + NEG(32, Rd); + + if (inst.OE) + GenerateConstantOverflow(false); + } else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-divisor)) { u32 abs_val = std::abs(divisor); From 83f38388a1657a9572d667ed384ee033768f836d Mon Sep 17 00:00:00 2001 From: Sintendo Date: Thu, 4 Mar 2021 22:45:45 +0100 Subject: [PATCH 09/10] Jit64: divwx - Micro-optimize default case Both the normal path and the overflow path end with the same instruction, so their tails can be merged. 
Before: 41 8B C7 mov eax,r15d 45 85 C0 test r8d,r8d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0E jne normal_path 41 83 F8 FF cmp r8d,0FFFFFFFFh 75 08 jne normal_path overflow: C1 F8 1F sar eax,1Fh 44 8B F0 mov r14d,eax EB 07 jmp done normal_path: 99 cdq 41 F7 F8 idiv eax,r8d 44 8B F0 mov r14d,eax done: After: 41 8B C7 mov eax,r15d 45 85 C0 test r8d,r8d 74 0D je overflow 3D 00 00 00 80 cmp eax,80000000h 75 0B jne normal_path 41 83 F8 FF cmp r8d,0FFFFFFFFh 75 05 jne normal_path overflow: C1 F8 1F sar eax,1Fh EB 04 jmp done normal_path: 99 cdq 41 F7 F8 idiv eax,r8d done: 44 8B F0 mov r14d,eax --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 464bcb4521..1db3836dab 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1566,7 +1566,6 @@ void Jit64::divwx(UGeckoInstruction inst) SetJumpTarget(overflow); SAR(32, eax, Imm8(31)); - MOV(32, Rd, eax); if (inst.OE) { GenerateConstantOverflow(true); @@ -1578,12 +1577,13 @@ void Jit64::divwx(UGeckoInstruction inst) CDQ(); IDIV(32, Rb); - MOV(32, Rd, eax); if (inst.OE) { GenerateConstantOverflow(false); } + SetJumpTarget(done); + MOV(32, Rd, eax); } if (inst.Rc) ComputeRC(d); From defe7162f557e6b43919f074e211d57875600e47 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sat, 6 Mar 2021 22:28:40 +0100 Subject: [PATCH 10/10] Jit64: divwx - Simplify divisor == -1 case Suggested by @MerryMage. Thanks! 
Co-authored-by: merry --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 1db3836dab..c6b6ba6ddc 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1445,8 +1445,8 @@ void Jit64::divwx(UGeckoInstruction inst) if (d != a) MOV(32, Rd, Ra); - CMP(32, Rd, Imm32(0x80000000)); - const FixupBranch normal = J_CC(CC_NE); + NEG(32, Rd); + const FixupBranch normal = J_CC(CC_NO); MOV(32, Rd, Imm32(0xFFFFFFFF)); if (inst.OE) @@ -1454,7 +1454,6 @@ void Jit64::divwx(UGeckoInstruction inst) const FixupBranch done = J(); SetJumpTarget(normal); - NEG(32, Rd); if (inst.OE) GenerateConstantOverflow(false);