Merge pull request #3621 from SparkiDev/sp_mac_arm64

SP arm64 MAC: stop non-ct mod inv from using x29
This commit is contained in:
toddouska
2021-01-18 15:19:46 -08:00
committed by GitHub

View File

@ -37358,275 +37358,275 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a,
const sp_digit* m)
{
__asm__ __volatile__ (
"ldr x20, [%[m], 0]\n\t"
"ldr x21, [%[m], 8]\n\t"
"ldr x22, [%[m], 16]\n\t"
"ldr x23, [%[m], 24]\n\t"
"ldr x7, [%[a], 0]\n\t"
"ldr x8, [%[a], 8]\n\t"
"ldr x9, [%[a], 16]\n\t"
"ldr x10, [%[a], 24]\n\t"
"mov x3, x20\n\t"
"mov x4, x21\n\t"
"mov x5, x22\n\t"
"mov x6, x23\n\t"
"mov x11, xzr\n\t"
"mov x12, xzr\n\t"
"mov x13, xzr\n\t"
"mov x14, xzr\n\t"
"mov x15, 1\n\t"
"mov x16, xzr\n\t"
"mov x17, xzr\n\t"
"mov x19, xzr\n\t"
"cmp x6, 0\n\t"
"b.eq 10f\n\t"
"mov x26, 256\n\t"
"clz x24, x6\n\t"
"sub x24, x26, x24\n\t"
"b 13f\n\t"
"ldr x20, [%[m], 0]\n\t"
"ldr x21, [%[m], 8]\n\t"
"ldr x22, [%[m], 16]\n\t"
"ldr %[m], [%[m], 24]\n\t"
"ldr x7, [%[a], 0]\n\t"
"ldr x8, [%[a], 8]\n\t"
"ldr x9, [%[a], 16]\n\t"
"ldr x10, [%[a], 24]\n\t"
"mov x3, x20\n\t"
"mov x4, x21\n\t"
"mov x5, x22\n\t"
"mov x6, %[m]\n\t"
"mov x11, xzr\n\t"
"mov x12, xzr\n\t"
"mov x13, xzr\n\t"
"mov x14, xzr\n\t"
"mov x15, 1\n\t"
"mov x16, xzr\n\t"
"mov x17, xzr\n\t"
"mov x19, xzr\n\t"
"cmp x6, 0\n\t"
"b.eq 10f\n\t"
"mov x25, 256\n\t"
"clz x23, x6\n\t"
"sub x23, x25, x23\n\t"
"b 13f\n\t"
"\n10:\n\t"
"cmp x5, 0\n\t"
"b.eq 11f\n\t"
"mov x26, 192\n\t"
"clz x24, x5\n\t"
"sub x24, x26, x24\n\t"
"b 13f\n\t"
"cmp x5, 0\n\t"
"b.eq 11f\n\t"
"mov x25, 192\n\t"
"clz x23, x5\n\t"
"sub x23, x25, x23\n\t"
"b 13f\n\t"
"\n11:\n\t"
"cmp x4, 0\n\t"
"b.eq 12f\n\t"
"mov x26, 128\n\t"
"clz x24, x4\n\t"
"sub x24, x26, x24\n\t"
"b 13f\n\t"
"cmp x4, 0\n\t"
"b.eq 12f\n\t"
"mov x25, 128\n\t"
"clz x23, x4\n\t"
"sub x23, x25, x23\n\t"
"b 13f\n\t"
"\n12:\n\t"
"mov x26, 64\n\t"
"clz x24, x3\n\t"
"sub x24, x26, x24\n\t"
"mov x25, 64\n\t"
"clz x23, x3\n\t"
"sub x23, x25, x23\n\t"
"\n13:\n\t"
"cmp x10, 0\n\t"
"b.eq 20f\n\t"
"mov x26, 256\n\t"
"clz x25, x10\n\t"
"sub x25, x26, x25\n\t"
"b 23f\n\t"
"cmp x10, 0\n\t"
"b.eq 20f\n\t"
"mov x25, 256\n\t"
"clz x24, x10\n\t"
"sub x24, x25, x24\n\t"
"b 23f\n\t"
"\n20:\n\t"
"cmp x9, 0\n\t"
"b.eq 21f\n\t"
"mov x26, 192\n\t"
"clz x25, x9\n\t"
"sub x25, x26, x25\n\t"
"b 23f\n\t"
"cmp x9, 0\n\t"
"b.eq 21f\n\t"
"mov x25, 192\n\t"
"clz x24, x9\n\t"
"sub x24, x25, x24\n\t"
"b 23f\n\t"
"\n21:\n\t"
"cmp x8, 0\n\t"
"b.eq 22f\n\t"
"mov x26, 128\n\t"
"clz x25, x8\n\t"
"sub x25, x26, x25\n\t"
"b 23f\n\t"
"cmp x8, 0\n\t"
"b.eq 22f\n\t"
"mov x25, 128\n\t"
"clz x24, x8\n\t"
"sub x24, x25, x24\n\t"
"b 23f\n\t"
"\n22:\n\t"
"mov x26, 64\n\t"
"clz x25, x7\n\t"
"sub x25, x26, x25\n\t"
"mov x25, 64\n\t"
"clz x24, x7\n\t"
"sub x24, x25, x24\n\t"
"\n23:\n\t"
"tst x7, 1\n\t"
"b.ne 90f\n\t"
"tst x7, 1\n\t"
"b.ne 90f\n\t"
"\n1:\n\t"
"lsr x7, x7, 1\n\t"
"lsr x27, x8, 1\n\t"
"lsr x28, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t"
"orr x8, x27, x9, lsl 63\n\t"
"orr x9, x28, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t"
"sub x25, x25, 1\n\t"
"ands x26, x15, 1\n\t"
"b.eq 2f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adcs x19, x19, x23\n\t"
"cset x26, cs\n\t"
"lsr x7, x7, 1\n\t"
"lsr x26, x8, 1\n\t"
"lsr x27, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t"
"orr x8, x26, x9, lsl 63\n\t"
"orr x9, x27, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t"
"sub x24, x24, 1\n\t"
"ands x25, x15, 1\n\t"
"b.eq 2f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adcs x19, x19, %[m]\n\t"
"cset x25, cs\n\t"
"\n2:\n\t"
"lsr x15, x15, 1\n\t"
"lsr x27, x16, 1\n\t"
"lsr x28, x17, 1\n\t"
"lsr x29, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t"
"orr x16, x27, x17, lsl 63\n\t"
"orr x17, x28, x19, lsl 63\n\t"
"orr x19, x29, x26, lsl 63\n\t"
"tst x7, 1\n\t"
"b.eq 1b\n\t"
"lsr x15, x15, 1\n\t"
"lsr x26, x16, 1\n\t"
"lsr x27, x17, 1\n\t"
"lsr x28, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t"
"orr x16, x26, x17, lsl 63\n\t"
"orr x17, x27, x19, lsl 63\n\t"
"orr x19, x28, x25, lsl 63\n\t"
"tst x7, 1\n\t"
"b.eq 1b\n\t"
"\n90:\n\t"
"cmp x24, 1\n\t"
"b.eq 100f\n\t"
"cmp x25, 1\n\t"
"b.eq 101f\n\t"
"cmp x24, x25\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x6, x10\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x5, x9\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x4, x8\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x3, x7\n\t"
"b.cc 92f\n\t"
"cmp x23, 1\n\t"
"b.eq 100f\n\t"
"cmp x24, 1\n\t"
"b.eq 101f\n\t"
"cmp x23, x24\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x6, x10\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x5, x9\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x4, x8\n\t"
"b.hi 91f\n\t"
"b.cc 92f\n\t"
"cmp x3, x7\n\t"
"b.cc 92f\n\t"
"\n91:\n\t"
"subs x3, x3, x7\n\t"
"sbcs x4, x4, x8\n\t"
"sbcs x5, x5, x9\n\t"
"sbc x6, x6, x10\n\t"
"subs x11, x11, x15\n\t"
"sbcs x12, x12, x16\n\t"
"sbcs x13, x13, x17\n\t"
"sbcs x14, x14, x19\n\t"
"b.cs 30f\n\t"
"adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t"
"adc x14, x14, x23\n\t"
"subs x3, x3, x7\n\t"
"sbcs x4, x4, x8\n\t"
"sbcs x5, x5, x9\n\t"
"sbc x6, x6, x10\n\t"
"subs x11, x11, x15\n\t"
"sbcs x12, x12, x16\n\t"
"sbcs x13, x13, x17\n\t"
"sbcs x14, x14, x19\n\t"
"b.cs 30f\n\t"
"adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t"
"adc x14, x14, %[m]\n\t"
"\n30:\n\t"
"cmp x6, 0\n\t"
"b.eq 40f\n\t"
"mov x26, 256\n\t"
"clz x24, x6\n\t"
"sub x24, x26, x24\n\t"
"b 43f\n\t"
"cmp x6, 0\n\t"
"b.eq 40f\n\t"
"mov x25, 256\n\t"
"clz x23, x6\n\t"
"sub x23, x25, x23\n\t"
"b 43f\n\t"
"\n40:\n\t"
"cmp x5, 0\n\t"
"b.eq 41f\n\t"
"mov x26, 192\n\t"
"clz x24, x5\n\t"
"sub x24, x26, x24\n\t"
"b 43f\n\t"
"cmp x5, 0\n\t"
"b.eq 41f\n\t"
"mov x25, 192\n\t"
"clz x23, x5\n\t"
"sub x23, x25, x23\n\t"
"b 43f\n\t"
"\n41:\n\t"
"cmp x4, 0\n\t"
"b.eq 42f\n\t"
"mov x26, 128\n\t"
"clz x24, x4\n\t"
"sub x24, x26, x24\n\t"
"b 43f\n\t"
"cmp x4, 0\n\t"
"b.eq 42f\n\t"
"mov x25, 128\n\t"
"clz x23, x4\n\t"
"sub x23, x25, x23\n\t"
"b 43f\n\t"
"\n42:\n\t"
"mov x26, 64\n\t"
"clz x24, x3\n\t"
"sub x24, x26, x24\n\t"
"mov x25, 64\n\t"
"clz x23, x3\n\t"
"sub x23, x25, x23\n\t"
"\n43:\n\t"
"\n50:\n\t"
"lsr x3, x3, 1\n\t"
"lsr x27, x4, 1\n\t"
"lsr x28, x5, 1\n\t"
"orr x3, x3, x4, lsl 63\n\t"
"orr x4, x27, x5, lsl 63\n\t"
"orr x5, x28, x6, lsl 63\n\t"
"lsr x6, x6, 1\n\t"
"sub x24, x24, 1\n\t"
"ands x26, x11, 1\n\t"
"b.eq 51f\n\t"
"adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t"
"adcs x14, x14, x23\n\t"
"cset x26, cs\n\t"
"lsr x3, x3, 1\n\t"
"lsr x26, x4, 1\n\t"
"lsr x27, x5, 1\n\t"
"orr x3, x3, x4, lsl 63\n\t"
"orr x4, x26, x5, lsl 63\n\t"
"orr x5, x27, x6, lsl 63\n\t"
"lsr x6, x6, 1\n\t"
"sub x23, x23, 1\n\t"
"ands x25, x11, 1\n\t"
"b.eq 51f\n\t"
"adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t"
"adcs x14, x14, %[m]\n\t"
"cset x25, cs\n\t"
"\n51:\n\t"
"lsr x11, x11, 1\n\t"
"lsr x27, x12, 1\n\t"
"lsr x28, x13, 1\n\t"
"lsr x29, x14, 1\n\t"
"orr x11, x11, x12, lsl 63\n\t"
"orr x12, x27, x13, lsl 63\n\t"
"orr x13, x28, x14, lsl 63\n\t"
"orr x14, x29, x26, lsl 63\n\t"
"tst x3, 1\n\t"
"b.eq 50b\n\t"
"b 90b\n\t"
"lsr x11, x11, 1\n\t"
"lsr x26, x12, 1\n\t"
"lsr x27, x13, 1\n\t"
"lsr x28, x14, 1\n\t"
"orr x11, x11, x12, lsl 63\n\t"
"orr x12, x26, x13, lsl 63\n\t"
"orr x13, x27, x14, lsl 63\n\t"
"orr x14, x28, x25, lsl 63\n\t"
"tst x3, 1\n\t"
"b.eq 50b\n\t"
"b 90b\n\t"
"\n92:\n\t"
"subs x7, x7, x3\n\t"
"sbcs x8, x8, x4\n\t"
"sbcs x9, x9, x5\n\t"
"sbc x10, x10, x6\n\t"
"subs x15, x15, x11\n\t"
"sbcs x16, x16, x12\n\t"
"sbcs x17, x17, x13\n\t"
"sbcs x19, x19, x14\n\t"
"b.cs 60f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adc x19, x19, x23\n\t"
"subs x7, x7, x3\n\t"
"sbcs x8, x8, x4\n\t"
"sbcs x9, x9, x5\n\t"
"sbc x10, x10, x6\n\t"
"subs x15, x15, x11\n\t"
"sbcs x16, x16, x12\n\t"
"sbcs x17, x17, x13\n\t"
"sbcs x19, x19, x14\n\t"
"b.cs 60f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adc x19, x19, %[m]\n\t"
"\n60:\n\t"
"cmp x10, 0\n\t"
"b.eq 70f\n\t"
"mov x26, 256\n\t"
"clz x25, x10\n\t"
"sub x25, x26, x25\n\t"
"b 73f\n\t"
"cmp x10, 0\n\t"
"b.eq 70f\n\t"
"mov x25, 256\n\t"
"clz x24, x10\n\t"
"sub x24, x25, x24\n\t"
"b 73f\n\t"
"\n70:\n\t"
"cmp x9, 0\n\t"
"b.eq 71f\n\t"
"mov x26, 192\n\t"
"clz x25, x9\n\t"
"sub x25, x26, x25\n\t"
"b 73f\n\t"
"cmp x9, 0\n\t"
"b.eq 71f\n\t"
"mov x25, 192\n\t"
"clz x24, x9\n\t"
"sub x24, x25, x24\n\t"
"b 73f\n\t"
"\n71:\n\t"
"cmp x8, 0\n\t"
"b.eq 72f\n\t"
"mov x26, 128\n\t"
"clz x25, x8\n\t"
"sub x25, x26, x25\n\t"
"b 73f\n\t"
"cmp x8, 0\n\t"
"b.eq 72f\n\t"
"mov x25, 128\n\t"
"clz x24, x8\n\t"
"sub x24, x25, x24\n\t"
"b 73f\n\t"
"\n72:\n\t"
"mov x26, 64\n\t"
"clz x25, x7\n\t"
"sub x25, x26, x25\n\t"
"mov x25, 64\n\t"
"clz x24, x7\n\t"
"sub x24, x25, x24\n\t"
"\n73:\n\t"
"\n80:\n\t"
"lsr x7, x7, 1\n\t"
"lsr x27, x8, 1\n\t"
"lsr x28, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t"
"orr x8, x27, x9, lsl 63\n\t"
"orr x9, x28, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t"
"sub x25, x25, 1\n\t"
"ands x26, x15, 1\n\t"
"b.eq 81f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adcs x19, x19, x23\n\t"
"cset x26, cs\n\t"
"lsr x7, x7, 1\n\t"
"lsr x26, x8, 1\n\t"
"lsr x27, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t"
"orr x8, x26, x9, lsl 63\n\t"
"orr x9, x27, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t"
"sub x24, x24, 1\n\t"
"ands x25, x15, 1\n\t"
"b.eq 81f\n\t"
"adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t"
"adcs x19, x19, %[m]\n\t"
"cset x25, cs\n\t"
"\n81:\n\t"
"lsr x15, x15, 1\n\t"
"lsr x27, x16, 1\n\t"
"lsr x28, x17, 1\n\t"
"lsr x29, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t"
"orr x16, x27, x17, lsl 63\n\t"
"orr x17, x28, x19, lsl 63\n\t"
"orr x19, x29, x26, lsl 63\n\t"
"tst x7, 1\n\t"
"b.eq 80b\n\t"
"b 90b\n\t"
"lsr x15, x15, 1\n\t"
"lsr x26, x16, 1\n\t"
"lsr x27, x17, 1\n\t"
"lsr x28, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t"
"orr x16, x26, x17, lsl 63\n\t"
"orr x17, x27, x19, lsl 63\n\t"
"orr x19, x28, x25, lsl 63\n\t"
"tst x7, 1\n\t"
"b.eq 80b\n\t"
"b 90b\n\t"
"\n100:\n\t"
"str x11, [%[r], 0]\n\t"
"str x12, [%[r], 8]\n\t"
"str x13, [%[r], 16]\n\t"
"str x14, [%[r], 24]\n\t"
"b 102f\n\t"
"str x11, [%[r], 0]\n\t"
"str x12, [%[r], 8]\n\t"
"str x13, [%[r], 16]\n\t"
"str x14, [%[r], 24]\n\t"
"b 102f\n\t"
"\n101:\n\t"
"str x15, [%[r], 0]\n\t"
"str x16, [%[r], 8]\n\t"
"str x17, [%[r], 16]\n\t"
"str x19, [%[r], 24]\n\t"
"str x15, [%[r], 0]\n\t"
"str x16, [%[r], 8]\n\t"
"str x17, [%[r], 16]\n\t"
"str x19, [%[r], 24]\n\t"
"\n102:\n\t"
:
: [r] "r" (r), [a] "r" (a), [m] "r" (m)
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29"
: [m] "+r" (m)
: [r] "r" (r), [a] "r" (a)
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
return MP_OKAY;