Merge pull request #3621 from SparkiDev/sp_mac_arm64

SP arm64 MAC: stop the non-constant-time (non-ct) modular inverse from using x29 (the AArch64 frame pointer)
This commit is contained in:
toddouska
2021-01-18 15:19:46 -08:00
committed by GitHub

View File

@@ -37358,275 +37358,275 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a,
const sp_digit* m) const sp_digit* m)
{ {
__asm__ __volatile__ ( __asm__ __volatile__ (
"ldr x20, [%[m], 0]\n\t" "ldr x20, [%[m], 0]\n\t"
"ldr x21, [%[m], 8]\n\t" "ldr x21, [%[m], 8]\n\t"
"ldr x22, [%[m], 16]\n\t" "ldr x22, [%[m], 16]\n\t"
"ldr x23, [%[m], 24]\n\t" "ldr %[m], [%[m], 24]\n\t"
"ldr x7, [%[a], 0]\n\t" "ldr x7, [%[a], 0]\n\t"
"ldr x8, [%[a], 8]\n\t" "ldr x8, [%[a], 8]\n\t"
"ldr x9, [%[a], 16]\n\t" "ldr x9, [%[a], 16]\n\t"
"ldr x10, [%[a], 24]\n\t" "ldr x10, [%[a], 24]\n\t"
"mov x3, x20\n\t" "mov x3, x20\n\t"
"mov x4, x21\n\t" "mov x4, x21\n\t"
"mov x5, x22\n\t" "mov x5, x22\n\t"
"mov x6, x23\n\t" "mov x6, %[m]\n\t"
"mov x11, xzr\n\t" "mov x11, xzr\n\t"
"mov x12, xzr\n\t" "mov x12, xzr\n\t"
"mov x13, xzr\n\t" "mov x13, xzr\n\t"
"mov x14, xzr\n\t" "mov x14, xzr\n\t"
"mov x15, 1\n\t" "mov x15, 1\n\t"
"mov x16, xzr\n\t" "mov x16, xzr\n\t"
"mov x17, xzr\n\t" "mov x17, xzr\n\t"
"mov x19, xzr\n\t" "mov x19, xzr\n\t"
"cmp x6, 0\n\t" "cmp x6, 0\n\t"
"b.eq 10f\n\t" "b.eq 10f\n\t"
"mov x26, 256\n\t" "mov x25, 256\n\t"
"clz x24, x6\n\t" "clz x23, x6\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 13f\n\t" "b 13f\n\t"
"\n10:\n\t" "\n10:\n\t"
"cmp x5, 0\n\t" "cmp x5, 0\n\t"
"b.eq 11f\n\t" "b.eq 11f\n\t"
"mov x26, 192\n\t" "mov x25, 192\n\t"
"clz x24, x5\n\t" "clz x23, x5\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 13f\n\t" "b 13f\n\t"
"\n11:\n\t" "\n11:\n\t"
"cmp x4, 0\n\t" "cmp x4, 0\n\t"
"b.eq 12f\n\t" "b.eq 12f\n\t"
"mov x26, 128\n\t" "mov x25, 128\n\t"
"clz x24, x4\n\t" "clz x23, x4\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 13f\n\t" "b 13f\n\t"
"\n12:\n\t" "\n12:\n\t"
"mov x26, 64\n\t" "mov x25, 64\n\t"
"clz x24, x3\n\t" "clz x23, x3\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"\n13:\n\t" "\n13:\n\t"
"cmp x10, 0\n\t" "cmp x10, 0\n\t"
"b.eq 20f\n\t" "b.eq 20f\n\t"
"mov x26, 256\n\t" "mov x25, 256\n\t"
"clz x25, x10\n\t" "clz x24, x10\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 23f\n\t" "b 23f\n\t"
"\n20:\n\t" "\n20:\n\t"
"cmp x9, 0\n\t" "cmp x9, 0\n\t"
"b.eq 21f\n\t" "b.eq 21f\n\t"
"mov x26, 192\n\t" "mov x25, 192\n\t"
"clz x25, x9\n\t" "clz x24, x9\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 23f\n\t" "b 23f\n\t"
"\n21:\n\t" "\n21:\n\t"
"cmp x8, 0\n\t" "cmp x8, 0\n\t"
"b.eq 22f\n\t" "b.eq 22f\n\t"
"mov x26, 128\n\t" "mov x25, 128\n\t"
"clz x25, x8\n\t" "clz x24, x8\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 23f\n\t" "b 23f\n\t"
"\n22:\n\t" "\n22:\n\t"
"mov x26, 64\n\t" "mov x25, 64\n\t"
"clz x25, x7\n\t" "clz x24, x7\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"\n23:\n\t" "\n23:\n\t"
"tst x7, 1\n\t" "tst x7, 1\n\t"
"b.ne 90f\n\t" "b.ne 90f\n\t"
"\n1:\n\t" "\n1:\n\t"
"lsr x7, x7, 1\n\t" "lsr x7, x7, 1\n\t"
"lsr x27, x8, 1\n\t" "lsr x26, x8, 1\n\t"
"lsr x28, x9, 1\n\t" "lsr x27, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t" "orr x7, x7, x8, lsl 63\n\t"
"orr x8, x27, x9, lsl 63\n\t" "orr x8, x26, x9, lsl 63\n\t"
"orr x9, x28, x10, lsl 63\n\t" "orr x9, x27, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t" "lsr x10, x10, 1\n\t"
"sub x25, x25, 1\n\t" "sub x24, x24, 1\n\t"
"ands x26, x15, 1\n\t" "ands x25, x15, 1\n\t"
"b.eq 2f\n\t" "b.eq 2f\n\t"
"adds x15, x15, x20\n\t" "adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t" "adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t" "adcs x17, x17, x22\n\t"
"adcs x19, x19, x23\n\t" "adcs x19, x19, %[m]\n\t"
"cset x26, cs\n\t" "cset x25, cs\n\t"
"\n2:\n\t" "\n2:\n\t"
"lsr x15, x15, 1\n\t" "lsr x15, x15, 1\n\t"
"lsr x27, x16, 1\n\t" "lsr x26, x16, 1\n\t"
"lsr x28, x17, 1\n\t" "lsr x27, x17, 1\n\t"
"lsr x29, x19, 1\n\t" "lsr x28, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t" "orr x15, x15, x16, lsl 63\n\t"
"orr x16, x27, x17, lsl 63\n\t" "orr x16, x26, x17, lsl 63\n\t"
"orr x17, x28, x19, lsl 63\n\t" "orr x17, x27, x19, lsl 63\n\t"
"orr x19, x29, x26, lsl 63\n\t" "orr x19, x28, x25, lsl 63\n\t"
"tst x7, 1\n\t" "tst x7, 1\n\t"
"b.eq 1b\n\t" "b.eq 1b\n\t"
"\n90:\n\t" "\n90:\n\t"
"cmp x24, 1\n\t" "cmp x23, 1\n\t"
"b.eq 100f\n\t" "b.eq 100f\n\t"
"cmp x25, 1\n\t" "cmp x24, 1\n\t"
"b.eq 101f\n\t" "b.eq 101f\n\t"
"cmp x24, x25\n\t" "cmp x23, x24\n\t"
"b.hi 91f\n\t" "b.hi 91f\n\t"
"b.cc 92f\n\t" "b.cc 92f\n\t"
"cmp x6, x10\n\t" "cmp x6, x10\n\t"
"b.hi 91f\n\t" "b.hi 91f\n\t"
"b.cc 92f\n\t" "b.cc 92f\n\t"
"cmp x5, x9\n\t" "cmp x5, x9\n\t"
"b.hi 91f\n\t" "b.hi 91f\n\t"
"b.cc 92f\n\t" "b.cc 92f\n\t"
"cmp x4, x8\n\t" "cmp x4, x8\n\t"
"b.hi 91f\n\t" "b.hi 91f\n\t"
"b.cc 92f\n\t" "b.cc 92f\n\t"
"cmp x3, x7\n\t" "cmp x3, x7\n\t"
"b.cc 92f\n\t" "b.cc 92f\n\t"
"\n91:\n\t" "\n91:\n\t"
"subs x3, x3, x7\n\t" "subs x3, x3, x7\n\t"
"sbcs x4, x4, x8\n\t" "sbcs x4, x4, x8\n\t"
"sbcs x5, x5, x9\n\t" "sbcs x5, x5, x9\n\t"
"sbc x6, x6, x10\n\t" "sbc x6, x6, x10\n\t"
"subs x11, x11, x15\n\t" "subs x11, x11, x15\n\t"
"sbcs x12, x12, x16\n\t" "sbcs x12, x12, x16\n\t"
"sbcs x13, x13, x17\n\t" "sbcs x13, x13, x17\n\t"
"sbcs x14, x14, x19\n\t" "sbcs x14, x14, x19\n\t"
"b.cs 30f\n\t" "b.cs 30f\n\t"
"adds x11, x11, x20\n\t" "adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t" "adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t" "adcs x13, x13, x22\n\t"
"adc x14, x14, x23\n\t" "adc x14, x14, %[m]\n\t"
"\n30:\n\t" "\n30:\n\t"
"cmp x6, 0\n\t" "cmp x6, 0\n\t"
"b.eq 40f\n\t" "b.eq 40f\n\t"
"mov x26, 256\n\t" "mov x25, 256\n\t"
"clz x24, x6\n\t" "clz x23, x6\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 43f\n\t" "b 43f\n\t"
"\n40:\n\t" "\n40:\n\t"
"cmp x5, 0\n\t" "cmp x5, 0\n\t"
"b.eq 41f\n\t" "b.eq 41f\n\t"
"mov x26, 192\n\t" "mov x25, 192\n\t"
"clz x24, x5\n\t" "clz x23, x5\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 43f\n\t" "b 43f\n\t"
"\n41:\n\t" "\n41:\n\t"
"cmp x4, 0\n\t" "cmp x4, 0\n\t"
"b.eq 42f\n\t" "b.eq 42f\n\t"
"mov x26, 128\n\t" "mov x25, 128\n\t"
"clz x24, x4\n\t" "clz x23, x4\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"b 43f\n\t" "b 43f\n\t"
"\n42:\n\t" "\n42:\n\t"
"mov x26, 64\n\t" "mov x25, 64\n\t"
"clz x24, x3\n\t" "clz x23, x3\n\t"
"sub x24, x26, x24\n\t" "sub x23, x25, x23\n\t"
"\n43:\n\t" "\n43:\n\t"
"\n50:\n\t" "\n50:\n\t"
"lsr x3, x3, 1\n\t" "lsr x3, x3, 1\n\t"
"lsr x27, x4, 1\n\t" "lsr x26, x4, 1\n\t"
"lsr x28, x5, 1\n\t" "lsr x27, x5, 1\n\t"
"orr x3, x3, x4, lsl 63\n\t" "orr x3, x3, x4, lsl 63\n\t"
"orr x4, x27, x5, lsl 63\n\t" "orr x4, x26, x5, lsl 63\n\t"
"orr x5, x28, x6, lsl 63\n\t" "orr x5, x27, x6, lsl 63\n\t"
"lsr x6, x6, 1\n\t" "lsr x6, x6, 1\n\t"
"sub x24, x24, 1\n\t" "sub x23, x23, 1\n\t"
"ands x26, x11, 1\n\t" "ands x25, x11, 1\n\t"
"b.eq 51f\n\t" "b.eq 51f\n\t"
"adds x11, x11, x20\n\t" "adds x11, x11, x20\n\t"
"adcs x12, x12, x21\n\t" "adcs x12, x12, x21\n\t"
"adcs x13, x13, x22\n\t" "adcs x13, x13, x22\n\t"
"adcs x14, x14, x23\n\t" "adcs x14, x14, %[m]\n\t"
"cset x26, cs\n\t" "cset x25, cs\n\t"
"\n51:\n\t" "\n51:\n\t"
"lsr x11, x11, 1\n\t" "lsr x11, x11, 1\n\t"
"lsr x27, x12, 1\n\t" "lsr x26, x12, 1\n\t"
"lsr x28, x13, 1\n\t" "lsr x27, x13, 1\n\t"
"lsr x29, x14, 1\n\t" "lsr x28, x14, 1\n\t"
"orr x11, x11, x12, lsl 63\n\t" "orr x11, x11, x12, lsl 63\n\t"
"orr x12, x27, x13, lsl 63\n\t" "orr x12, x26, x13, lsl 63\n\t"
"orr x13, x28, x14, lsl 63\n\t" "orr x13, x27, x14, lsl 63\n\t"
"orr x14, x29, x26, lsl 63\n\t" "orr x14, x28, x25, lsl 63\n\t"
"tst x3, 1\n\t" "tst x3, 1\n\t"
"b.eq 50b\n\t" "b.eq 50b\n\t"
"b 90b\n\t" "b 90b\n\t"
"\n92:\n\t" "\n92:\n\t"
"subs x7, x7, x3\n\t" "subs x7, x7, x3\n\t"
"sbcs x8, x8, x4\n\t" "sbcs x8, x8, x4\n\t"
"sbcs x9, x9, x5\n\t" "sbcs x9, x9, x5\n\t"
"sbc x10, x10, x6\n\t" "sbc x10, x10, x6\n\t"
"subs x15, x15, x11\n\t" "subs x15, x15, x11\n\t"
"sbcs x16, x16, x12\n\t" "sbcs x16, x16, x12\n\t"
"sbcs x17, x17, x13\n\t" "sbcs x17, x17, x13\n\t"
"sbcs x19, x19, x14\n\t" "sbcs x19, x19, x14\n\t"
"b.cs 60f\n\t" "b.cs 60f\n\t"
"adds x15, x15, x20\n\t" "adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t" "adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t" "adcs x17, x17, x22\n\t"
"adc x19, x19, x23\n\t" "adc x19, x19, %[m]\n\t"
"\n60:\n\t" "\n60:\n\t"
"cmp x10, 0\n\t" "cmp x10, 0\n\t"
"b.eq 70f\n\t" "b.eq 70f\n\t"
"mov x26, 256\n\t" "mov x25, 256\n\t"
"clz x25, x10\n\t" "clz x24, x10\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 73f\n\t" "b 73f\n\t"
"\n70:\n\t" "\n70:\n\t"
"cmp x9, 0\n\t" "cmp x9, 0\n\t"
"b.eq 71f\n\t" "b.eq 71f\n\t"
"mov x26, 192\n\t" "mov x25, 192\n\t"
"clz x25, x9\n\t" "clz x24, x9\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 73f\n\t" "b 73f\n\t"
"\n71:\n\t" "\n71:\n\t"
"cmp x8, 0\n\t" "cmp x8, 0\n\t"
"b.eq 72f\n\t" "b.eq 72f\n\t"
"mov x26, 128\n\t" "mov x25, 128\n\t"
"clz x25, x8\n\t" "clz x24, x8\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"b 73f\n\t" "b 73f\n\t"
"\n72:\n\t" "\n72:\n\t"
"mov x26, 64\n\t" "mov x25, 64\n\t"
"clz x25, x7\n\t" "clz x24, x7\n\t"
"sub x25, x26, x25\n\t" "sub x24, x25, x24\n\t"
"\n73:\n\t" "\n73:\n\t"
"\n80:\n\t" "\n80:\n\t"
"lsr x7, x7, 1\n\t" "lsr x7, x7, 1\n\t"
"lsr x27, x8, 1\n\t" "lsr x26, x8, 1\n\t"
"lsr x28, x9, 1\n\t" "lsr x27, x9, 1\n\t"
"orr x7, x7, x8, lsl 63\n\t" "orr x7, x7, x8, lsl 63\n\t"
"orr x8, x27, x9, lsl 63\n\t" "orr x8, x26, x9, lsl 63\n\t"
"orr x9, x28, x10, lsl 63\n\t" "orr x9, x27, x10, lsl 63\n\t"
"lsr x10, x10, 1\n\t" "lsr x10, x10, 1\n\t"
"sub x25, x25, 1\n\t" "sub x24, x24, 1\n\t"
"ands x26, x15, 1\n\t" "ands x25, x15, 1\n\t"
"b.eq 81f\n\t" "b.eq 81f\n\t"
"adds x15, x15, x20\n\t" "adds x15, x15, x20\n\t"
"adcs x16, x16, x21\n\t" "adcs x16, x16, x21\n\t"
"adcs x17, x17, x22\n\t" "adcs x17, x17, x22\n\t"
"adcs x19, x19, x23\n\t" "adcs x19, x19, %[m]\n\t"
"cset x26, cs\n\t" "cset x25, cs\n\t"
"\n81:\n\t" "\n81:\n\t"
"lsr x15, x15, 1\n\t" "lsr x15, x15, 1\n\t"
"lsr x27, x16, 1\n\t" "lsr x26, x16, 1\n\t"
"lsr x28, x17, 1\n\t" "lsr x27, x17, 1\n\t"
"lsr x29, x19, 1\n\t" "lsr x28, x19, 1\n\t"
"orr x15, x15, x16, lsl 63\n\t" "orr x15, x15, x16, lsl 63\n\t"
"orr x16, x27, x17, lsl 63\n\t" "orr x16, x26, x17, lsl 63\n\t"
"orr x17, x28, x19, lsl 63\n\t" "orr x17, x27, x19, lsl 63\n\t"
"orr x19, x29, x26, lsl 63\n\t" "orr x19, x28, x25, lsl 63\n\t"
"tst x7, 1\n\t" "tst x7, 1\n\t"
"b.eq 80b\n\t" "b.eq 80b\n\t"
"b 90b\n\t" "b 90b\n\t"
"\n100:\n\t" "\n100:\n\t"
"str x11, [%[r], 0]\n\t" "str x11, [%[r], 0]\n\t"
"str x12, [%[r], 8]\n\t" "str x12, [%[r], 8]\n\t"
"str x13, [%[r], 16]\n\t" "str x13, [%[r], 16]\n\t"
"str x14, [%[r], 24]\n\t" "str x14, [%[r], 24]\n\t"
"b 102f\n\t" "b 102f\n\t"
"\n101:\n\t" "\n101:\n\t"
"str x15, [%[r], 0]\n\t" "str x15, [%[r], 0]\n\t"
"str x16, [%[r], 8]\n\t" "str x16, [%[r], 8]\n\t"
"str x17, [%[r], 16]\n\t" "str x17, [%[r], 16]\n\t"
"str x19, [%[r], 24]\n\t" "str x19, [%[r], 24]\n\t"
"\n102:\n\t" "\n102:\n\t"
: : [m] "+r" (m)
: [r] "r" (r), [a] "r" (a), [m] "r" (m) : [r] "r" (r), [a] "r" (a)
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29" : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
); );
return MP_OKAY; return MP_OKAY;