diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index d5136d17a..24560941b 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -37358,275 +37358,275 @@ static int sp_256_mod_inv_4(sp_digit* r, const sp_digit* a, const sp_digit* m) { __asm__ __volatile__ ( - "ldr x20, [%[m], 0]\n\t" - "ldr x21, [%[m], 8]\n\t" - "ldr x22, [%[m], 16]\n\t" - "ldr x23, [%[m], 24]\n\t" - "ldr x7, [%[a], 0]\n\t" - "ldr x8, [%[a], 8]\n\t" - "ldr x9, [%[a], 16]\n\t" - "ldr x10, [%[a], 24]\n\t" - "mov x3, x20\n\t" - "mov x4, x21\n\t" - "mov x5, x22\n\t" - "mov x6, x23\n\t" - "mov x11, xzr\n\t" - "mov x12, xzr\n\t" - "mov x13, xzr\n\t" - "mov x14, xzr\n\t" - "mov x15, 1\n\t" - "mov x16, xzr\n\t" - "mov x17, xzr\n\t" - "mov x19, xzr\n\t" - "cmp x6, 0\n\t" - "b.eq 10f\n\t" - "mov x26, 256\n\t" - "clz x24, x6\n\t" - "sub x24, x26, x24\n\t" - "b 13f\n\t" + "ldr x20, [%[m], 0]\n\t" + "ldr x21, [%[m], 8]\n\t" + "ldr x22, [%[m], 16]\n\t" + "ldr %[m], [%[m], 24]\n\t" + "ldr x7, [%[a], 0]\n\t" + "ldr x8, [%[a], 8]\n\t" + "ldr x9, [%[a], 16]\n\t" + "ldr x10, [%[a], 24]\n\t" + "mov x3, x20\n\t" + "mov x4, x21\n\t" + "mov x5, x22\n\t" + "mov x6, %[m]\n\t" + "mov x11, xzr\n\t" + "mov x12, xzr\n\t" + "mov x13, xzr\n\t" + "mov x14, xzr\n\t" + "mov x15, 1\n\t" + "mov x16, xzr\n\t" + "mov x17, xzr\n\t" + "mov x19, xzr\n\t" + "cmp x6, 0\n\t" + "b.eq 10f\n\t" + "mov x25, 256\n\t" + "clz x23, x6\n\t" + "sub x23, x25, x23\n\t" + "b 13f\n\t" "\n10:\n\t" - "cmp x5, 0\n\t" - "b.eq 11f\n\t" - "mov x26, 192\n\t" - "clz x24, x5\n\t" - "sub x24, x26, x24\n\t" - "b 13f\n\t" + "cmp x5, 0\n\t" + "b.eq 11f\n\t" + "mov x25, 192\n\t" + "clz x23, x5\n\t" + "sub x23, x25, x23\n\t" + "b 13f\n\t" "\n11:\n\t" - "cmp x4, 0\n\t" - "b.eq 12f\n\t" - "mov x26, 128\n\t" - "clz x24, x4\n\t" - "sub x24, x26, x24\n\t" - "b 13f\n\t" + "cmp x4, 0\n\t" + "b.eq 12f\n\t" + "mov x25, 128\n\t" + "clz x23, x4\n\t" + "sub x23, x25, x23\n\t" + "b 13f\n\t" "\n12:\n\t" - "mov x26, 64\n\t" - "clz x24, x3\n\t" - "sub x24, x26, x24\n\t" + "mov x25, 64\n\t" + "clz x23, x3\n\t" + "sub x23, x25, x23\n\t" "\n13:\n\t" - "cmp x10, 0\n\t" - "b.eq 20f\n\t" - "mov x26, 256\n\t" - "clz x25, x10\n\t" - "sub x25, x26, x25\n\t" - "b 23f\n\t" + "cmp x10, 0\n\t" + "b.eq 20f\n\t" + "mov x25, 256\n\t" + "clz x24, x10\n\t" + "sub x24, x25, x24\n\t" + "b 23f\n\t" "\n20:\n\t" - "cmp x9, 0\n\t" - "b.eq 21f\n\t" - "mov x26, 192\n\t" - "clz x25, x9\n\t" - "sub x25, x26, x25\n\t" - "b 23f\n\t" + "cmp x9, 0\n\t" + "b.eq 21f\n\t" + "mov x25, 192\n\t" + "clz x24, x9\n\t" + "sub x24, x25, x24\n\t" + "b 23f\n\t" "\n21:\n\t" - "cmp x8, 0\n\t" - "b.eq 22f\n\t" - "mov x26, 128\n\t" - "clz x25, x8\n\t" - "sub x25, x26, x25\n\t" - "b 23f\n\t" + "cmp x8, 0\n\t" + "b.eq 22f\n\t" + "mov x25, 128\n\t" + "clz x24, x8\n\t" + "sub x24, x25, x24\n\t" + "b 23f\n\t" "\n22:\n\t" - "mov x26, 64\n\t" - "clz x25, x7\n\t" - "sub x25, x26, x25\n\t" + "mov x25, 64\n\t" + "clz x24, x7\n\t" + "sub x24, x25, x24\n\t" "\n23:\n\t" - "tst x7, 1\n\t" - "b.ne 90f\n\t" + "tst x7, 1\n\t" + "b.ne 90f\n\t" "\n1:\n\t" - "lsr x7, x7, 1\n\t" - "lsr x27, x8, 1\n\t" - "lsr x28, x9, 1\n\t" - "orr x7, x7, x8, lsl 63\n\t" - "orr x8, x27, x9, lsl 63\n\t" - "orr x9, x28, x10, lsl 63\n\t" - "lsr x10, x10, 1\n\t" - "sub x25, x25, 1\n\t" - "ands x26, x15, 1\n\t" - "b.eq 2f\n\t" - "adds x15, x15, x20\n\t" - "adcs x16, x16, x21\n\t" - "adcs x17, x17, x22\n\t" - "adcs x19, x19, x23\n\t" - "cset x26, cs\n\t" + "lsr x7, x7, 1\n\t" + "lsr x26, x8, 1\n\t" + "lsr x27, x9, 1\n\t" + "orr x7, x7, x8, lsl 63\n\t" + "orr x8, x26, x9, lsl 63\n\t" + "orr x9, x27, x10, lsl 63\n\t" + "lsr x10, x10, 1\n\t" + "sub x24, x24, 1\n\t" + "ands x25, x15, 1\n\t" + "b.eq 2f\n\t" + "adds x15, x15, x20\n\t" + "adcs x16, x16, x21\n\t" + "adcs x17, x17, x22\n\t" + "adcs x19, x19, %[m]\n\t" + "cset x25, cs\n\t" "\n2:\n\t" - "lsr x15, x15, 1\n\t" - "lsr x27, x16, 1\n\t" - "lsr x28, x17, 1\n\t" - "lsr x29, x19, 1\n\t" - "orr x15, x15, x16, lsl 63\n\t" - "orr x16, x27, x17, lsl 63\n\t" - "orr x17, x28, x19, lsl 63\n\t" - "orr x19, x29, x26, lsl 63\n\t" - "tst x7, 1\n\t" - "b.eq 1b\n\t" + "lsr x15, x15, 1\n\t" + "lsr x26, x16, 1\n\t" + "lsr x27, x17, 1\n\t" + "lsr x28, x19, 1\n\t" + "orr x15, x15, x16, lsl 63\n\t" + "orr x16, x26, x17, lsl 63\n\t" + "orr x17, x27, x19, lsl 63\n\t" + "orr x19, x28, x25, lsl 63\n\t" + "tst x7, 1\n\t" + "b.eq 1b\n\t" "\n90:\n\t" - "cmp x24, 1\n\t" - "b.eq 100f\n\t" - "cmp x25, 1\n\t" - "b.eq 101f\n\t" - "cmp x24, x25\n\t" - "b.hi 91f\n\t" - "b.cc 92f\n\t" - "cmp x6, x10\n\t" - "b.hi 91f\n\t" - "b.cc 92f\n\t" - "cmp x5, x9\n\t" - "b.hi 91f\n\t" - "b.cc 92f\n\t" - "cmp x4, x8\n\t" - "b.hi 91f\n\t" - "b.cc 92f\n\t" - "cmp x3, x7\n\t" - "b.cc 92f\n\t" + "cmp x23, 1\n\t" + "b.eq 100f\n\t" + "cmp x24, 1\n\t" + "b.eq 101f\n\t" + "cmp x23, x24\n\t" + "b.hi 91f\n\t" + "b.cc 92f\n\t" + "cmp x6, x10\n\t" + "b.hi 91f\n\t" + "b.cc 92f\n\t" + "cmp x5, x9\n\t" + "b.hi 91f\n\t" + "b.cc 92f\n\t" + "cmp x4, x8\n\t" + "b.hi 91f\n\t" + "b.cc 92f\n\t" + "cmp x3, x7\n\t" + "b.cc 92f\n\t" "\n91:\n\t" - "subs x3, x3, x7\n\t" - "sbcs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbc x6, x6, x10\n\t" - "subs x11, x11, x15\n\t" - "sbcs x12, x12, x16\n\t" - "sbcs x13, x13, x17\n\t" - "sbcs x14, x14, x19\n\t" - "b.cs 30f\n\t" - "adds x11, x11, x20\n\t" - "adcs x12, x12, x21\n\t" - "adcs x13, x13, x22\n\t" - "adc x14, x14, x23\n\t" + "subs x3, x3, x7\n\t" + "sbcs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbc x6, x6, x10\n\t" + "subs x11, x11, x15\n\t" + "sbcs x12, x12, x16\n\t" + "sbcs x13, x13, x17\n\t" + "sbcs x14, x14, x19\n\t" + "b.cs 30f\n\t" + "adds x11, x11, x20\n\t" + "adcs x12, x12, x21\n\t" + "adcs x13, x13, x22\n\t" + "adc x14, x14, %[m]\n\t" "\n30:\n\t" - "cmp x6, 0\n\t" - "b.eq 40f\n\t" - "mov x26, 256\n\t" - "clz x24, x6\n\t" - "sub x24, x26, x24\n\t" - "b 43f\n\t" + "cmp x6, 0\n\t" + "b.eq 40f\n\t" + "mov x25, 256\n\t" + "clz x23, x6\n\t" + "sub x23, x25, x23\n\t" + "b 43f\n\t" "\n40:\n\t" - "cmp x5, 0\n\t" - "b.eq 41f\n\t" - "mov x26, 192\n\t" - "clz x24, x5\n\t" - "sub x24, x26, x24\n\t" - "b 43f\n\t" + "cmp x5, 0\n\t" + "b.eq 41f\n\t" + "mov x25, 192\n\t" + "clz x23, x5\n\t" + "sub x23, x25, x23\n\t" + "b 43f\n\t" "\n41:\n\t" - "cmp x4, 0\n\t" - "b.eq 42f\n\t" - "mov x26, 128\n\t" - "clz x24, x4\n\t" - "sub x24, x26, x24\n\t" - "b 43f\n\t" + "cmp x4, 0\n\t" + "b.eq 42f\n\t" + "mov x25, 128\n\t" + "clz x23, x4\n\t" + "sub x23, x25, x23\n\t" + "b 43f\n\t" "\n42:\n\t" - "mov x26, 64\n\t" - "clz x24, x3\n\t" - "sub x24, x26, x24\n\t" + "mov x25, 64\n\t" + "clz x23, x3\n\t" + "sub x23, x25, x23\n\t" "\n43:\n\t" "\n50:\n\t" - "lsr x3, x3, 1\n\t" - "lsr x27, x4, 1\n\t" - "lsr x28, x5, 1\n\t" - "orr x3, x3, x4, lsl 63\n\t" - "orr x4, x27, x5, lsl 63\n\t" - "orr x5, x28, x6, lsl 63\n\t" - "lsr x6, x6, 1\n\t" - "sub x24, x24, 1\n\t" - "ands x26, x11, 1\n\t" - "b.eq 51f\n\t" - "adds x11, x11, x20\n\t" - "adcs x12, x12, x21\n\t" - "adcs x13, x13, x22\n\t" - "adcs x14, x14, x23\n\t" - "cset x26, cs\n\t" + "lsr x3, x3, 1\n\t" + "lsr x26, x4, 1\n\t" + "lsr x27, x5, 1\n\t" + "orr x3, x3, x4, lsl 63\n\t" + "orr x4, x26, x5, lsl 63\n\t" + "orr x5, x27, x6, lsl 63\n\t" + "lsr x6, x6, 1\n\t" + "sub x23, x23, 1\n\t" + "ands x25, x11, 1\n\t" + "b.eq 51f\n\t" + "adds x11, x11, x20\n\t" + "adcs x12, x12, x21\n\t" + "adcs x13, x13, x22\n\t" + "adcs x14, x14, %[m]\n\t" + "cset x25, cs\n\t" "\n51:\n\t" - "lsr x11, x11, 1\n\t" - "lsr x27, x12, 1\n\t" - "lsr x28, x13, 1\n\t" - "lsr x29, x14, 1\n\t" - "orr x11, x11, x12, lsl 63\n\t" - "orr x12, x27, x13, lsl 63\n\t" - "orr x13, x28, x14, lsl 63\n\t" - "orr x14, x29, x26, lsl 63\n\t" - "tst x3, 1\n\t" - "b.eq 50b\n\t" - "b 90b\n\t" + "lsr x11, x11, 1\n\t" + "lsr x26, x12, 1\n\t" + "lsr x27, x13, 1\n\t" + "lsr x28, x14, 1\n\t" + "orr x11, x11, x12, lsl 63\n\t" + "orr x12, x26, x13, lsl 63\n\t" + "orr x13, x27, x14, lsl 63\n\t" + "orr x14, x28, x25, lsl 63\n\t" + "tst x3, 1\n\t" + "b.eq 50b\n\t" + "b 90b\n\t" "\n92:\n\t" - "subs x7, x7, x3\n\t" - "sbcs x8, x8, x4\n\t" - "sbcs x9, x9, x5\n\t" - "sbc x10, x10, x6\n\t" - "subs x15, x15, x11\n\t" - "sbcs x16, x16, x12\n\t" - "sbcs x17, x17, x13\n\t" - "sbcs x19, x19, x14\n\t" - "b.cs 60f\n\t" - "adds x15, x15, x20\n\t" - "adcs x16, x16, x21\n\t" - "adcs x17, x17, x22\n\t" - "adc x19, x19, x23\n\t" + "subs x7, x7, x3\n\t" + "sbcs x8, x8, x4\n\t" + "sbcs x9, x9, x5\n\t" + "sbc x10, x10, x6\n\t" + "subs x15, x15, x11\n\t" + "sbcs x16, x16, x12\n\t" + "sbcs x17, x17, x13\n\t" + "sbcs x19, x19, x14\n\t" + "b.cs 60f\n\t" + "adds x15, x15, x20\n\t" + "adcs x16, x16, x21\n\t" + "adcs x17, x17, x22\n\t" + "adc x19, x19, %[m]\n\t" "\n60:\n\t" - "cmp x10, 0\n\t" - "b.eq 70f\n\t" - "mov x26, 256\n\t" - "clz x25, x10\n\t" - "sub x25, x26, x25\n\t" - "b 73f\n\t" + "cmp x10, 0\n\t" + "b.eq 70f\n\t" + "mov x25, 256\n\t" + "clz x24, x10\n\t" + "sub x24, x25, x24\n\t" + "b 73f\n\t" "\n70:\n\t" - "cmp x9, 0\n\t" - "b.eq 71f\n\t" - "mov x26, 192\n\t" - "clz x25, x9\n\t" - "sub x25, x26, x25\n\t" - "b 73f\n\t" + "cmp x9, 0\n\t" + "b.eq 71f\n\t" + "mov x25, 192\n\t" + "clz x24, x9\n\t" + "sub x24, x25, x24\n\t" + "b 73f\n\t" "\n71:\n\t" - "cmp x8, 0\n\t" - "b.eq 72f\n\t" - "mov x26, 128\n\t" - "clz x25, x8\n\t" - "sub x25, x26, x25\n\t" - "b 73f\n\t" + "cmp x8, 0\n\t" + "b.eq 72f\n\t" + "mov x25, 128\n\t" + "clz x24, x8\n\t" + "sub x24, x25, x24\n\t" + "b 73f\n\t" "\n72:\n\t" - "mov x26, 64\n\t" - "clz x25, x7\n\t" - "sub x25, x26, x25\n\t" + "mov x25, 64\n\t" + "clz x24, x7\n\t" + "sub x24, x25, x24\n\t" "\n73:\n\t" "\n80:\n\t" - "lsr x7, x7, 1\n\t" - "lsr x27, x8, 1\n\t" - "lsr x28, x9, 1\n\t" - "orr x7, x7, x8, lsl 63\n\t" - "orr x8, x27, x9, lsl 63\n\t" - "orr x9, x28, x10, lsl 63\n\t" - "lsr x10, x10, 1\n\t" - "sub x25, x25, 1\n\t" - "ands x26, x15, 1\n\t" - "b.eq 81f\n\t" - "adds x15, x15, x20\n\t" - "adcs x16, x16, x21\n\t" - "adcs x17, x17, x22\n\t" - "adcs x19, x19, x23\n\t" - "cset x26, cs\n\t" + "lsr x7, x7, 1\n\t" + "lsr x26, x8, 1\n\t" + "lsr x27, x9, 1\n\t" + "orr x7, x7, x8, lsl 63\n\t" + "orr x8, x26, x9, lsl 63\n\t" + "orr x9, x27, x10, lsl 63\n\t" + "lsr x10, x10, 1\n\t" + "sub x24, x24, 1\n\t" + "ands x25, x15, 1\n\t" + "b.eq 81f\n\t" + "adds x15, x15, x20\n\t" + "adcs x16, x16, x21\n\t" + "adcs x17, x17, x22\n\t" + "adcs x19, x19, %[m]\n\t" + "cset x25, cs\n\t" "\n81:\n\t" - "lsr x15, x15, 1\n\t" - "lsr x27, x16, 1\n\t" - "lsr x28, x17, 1\n\t" - "lsr x29, x19, 1\n\t" - "orr x15, x15, x16, lsl 63\n\t" - "orr x16, x27, x17, lsl 63\n\t" - "orr x17, x28, x19, lsl 63\n\t" - "orr x19, x29, x26, lsl 63\n\t" - "tst x7, 1\n\t" - "b.eq 80b\n\t" - "b 90b\n\t" + "lsr x15, x15, 1\n\t" + "lsr x26, x16, 1\n\t" + "lsr x27, x17, 1\n\t" + "lsr x28, x19, 1\n\t" + "orr x15, x15, x16, lsl 63\n\t" + "orr x16, x26, x17, lsl 63\n\t" + "orr x17, x27, x19, lsl 63\n\t" + "orr x19, x28, x25, lsl 63\n\t" + "tst x7, 1\n\t" + "b.eq 80b\n\t" + "b 90b\n\t" "\n100:\n\t" - "str x11, [%[r], 0]\n\t" - "str x12, [%[r], 8]\n\t" - "str x13, [%[r], 16]\n\t" - "str x14, [%[r], 24]\n\t" - "b 102f\n\t" + "str x11, [%[r], 0]\n\t" + "str x12, [%[r], 8]\n\t" + "str x13, [%[r], 16]\n\t" + "str x14, [%[r], 24]\n\t" + "b 102f\n\t" "\n101:\n\t" - "str x15, [%[r], 0]\n\t" - "str x16, [%[r], 8]\n\t" - "str x17, [%[r], 16]\n\t" - "str x19, [%[r], 24]\n\t" + "str x15, [%[r], 0]\n\t" + "str x16, [%[r], 8]\n\t" + "str x17, [%[r], 16]\n\t" + "str x19, [%[r], 24]\n\t" "\n102:\n\t" - : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29" + : [m] "+r" (m) + : [r] "r" (r), [a] "r" (a) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); return MP_OKAY;