diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 916f06d26..3d3d0d3f6 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -29560,7 +29560,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "umull r8, r9, r6, r7\n\t" "str r8, [sp, #0]\n\t" "# A[0] * B[1]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[b], #4]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" @@ -29573,9 +29572,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, #0\n\t" "str r9, [sp, #4]\n\t" - "# A[0] * B[2]\n\t" - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #8]\n\t" + "# A[2] * B[0]\n\t" + "ldr r6, [%[a], #8]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adc r14, r4, r14\n\t" @@ -29586,16 +29584,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, #0\n\t" - "# A[2] * B[0]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #0]\n\t" + "# A[0] * B[2]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r7, [%[b], #8]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" "str r10, [sp, #8]\n\t" "# A[0] * B[3]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[b], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" @@ -29623,16 +29620,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r8, r4, r8\n\t" "adc r9, r5, r9\n\t" "str r14, [sp, #12]\n\t" - "# A[0] * B[4]\n\t" - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #16]\n\t" + "# A[4] * B[0]\n\t" + "ldr r6, [%[a], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, #0\n\t" - "# A[1] * B[3]\n\t" - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #12]\n\t" + "# A[3] * B[1]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #4]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -29644,23 +29640,22 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[3] * B[1]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #4]\n\t" + "# A[1] * B[3]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r7, [%[b], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[4] * B[0]\n\t" - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #0]\n\t" + "# A[0] * B[4]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r7, [%[b], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" "str r8, [sp, #16]\n\t" "# A[0] * B[5]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[b], #20]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" @@ -29702,30 +29697,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, r14\n\t" "str r9, [sp, #20]\n\t" - "# A[0] * B[6]\n\t" - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[b], #24]\n\t" + "# A[6] * B[0]\n\t" + "ldr r6, [%[a], #24]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, #0\n\t" - "# A[1] * B[5]\n\t" - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r3, r4, r6, r7\n\t" - "adds r10, r3, r10\n\t" - "adcs r14, r4, r14\n\t" - "adc r8, r5, r8\n\t" - "# A[2] * B[4]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #16]\n\t" - "umull r3, r4, r6, r7\n\t" - "adds r10, r3, r10\n\t" - "adcs r14, r4, r14\n\t" - "adc r8, r5, r8\n\t" - "# A[3] * B[3]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #12]\n\t" + "# A[5] * B[1]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r7, [%[b], #4]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" @@ -29737,23 +29717,36 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" - "# A[5] * B[1]\n\t" - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #4]\n\t" + "# A[3] * B[3]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" - "# A[6] * B[0]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #0]\n\t" + "# A[2] * B[4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[b], #16]\n\t" + "umull r3, r4, r6, r7\n\t" + "adds r10, r3, r10\n\t" + "adcs r14, r4, r14\n\t" + "adc r8, r5, r8\n\t" + "# A[1] * B[5]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r7, [%[b], #20]\n\t" + "umull r3, r4, r6, r7\n\t" + "adds r10, r3, r10\n\t" + "adcs r14, r4, r14\n\t" + "adc r8, r5, r8\n\t" + "# A[0] * B[6]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r7, [%[b], #24]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" "str r10, [sp, #24]\n\t" "# A[0] * B[7]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" @@ -29809,30 +29802,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r8, r4, r8\n\t" "adc r9, r5, r9\n\t" "str r14, [sp, #28]\n\t" - "# A[1] * B[7]\n\t" - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[b], #28]\n\t" + "# A[7] * B[1]\n\t" + "ldr r7, [%[b], #4]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, #0\n\t" - "# A[2] * B[6]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #24]\n\t" - "umull r3, r4, r6, r7\n\t" - "adds r8, r3, r8\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "# A[3] * B[5]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #20]\n\t" - "umull r3, r4, r6, r7\n\t" - "adds r8, r3, r8\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "# A[4] * B[4]\n\t" - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #16]\n\t" + "# A[6] * B[2]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r7, [%[b], #8]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -29844,16 +29822,30 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[6] * B[2]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #8]\n\t" + "# A[4] * B[4]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r7, [%[b], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[7] * B[1]\n\t" - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #4]\n\t" + "# A[3] * B[5]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #20]\n\t" + "umull r3, r4, r6, r7\n\t" + "adds r8, r3, r8\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[2] * B[6]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[b], #24]\n\t" + "umull r3, r4, r6, r7\n\t" + "adds r8, r3, r8\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[1] * B[7]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -29861,7 +29853,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "str r8, [sp, #32]\n\t" "# A[2] * B[7]\n\t" "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" "adcs r10, r4, r10\n\t" @@ -29902,16 +29893,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, r14\n\t" "str r9, [sp, #36]\n\t" - "# A[3] * B[7]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[b], #28]\n\t" + "# A[7] * B[3]\n\t" + "ldr r7, [%[b], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, #0\n\t" - "# A[4] * B[6]\n\t" - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #24]\n\t" + "# A[6] * B[4]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r7, [%[b], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" @@ -29923,16 +29913,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" - "# A[6] * B[4]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #16]\n\t" + "# A[4] * B[6]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r7, [%[b], #24]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" - "# A[7] * B[3]\n\t" - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #12]\n\t" + "# A[3] * B[7]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" @@ -29940,7 +29930,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "str r10, [sp, #40]\n\t" "# A[4] * B[7]\n\t" "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" "adcs r8, r4, r8\n\t" @@ -29967,9 +29956,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r8, r4, r8\n\t" "adc r9, r5, r9\n\t" "str r14, [sp, #44]\n\t" - "# A[5] * B[7]\n\t" - "ldr r6, [%[a], #20]\n\t" - "ldr r7, [%[b], #28]\n\t" + "# A[7] * B[5]\n\t" + "ldr r7, [%[b], #20]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -29981,16 +29969,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[7] * B[5]\n\t" - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[b], #20]\n\t" + "# A[5] * B[7]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" "# A[6] * B[7]\n\t" "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" "adcs r10, r4, r10\n\t" @@ -30003,7 +29990,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, r14\n\t" "# A[7] * B[7]\n\t" - "ldr r6, [%[a], #28]\n\t" "ldr r7, [%[b], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" @@ -30021,16 +30007,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "ldr r9, [sp, #20]\n\t" "ldr r10, [sp, #24]\n\t" "ldr r14, [sp, #28]\n\t" - "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" + "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t" "# - a[0] << 224\n\t" - "# + (a[0]-a[1] * 2) << (6 * 32)\n\t" + "# + (a[0..1] * 2) << (6 * 32)\n\t" "adds r10, r10, r4\n\t" "adc r14, r14, r5\n\t" "adds r10, r10, r4\n\t" "adc r14, r14, r5\n\t" "# - a[0] << (7 * 32)\n\t" "sub r14, r14, r4\n\t" - "# + a[0]-a[4] << (3 * 32)\n\t" + "# + a[0..4] << (3 * 32)\n\t" "mov %[a], r7\n\t" "mov %[b], r8\n\t" "adds r7, r7, r4\n\t" @@ -30038,9 +30024,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "adcs r9, r9, r6\n\t" "adcs r10, r10, %[a]\n\t" "adc r14, r14, %[b]\n\t" - "str r4, [sp, #0]\n\t" - "str r5, [sp, #4]\n\t" - "str r6, [sp, #8]\n\t" "str r7, [sp, #12]\n\t" "str r8, [sp, #16]\n\t" "str r9, [sp, #20]\n\t" @@ -30156,38 +30139,28 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const "sbcs r5, r5, r10\n\t" "sbcs r6, r6, r14\n\t" "sbc r7, r7, #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - "str r7, [sp, #60]\n\t" "# mask m and sub from result if overflow\n\t" "sub %[b], %[a], %[b]\n\t" "and %[a], %[b], #1\n\t" - "ldr r3, [sp, #32]\n\t" - "ldr r4, [sp, #36]\n\t" - "ldr r5, [sp, #40]\n\t" - "ldr r6, [sp, #44]\n\t" - "ldr r7, [sp, #48]\n\t" - "ldr r8, [sp, #52]\n\t" - "ldr r9, [sp, #56]\n\t" - "ldr r10, [sp, #60]\n\t" - "subs r3, r3, %[b]\n\t" - "sbcs r4, r4, %[b]\n\t" - "sbcs r5, r5, %[b]\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r7, r7, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbc r10, r10, %[b]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "str r7, [%[r], #16]\n\t" - "str r8, [%[r], #20]\n\t" - "str r9, [%[r], #24]\n\t" - "str r10, [%[r], #28]\n\t" + "ldr r8, [sp, #32]\n\t" + "ldr r9, [sp, #36]\n\t" + "ldr r10, [sp, #40]\n\t" + "subs r8, r8, %[b]\n\t" + "sbcs r9, r9, %[b]\n\t" + "sbcs r10, r10, %[b]\n\t" + "sbcs r3, r3, #0\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbc r7, r7, %[b]\n\t" + "str r8, [%[r], #0]\n\t" + "str r9, [%[r], #4]\n\t" + "str r10, [%[r], #8]\n\t" + "str r3, [%[r], #12]\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "str r6, [%[r], #24]\n\t" + "str r7, [%[r], #28]\n\t" "add sp, sp, #68\n\t" : [a] "+r" (a), [b] "+r" (b) : [r] "r" (r) @@ -30217,14 +30190,12 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "umull r9, r10, r6, r7\n\t" "str r9, [sp, #4]\n\t" "# A[0] * A[2]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #8]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adc r14, r4, #0\n\t" "str r10, [sp, #8]\n\t" "# A[0] * A[3]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" @@ -30237,22 +30208,20 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adcs r8, r4, r8\n\t" "adc r9, r5, #0\n\t" "str r14, [sp, #12]\n\t" - "# A[0] * A[4]\n\t" - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #16]\n\t" + "# A[1] * A[3]\n\t" + "ldr r7, [%[a], #12]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adc r9, r4, r9\n\t" - "# A[1] * A[3]\n\t" - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #12]\n\t" + "# A[0] * A[4]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r7, [%[a], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, #0\n\t" "str r8, [sp, #16]\n\t" "# A[0] * A[5]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #20]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" @@ -30272,9 +30241,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, r14\n\t" "str r9, [sp, #20]\n\t" - "# A[0] * A[6]\n\t" - "ldr r6, [%[a], #0]\n\t" - "ldr r7, [%[a], #24]\n\t" + "# A[2] * A[4]\n\t" + "ldr r7, [%[a], #16]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" @@ -30286,16 +30254,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" - "# A[2] * A[4]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #16]\n\t" + "# A[0] * A[6]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r7, [%[a], #24]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, r8\n\t" "str r10, [sp, #24]\n\t" "# A[0] * A[7]\n\t" - "ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" @@ -30323,9 +30290,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adcs r8, r4, r8\n\t" "adc r9, r5, r9\n\t" "str r14, [sp, #28]\n\t" - "# A[1] * A[7]\n\t" - "ldr r6, [%[a], #4]\n\t" - "ldr r7, [%[a], #28]\n\t" + "# A[3] * A[5]\n\t" + "ldr r7, [%[a], #20]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -30337,9 +30303,9 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" "adc r10, r5, r10\n\t" - "# A[3] * A[5]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #20]\n\t" + "# A[1] * A[7]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" "adcs r9, r4, r9\n\t" @@ -30347,7 +30313,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "str r8, [sp, #32]\n\t" "# A[2] * A[7]\n\t" "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" "adcs r10, r4, r10\n\t" @@ -30367,16 +30332,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adcs r10, r4, r10\n\t" "adc r14, r5, r14\n\t" "str r9, [sp, #36]\n\t" - "# A[3] * A[7]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[a], #28]\n\t" + "# A[4] * A[6]\n\t" + "ldr r7, [%[a], #24]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" "adc r8, r5, #0\n\t" - "# A[4] * A[6]\n\t" - "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #24]\n\t" + "# A[3] * A[7]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r10, r3, r10\n\t" "adcs r14, r4, r14\n\t" @@ -30384,7 +30348,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "str r10, [sp, #40]\n\t" "# A[4] * A[7]\n\t" "ldr r6, [%[a], #16]\n\t" - "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r14, r3, r14\n\t" "adcs r8, r4, r8\n\t" @@ -30398,7 +30361,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adc r9, r5, r9\n\t" "str r14, [sp, #44]\n\t" "# A[5] * A[7]\n\t" - "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r8, r3, r8\n\t" @@ -30407,7 +30369,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "str r8, [sp, #48]\n\t" "# A[6] * A[7]\n\t" "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" "umull r3, r4, r6, r7\n\t" "adds r9, r3, r9\n\t" "adc r10, r4, r10\n\t" @@ -30537,16 +30498,16 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "ldr r9, [sp, #20]\n\t" "ldr r10, [sp, #24]\n\t" "ldr r14, [sp, #28]\n\t" - "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" + "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t" "# - a[0] << 224\n\t" - "# + (a[0]-a[1] * 2) << (6 * 32)\n\t" + "# + (a[0..1] * 2) << (6 * 32)\n\t" "adds r10, r10, r4\n\t" "adc r14, r14, r5\n\t" "adds r10, r10, r4\n\t" "adc r14, r14, r5\n\t" "# - a[0] << (7 * 32)\n\t" "sub r14, r14, r4\n\t" - "# + a[0]-a[4] << (3 * 32)\n\t" + "# + a[0..4] << (3 * 32)\n\t" "mov %[a], r7\n\t" "mov r12, r8\n\t" "adds r7, r7, r4\n\t" @@ -30554,9 +30515,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "adcs r9, r9, r6\n\t" "adcs r10, r10, %[a]\n\t" "adc r14, r14, r12\n\t" - "str r4, [sp, #0]\n\t" - "str r5, [sp, #4]\n\t" - "str r6, [sp, #8]\n\t" "str r7, [sp, #12]\n\t" "str r8, [sp, #16]\n\t" "str r9, [sp, #20]\n\t" @@ -30672,38 +30630,28 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const "sbcs r5, r5, r10\n\t" "sbcs r6, r6, r14\n\t" "sbc r7, r7, #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - "str r7, [sp, #60]\n\t" "# mask m and sub from result if overflow\n\t" "sub r12, %[a], r12\n\t" "and %[a], r12, #1\n\t" - "ldr r3, [sp, #32]\n\t" - "ldr r4, [sp, #36]\n\t" - "ldr r5, [sp, #40]\n\t" - "ldr r6, [sp, #44]\n\t" - "ldr r7, [sp, #48]\n\t" - "ldr r8, [sp, #52]\n\t" - "ldr r9, [sp, #56]\n\t" - "ldr r10, [sp, #60]\n\t" - "subs r3, r3, r12\n\t" - "sbcs r4, r4, r12\n\t" - "sbcs r5, r5, r12\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r7, r7, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, %[a]\n\t" - "sbc r10, r10, r12\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "str r7, [%[r], #16]\n\t" - "str r8, [%[r], #20]\n\t" - "str r9, [%[r], #24]\n\t" - "str r10, [%[r], #28]\n\t" + "ldr r8, [sp, #32]\n\t" + "ldr r9, [sp, #36]\n\t" + "ldr r10, [sp, #40]\n\t" + "subs r8, r8, r12\n\t" + "sbcs r9, r9, r12\n\t" + "sbcs r10, r10, r12\n\t" + "sbcs r3, r3, #0\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, %[a]\n\t" + "sbc r7, r7, r12\n\t" + "str r8, [%[r], #0]\n\t" + "str r9, [%[r], #4]\n\t" + "str r10, [%[r], #8]\n\t" + "str r3, [%[r], #12]\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "str r6, [%[r], #24]\n\t" + "str r7, [%[r], #28]\n\t" "add sp, sp, #68\n\t" : [a] "+r" (a) : [r] "r" (r)