SP ARM32 ASM: Improve performance of P-256 mont mul/sqr

2025-07-29 18:27:29 +02:00 · 2022-06-15 16:39:13 +10:00
parent af4fff80db
commit e073500e8e
1 changed file with 146 additions and 198 deletions
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@ -29560,7 +29560,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "umull	r8, r9, r6, r7\n\t"
        "str	r8, [sp, #0]\n\t"
        "#  A[0] * B[1]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[b], #4]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
@ -29573,9 +29572,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, #0\n\t"
        "str	r9, [sp, #4]\n\t"
-        "#  A[0] * B[2]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #8]\n\t"
+        "#  A[2] * B[0]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adc	r14, r4, r14\n\t"
@ -29586,16 +29584,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, #0\n\t"
-        "#  A[2] * B[0]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[0] * B[2]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #8]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
        "str	r10, [sp, #8]\n\t"
        "#  A[0] * B[3]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[b], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
@ -29623,16 +29620,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r8, r4, r8\n\t"
        "adc	r9, r5, r9\n\t"
        "str	r14, [sp, #12]\n\t"
-        "#  A[0] * B[4]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[4] * B[0]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, #0\n\t"
-        "#  A[1] * B[3]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[3] * B[1]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -29644,23 +29640,22 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[3] * B[1]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[1] * B[3]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[4] * B[0]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[0] * B[4]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
        "str	r8, [sp, #16]\n\t"
        "#  A[0] * B[5]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[b], #20]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
@ -29702,30 +29697,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, r14\n\t"
        "str	r9, [sp, #20]\n\t"
-        "#  A[0] * B[6]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
+        "#  A[6] * B[0]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, #0\n\t"
-        "#  A[1] * B[5]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r10, r3, r10\n\t"
-        "adcs	r14, r4, r14\n\t"
-        "adc	r8, r5, r8\n\t"
-        "#  A[2] * B[4]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r10, r3, r10\n\t"
-        "adcs	r14, r4, r14\n\t"
-        "adc	r8, r5, r8\n\t"
-        "#  A[3] * B[3]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[5] * B[1]\n\t"
+        "ldr	r6, [%[a], #20]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
@ -29737,23 +29717,36 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
-        "#  A[5] * B[1]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[3] * B[3]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
-        "#  A[6] * B[0]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[2] * B[4]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r10, r3, r10\n\t"
+        "adcs	r14, r4, r14\n\t"
+        "adc	r8, r5, r8\n\t"
+        "#  A[1] * B[5]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r10, r3, r10\n\t"
+        "adcs	r14, r4, r14\n\t"
+        "adc	r8, r5, r8\n\t"
+        "#  A[0] * B[6]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
        "str	r10, [sp, #24]\n\t"
        "#  A[0] * B[7]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
@ -29809,30 +29802,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r8, r4, r8\n\t"
        "adc	r9, r5, r9\n\t"
        "str	r14, [sp, #28]\n\t"
-        "#  A[1] * B[7]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[1]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, #0\n\t"
-        "#  A[2] * B[6]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r8, r3, r8\n\t"
-        "adcs	r9, r4, r9\n\t"
-        "adc	r10, r5, r10\n\t"
-        "#  A[3] * B[5]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r8, r3, r8\n\t"
-        "adcs	r9, r4, r9\n\t"
-        "adc	r10, r5, r10\n\t"
-        "#  A[4] * B[4]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[6] * B[2]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
+        "ldr	r7, [%[b], #8]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -29844,16 +29822,30 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[6] * B[2]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #8]\n\t"
+        "#  A[4] * B[4]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[7] * B[1]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[3] * B[5]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r8, r3, r8\n\t"
+        "adcs	r9, r4, r9\n\t"
+        "adc	r10, r5, r10\n\t"
+        "#  A[2] * B[6]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r8, r3, r8\n\t"
+        "adcs	r9, r4, r9\n\t"
+        "adc	r10, r5, r10\n\t"
+        "#  A[1] * B[7]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -29861,7 +29853,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "str	r8, [sp, #32]\n\t"
        "#  A[2] * B[7]\n\t"
        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
        "adcs	r10, r4, r10\n\t"
@ -29902,16 +29893,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, r14\n\t"
        "str	r9, [sp, #36]\n\t"
-        "#  A[3] * B[7]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[3]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, #0\n\t"
-        "#  A[4] * B[6]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
+        "#  A[6] * B[4]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
@ -29923,16 +29913,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
-        "#  A[6] * B[4]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[4] * B[6]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
-        "#  A[7] * B[3]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[3] * B[7]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
@ -29940,7 +29930,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "str	r10, [sp, #40]\n\t"
        "#  A[4] * B[7]\n\t"
        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
        "adcs	r8, r4, r8\n\t"
@ -29967,9 +29956,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r8, r4, r8\n\t"
        "adc	r9, r5, r9\n\t"
        "str	r14, [sp, #44]\n\t"
-        "#  A[5] * B[7]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[5]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -29981,16 +29969,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[7] * B[5]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
+        "#  A[5] * B[7]\n\t"
+        "ldr	r6, [%[a], #20]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
        "#  A[6] * B[7]\n\t"
        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
        "adcs	r10, r4, r10\n\t"
@ -30003,7 +29990,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, r14\n\t"
        "#  A[7] * B[7]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
        "ldr	r7, [%[b], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
@ -30021,16 +30007,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "ldr	r9, [sp, #20]\n\t"
        "ldr	r10, [sp, #24]\n\t"
        "ldr	r14, [sp, #28]\n\t"
-        "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
+        "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
        "#    - a[0] << 224\n\t"
-        "#   + (a[0]-a[1] * 2) << (6 * 32)\n\t"
+        "#   + (a[0..1] * 2) << (6 * 32)\n\t"
        "adds	r10, r10, r4\n\t"
        "adc	r14, r14, r5\n\t"
        "adds	r10, r10, r4\n\t"
        "adc	r14, r14, r5\n\t"
        "#   - a[0] << (7 * 32)\n\t"
        "sub	r14, r14, r4\n\t"
-        "#   + a[0]-a[4] << (3 * 32)\n\t"
+        "#   + a[0..4] << (3 * 32)\n\t"
        "mov	%[a], r7\n\t"
        "mov	%[b], r8\n\t"
        "adds	r7, r7, r4\n\t"
@ -30038,9 +30024,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "adcs	r9, r9, r6\n\t"
        "adcs	r10, r10, %[a]\n\t"
        "adc	r14, r14, %[b]\n\t"
-        "str	r4, [sp, #0]\n\t"
-        "str	r5, [sp, #4]\n\t"
-        "str	r6, [sp, #8]\n\t"
        "str	r7, [sp, #12]\n\t"
        "str	r8, [sp, #16]\n\t"
        "str	r9, [sp, #20]\n\t"
@ -30156,38 +30139,28 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
        "sbcs	r5, r5, r10\n\t"
        "sbcs	r6, r6, r14\n\t"
        "sbc	r7, r7, #0\n\t"
-        "str	r3, [sp, #44]\n\t"
-        "str	r4, [sp, #48]\n\t"
-        "str	r5, [sp, #52]\n\t"
-        "str	r6, [sp, #56]\n\t"
-        "str	r7, [sp, #60]\n\t"
        "# mask m and sub from result if overflow\n\t"
        "sub	%[b], %[a], %[b]\n\t"
        "and	%[a], %[b], #1\n\t"
-        "ldr	r3, [sp, #32]\n\t"
-        "ldr	r4, [sp, #36]\n\t"
-        "ldr	r5, [sp, #40]\n\t"
-        "ldr	r6, [sp, #44]\n\t"
-        "ldr	r7, [sp, #48]\n\t"
-        "ldr	r8, [sp, #52]\n\t"
-        "ldr	r9, [sp, #56]\n\t"
-        "ldr	r10, [sp, #60]\n\t"
-        "subs	r3, r3, %[b]\n\t"
-        "sbcs	r4, r4, %[b]\n\t"
-        "sbcs	r5, r5, %[b]\n\t"
-        "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, #0\n\t"
-        "sbcs	r8, r8, #0\n\t"
-        "sbcs	r9, r9, %[a]\n\t"
-        "sbc	r10, r10, %[b]\n\t"
-        "str	r3, [%[r], #0]\n\t"
-        "str	r4, [%[r], #4]\n\t"
-        "str	r5, [%[r], #8]\n\t"
-        "str	r6, [%[r], #12]\n\t"
-        "str	r7, [%[r], #16]\n\t"
-        "str	r8, [%[r], #20]\n\t"
-        "str	r9, [%[r], #24]\n\t"
-        "str	r10, [%[r], #28]\n\t"
+        "ldr	r8, [sp, #32]\n\t"
+        "ldr	r9, [sp, #36]\n\t"
+        "ldr	r10, [sp, #40]\n\t"
+        "subs	r8, r8, %[b]\n\t"
+        "sbcs	r9, r9, %[b]\n\t"
+        "sbcs	r10, r10, %[b]\n\t"
+        "sbcs	r3, r3, #0\n\t"
+        "sbcs	r4, r4, #0\n\t"
+        "sbcs	r5, r5, #0\n\t"
+        "sbcs	r6, r6, %[a]\n\t"
+        "sbc	r7, r7, %[b]\n\t"
+        "str	r8, [%[r], #0]\n\t"
+        "str	r9, [%[r], #4]\n\t"
+        "str	r10, [%[r], #8]\n\t"
+        "str	r3, [%[r], #12]\n\t"
+        "str	r4, [%[r], #16]\n\t"
+        "str	r5, [%[r], #20]\n\t"
+        "str	r6, [%[r], #24]\n\t"
+        "str	r7, [%[r], #28]\n\t"
        "add	sp, sp, #68\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        : [r] "r" (r)
@ -30217,14 +30190,12 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "umull	r9, r10, r6, r7\n\t"
        "str	r9, [sp, #4]\n\t"
        "#  A[0] * A[2]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[a], #8]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adc	r14, r4, #0\n\t"
        "str	r10, [sp, #8]\n\t"
        "#  A[0] * A[3]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[a], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
@ -30237,22 +30208,20 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adcs	r8, r4, r8\n\t"
        "adc	r9, r5, #0\n\t"
        "str	r14, [sp, #12]\n\t"
-        "#  A[0] * A[4]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[a], #16]\n\t"
+        "#  A[1] * A[3]\n\t"
+        "ldr	r7, [%[a], #12]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adc	r9, r4, r9\n\t"
-        "#  A[1] * A[3]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[a], #12]\n\t"
+        "#  A[0] * A[4]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[a], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, #0\n\t"
        "str	r8, [sp, #16]\n\t"
        "#  A[0] * A[5]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[a], #20]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
@ -30272,9 +30241,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, r14\n\t"
        "str	r9, [sp, #20]\n\t"
-        "#  A[0] * A[6]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[a], #24]\n\t"
+        "#  A[2] * A[4]\n\t"
+        "ldr	r7, [%[a], #16]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
@ -30286,16 +30254,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
-        "#  A[2] * A[4]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[a], #16]\n\t"
+        "#  A[0] * A[6]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[a], #24]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, r8\n\t"
        "str	r10, [sp, #24]\n\t"
        "#  A[0] * A[7]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
@ -30323,9 +30290,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adcs	r8, r4, r8\n\t"
        "adc	r9, r5, r9\n\t"
        "str	r14, [sp, #28]\n\t"
-        "#  A[1] * A[7]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
+        "#  A[3] * A[5]\n\t"
+        "ldr	r7, [%[a], #20]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -30337,9 +30303,9 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
        "adc	r10, r5, r10\n\t"
-        "#  A[3] * A[5]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[a], #20]\n\t"
+        "#  A[1] * A[7]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
        "adcs	r9, r4, r9\n\t"
@ -30347,7 +30313,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "str	r8, [sp, #32]\n\t"
        "#  A[2] * A[7]\n\t"
        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
        "adcs	r10, r4, r10\n\t"
@ -30367,16 +30332,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adcs	r10, r4, r10\n\t"
        "adc	r14, r5, r14\n\t"
        "str	r9, [sp, #36]\n\t"
-        "#  A[3] * A[7]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
+        "#  A[4] * A[6]\n\t"
+        "ldr	r7, [%[a], #24]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
        "adc	r8, r5, #0\n\t"
-        "#  A[4] * A[6]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[a], #24]\n\t"
+        "#  A[3] * A[7]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r10, r3, r10\n\t"
        "adcs	r14, r4, r14\n\t"
@ -30384,7 +30348,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "str	r10, [sp, #40]\n\t"
        "#  A[4] * A[7]\n\t"
        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r14, r3, r14\n\t"
        "adcs	r8, r4, r8\n\t"
@ -30398,7 +30361,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adc	r9, r5, r9\n\t"
        "str	r14, [sp, #44]\n\t"
        "#  A[5] * A[7]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r8, r3, r8\n\t"
@ -30407,7 +30369,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "str	r8, [sp, #48]\n\t"
        "#  A[6] * A[7]\n\t"
        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
        "umull	r3, r4, r6, r7\n\t"
        "adds	r9, r3, r9\n\t"
        "adc	r10, r4, r10\n\t"
@ -30537,16 +30498,16 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "ldr	r9, [sp, #20]\n\t"
        "ldr	r10, [sp, #24]\n\t"
        "ldr	r14, [sp, #28]\n\t"
-        "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
+        "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
        "#    - a[0] << 224\n\t"
-        "#   + (a[0]-a[1] * 2) << (6 * 32)\n\t"
+        "#   + (a[0..1] * 2) << (6 * 32)\n\t"
        "adds	r10, r10, r4\n\t"
        "adc	r14, r14, r5\n\t"
        "adds	r10, r10, r4\n\t"
        "adc	r14, r14, r5\n\t"
        "#   - a[0] << (7 * 32)\n\t"
        "sub	r14, r14, r4\n\t"
-        "#   + a[0]-a[4] << (3 * 32)\n\t"
+        "#   + a[0..4] << (3 * 32)\n\t"
        "mov	%[a], r7\n\t"
        "mov	r12, r8\n\t"
        "adds	r7, r7, r4\n\t"
@ -30554,9 +30515,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "adcs	r9, r9, r6\n\t"
        "adcs	r10, r10, %[a]\n\t"
        "adc	r14, r14, r12\n\t"
-        "str	r4, [sp, #0]\n\t"
-        "str	r5, [sp, #4]\n\t"
-        "str	r6, [sp, #8]\n\t"
        "str	r7, [sp, #12]\n\t"
        "str	r8, [sp, #16]\n\t"
        "str	r9, [sp, #20]\n\t"
@ -30672,38 +30630,28 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
        "sbcs	r5, r5, r10\n\t"
        "sbcs	r6, r6, r14\n\t"
        "sbc	r7, r7, #0\n\t"
-        "str	r3, [sp, #44]\n\t"
-        "str	r4, [sp, #48]\n\t"
-        "str	r5, [sp, #52]\n\t"
-        "str	r6, [sp, #56]\n\t"
-        "str	r7, [sp, #60]\n\t"
        "# mask m and sub from result if overflow\n\t"
        "sub	r12, %[a], r12\n\t"
        "and	%[a], r12, #1\n\t"
-        "ldr	r3, [sp, #32]\n\t"
-        "ldr	r4, [sp, #36]\n\t"
-        "ldr	r5, [sp, #40]\n\t"
-        "ldr	r6, [sp, #44]\n\t"
-        "ldr	r7, [sp, #48]\n\t"
-        "ldr	r8, [sp, #52]\n\t"
-        "ldr	r9, [sp, #56]\n\t"
-        "ldr	r10, [sp, #60]\n\t"
-        "subs	r3, r3, r12\n\t"
-        "sbcs	r4, r4, r12\n\t"
-        "sbcs	r5, r5, r12\n\t"
-        "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, #0\n\t"
-        "sbcs	r8, r8, #0\n\t"
-        "sbcs	r9, r9, %[a]\n\t"
-        "sbc	r10, r10, r12\n\t"
-        "str	r3, [%[r], #0]\n\t"
-        "str	r4, [%[r], #4]\n\t"
-        "str	r5, [%[r], #8]\n\t"
-        "str	r6, [%[r], #12]\n\t"
-        "str	r7, [%[r], #16]\n\t"
-        "str	r8, [%[r], #20]\n\t"
-        "str	r9, [%[r], #24]\n\t"
-        "str	r10, [%[r], #28]\n\t"
+        "ldr	r8, [sp, #32]\n\t"
+        "ldr	r9, [sp, #36]\n\t"
+        "ldr	r10, [sp, #40]\n\t"
+        "subs	r8, r8, r12\n\t"
+        "sbcs	r9, r9, r12\n\t"
+        "sbcs	r10, r10, r12\n\t"
+        "sbcs	r3, r3, #0\n\t"
+        "sbcs	r4, r4, #0\n\t"
+        "sbcs	r5, r5, #0\n\t"
+        "sbcs	r6, r6, %[a]\n\t"
+        "sbc	r7, r7, r12\n\t"
+        "str	r8, [%[r], #0]\n\t"
+        "str	r9, [%[r], #4]\n\t"
+        "str	r10, [%[r], #8]\n\t"
+        "str	r3, [%[r], #12]\n\t"
+        "str	r4, [%[r], #16]\n\t"
+        "str	r5, [%[r], #20]\n\t"
+        "str	r6, [%[r], #24]\n\t"
+        "str	r7, [%[r], #28]\n\t"
        "add	sp, sp, #68\n\t"
        : [a] "+r" (a)
        : [r] "r" (r)