diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index 916f06d26..3d3d0d3f6 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -29560,7 +29560,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "umull	r8, r9, r6, r7\n\t"
         "str	r8, [sp, #0]\n\t"
         "#  A[0] * B[1]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[b], #4]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
@@ -29573,9 +29572,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, #0\n\t"
         "str	r9, [sp, #4]\n\t"
-        "#  A[0] * B[2]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #8]\n\t"
+        "#  A[2] * B[0]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adc	r14, r4, r14\n\t"
@@ -29586,16 +29584,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, #0\n\t"
-        "#  A[2] * B[0]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[0] * B[2]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #8]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
         "str	r10, [sp, #8]\n\t"
         "#  A[0] * B[3]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[b], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
@@ -29623,16 +29620,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r8, r4, r8\n\t"
         "adc	r9, r5, r9\n\t"
         "str	r14, [sp, #12]\n\t"
-        "#  A[0] * B[4]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[4] * B[0]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, #0\n\t"
-        "#  A[1] * B[3]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[3] * B[1]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -29644,23 +29640,22 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[3] * B[1]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[1] * B[3]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[4] * B[0]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[0] * B[4]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
         "str	r8, [sp, #16]\n\t"
         "#  A[0] * B[5]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[b], #20]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
@@ -29702,30 +29697,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, r14\n\t"
         "str	r9, [sp, #20]\n\t"
-        "#  A[0] * B[6]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
+        "#  A[6] * B[0]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, #0\n\t"
-        "#  A[1] * B[5]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r10, r3, r10\n\t"
-        "adcs	r14, r4, r14\n\t"
-        "adc	r8, r5, r8\n\t"
-        "#  A[2] * B[4]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r10, r3, r10\n\t"
-        "adcs	r14, r4, r14\n\t"
-        "adc	r8, r5, r8\n\t"
-        "#  A[3] * B[3]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[5] * B[1]\n\t"
+        "ldr	r6, [%[a], #20]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
@@ -29737,23 +29717,36 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
-        "#  A[5] * B[1]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[3] * B[3]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
-        "#  A[6] * B[0]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #0]\n\t"
+        "#  A[2] * B[4]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r10, r3, r10\n\t"
+        "adcs	r14, r4, r14\n\t"
+        "adc	r8, r5, r8\n\t"
+        "#  A[1] * B[5]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r10, r3, r10\n\t"
+        "adcs	r14, r4, r14\n\t"
+        "adc	r8, r5, r8\n\t"
+        "#  A[0] * B[6]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
         "str	r10, [sp, #24]\n\t"
         "#  A[0] * B[7]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
@@ -29809,30 +29802,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r8, r4, r8\n\t"
         "adc	r9, r5, r9\n\t"
         "str	r14, [sp, #28]\n\t"
-        "#  A[1] * B[7]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[1]\n\t"
+        "ldr	r7, [%[b], #4]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, #0\n\t"
-        "#  A[2] * B[6]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r8, r3, r8\n\t"
-        "adcs	r9, r4, r9\n\t"
-        "adc	r10, r5, r10\n\t"
-        "#  A[3] * B[5]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
-        "umull	r3, r4, r6, r7\n\t"
-        "adds	r8, r3, r8\n\t"
-        "adcs	r9, r4, r9\n\t"
-        "adc	r10, r5, r10\n\t"
-        "#  A[4] * B[4]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[6] * B[2]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
+        "ldr	r7, [%[b], #8]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -29844,16 +29822,30 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[6] * B[2]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #8]\n\t"
+        "#  A[4] * B[4]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[7] * B[1]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #4]\n\t"
+        "#  A[3] * B[5]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r8, r3, r8\n\t"
+        "adcs	r9, r4, r9\n\t"
+        "adc	r10, r5, r10\n\t"
+        "#  A[2] * B[6]\n\t"
+        "ldr	r6, [%[a], #8]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
+        "umull	r3, r4, r6, r7\n\t"
+        "adds	r8, r3, r8\n\t"
+        "adcs	r9, r4, r9\n\t"
+        "adc	r10, r5, r10\n\t"
+        "#  A[1] * B[7]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -29861,7 +29853,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "str	r8, [sp, #32]\n\t"
         "#  A[2] * B[7]\n\t"
         "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
         "adcs	r10, r4, r10\n\t"
@@ -29902,16 +29893,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, r14\n\t"
         "str	r9, [sp, #36]\n\t"
-        "#  A[3] * B[7]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[3]\n\t"
+        "ldr	r7, [%[b], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, #0\n\t"
-        "#  A[4] * B[6]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #24]\n\t"
+        "#  A[6] * B[4]\n\t"
+        "ldr	r6, [%[a], #24]\n\t"
+        "ldr	r7, [%[b], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
@@ -29923,16 +29913,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
-        "#  A[6] * B[4]\n\t"
-        "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #16]\n\t"
+        "#  A[4] * B[6]\n\t"
+        "ldr	r6, [%[a], #16]\n\t"
+        "ldr	r7, [%[b], #24]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
-        "#  A[7] * B[3]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #12]\n\t"
+        "#  A[3] * B[7]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
@@ -29940,7 +29930,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "str	r10, [sp, #40]\n\t"
         "#  A[4] * B[7]\n\t"
         "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
         "adcs	r8, r4, r8\n\t"
@@ -29967,9 +29956,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r8, r4, r8\n\t"
         "adc	r9, r5, r9\n\t"
         "str	r14, [sp, #44]\n\t"
-        "#  A[5] * B[7]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
+        "#  A[7] * B[5]\n\t"
+        "ldr	r7, [%[b], #20]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -29981,16 +29969,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[7] * B[5]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
-        "ldr	r7, [%[b], #20]\n\t"
+        "#  A[5] * B[7]\n\t"
+        "ldr	r6, [%[a], #20]\n\t"
+        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
         "#  A[6] * B[7]\n\t"
         "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
         "adcs	r10, r4, r10\n\t"
@@ -30003,7 +29990,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, r14\n\t"
         "#  A[7] * B[7]\n\t"
-        "ldr	r6, [%[a], #28]\n\t"
         "ldr	r7, [%[b], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
@@ -30021,16 +30007,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "ldr	r9, [sp, #20]\n\t"
         "ldr	r10, [sp, #24]\n\t"
         "ldr	r14, [sp, #28]\n\t"
-        "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
+        "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
         "#    - a[0] << 224\n\t"
-        "#   + (a[0]-a[1] * 2) << (6 * 32)\n\t"
+        "#   + (a[0..1] * 2) << (6 * 32)\n\t"
         "adds	r10, r10, r4\n\t"
         "adc	r14, r14, r5\n\t"
         "adds	r10, r10, r4\n\t"
         "adc	r14, r14, r5\n\t"
         "#   - a[0] << (7 * 32)\n\t"
         "sub	r14, r14, r4\n\t"
-        "#   + a[0]-a[4] << (3 * 32)\n\t"
+        "#   + a[0..4] << (3 * 32)\n\t"
         "mov	%[a], r7\n\t"
         "mov	%[b], r8\n\t"
         "adds	r7, r7, r4\n\t"
@@ -30038,9 +30024,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "adcs	r9, r9, r6\n\t"
         "adcs	r10, r10, %[a]\n\t"
         "adc	r14, r14, %[b]\n\t"
-        "str	r4, [sp, #0]\n\t"
-        "str	r5, [sp, #4]\n\t"
-        "str	r6, [sp, #8]\n\t"
         "str	r7, [sp, #12]\n\t"
         "str	r8, [sp, #16]\n\t"
         "str	r9, [sp, #20]\n\t"
@@ -30156,38 +30139,28 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
         "sbcs	r5, r5, r10\n\t"
         "sbcs	r6, r6, r14\n\t"
         "sbc	r7, r7, #0\n\t"
-        "str	r3, [sp, #44]\n\t"
-        "str	r4, [sp, #48]\n\t"
-        "str	r5, [sp, #52]\n\t"
-        "str	r6, [sp, #56]\n\t"
-        "str	r7, [sp, #60]\n\t"
         "# mask m and sub from result if overflow\n\t"
         "sub	%[b], %[a], %[b]\n\t"
         "and	%[a], %[b], #1\n\t"
-        "ldr	r3, [sp, #32]\n\t"
-        "ldr	r4, [sp, #36]\n\t"
-        "ldr	r5, [sp, #40]\n\t"
-        "ldr	r6, [sp, #44]\n\t"
-        "ldr	r7, [sp, #48]\n\t"
-        "ldr	r8, [sp, #52]\n\t"
-        "ldr	r9, [sp, #56]\n\t"
-        "ldr	r10, [sp, #60]\n\t"
-        "subs	r3, r3, %[b]\n\t"
-        "sbcs	r4, r4, %[b]\n\t"
-        "sbcs	r5, r5, %[b]\n\t"
-        "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, #0\n\t"
-        "sbcs	r8, r8, #0\n\t"
-        "sbcs	r9, r9, %[a]\n\t"
-        "sbc	r10, r10, %[b]\n\t"
-        "str	r3, [%[r], #0]\n\t"
-        "str	r4, [%[r], #4]\n\t"
-        "str	r5, [%[r], #8]\n\t"
-        "str	r6, [%[r], #12]\n\t"
-        "str	r7, [%[r], #16]\n\t"
-        "str	r8, [%[r], #20]\n\t"
-        "str	r9, [%[r], #24]\n\t"
-        "str	r10, [%[r], #28]\n\t"
+        "ldr	r8, [sp, #32]\n\t"
+        "ldr	r9, [sp, #36]\n\t"
+        "ldr	r10, [sp, #40]\n\t"
+        "subs	r8, r8, %[b]\n\t"
+        "sbcs	r9, r9, %[b]\n\t"
+        "sbcs	r10, r10, %[b]\n\t"
+        "sbcs	r3, r3, #0\n\t"
+        "sbcs	r4, r4, #0\n\t"
+        "sbcs	r5, r5, #0\n\t"
+        "sbcs	r6, r6, %[a]\n\t"
+        "sbc	r7, r7, %[b]\n\t"
+        "str	r8, [%[r], #0]\n\t"
+        "str	r9, [%[r], #4]\n\t"
+        "str	r10, [%[r], #8]\n\t"
+        "str	r3, [%[r], #12]\n\t"
+        "str	r4, [%[r], #16]\n\t"
+        "str	r5, [%[r], #20]\n\t"
+        "str	r6, [%[r], #24]\n\t"
+        "str	r7, [%[r], #28]\n\t"
         "add	sp, sp, #68\n\t"
         : [a] "+r" (a), [b] "+r" (b)
         : [r] "r" (r)
@@ -30217,14 +30190,12 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "umull	r9, r10, r6, r7\n\t"
         "str	r9, [sp, #4]\n\t"
         "#  A[0] * A[2]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[a], #8]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adc	r14, r4, #0\n\t"
         "str	r10, [sp, #8]\n\t"
         "#  A[0] * A[3]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[a], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
@@ -30237,22 +30208,20 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adcs	r8, r4, r8\n\t"
         "adc	r9, r5, #0\n\t"
         "str	r14, [sp, #12]\n\t"
-        "#  A[0] * A[4]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[a], #16]\n\t"
+        "#  A[1] * A[3]\n\t"
+        "ldr	r7, [%[a], #12]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adc	r9, r4, r9\n\t"
-        "#  A[1] * A[3]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[a], #12]\n\t"
+        "#  A[0] * A[4]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[a], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, #0\n\t"
         "str	r8, [sp, #16]\n\t"
         "#  A[0] * A[5]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[a], #20]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
@@ -30272,9 +30241,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, r14\n\t"
         "str	r9, [sp, #20]\n\t"
-        "#  A[0] * A[6]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
-        "ldr	r7, [%[a], #24]\n\t"
+        "#  A[2] * A[4]\n\t"
+        "ldr	r7, [%[a], #16]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
@@ -30286,16 +30254,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
-        "#  A[2] * A[4]\n\t"
-        "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[a], #16]\n\t"
+        "#  A[0] * A[6]\n\t"
+        "ldr	r6, [%[a], #0]\n\t"
+        "ldr	r7, [%[a], #24]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, r8\n\t"
         "str	r10, [sp, #24]\n\t"
         "#  A[0] * A[7]\n\t"
-        "ldr	r6, [%[a], #0]\n\t"
         "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
@@ -30323,9 +30290,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adcs	r8, r4, r8\n\t"
         "adc	r9, r5, r9\n\t"
         "str	r14, [sp, #28]\n\t"
-        "#  A[1] * A[7]\n\t"
-        "ldr	r6, [%[a], #4]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
+        "#  A[3] * A[5]\n\t"
+        "ldr	r7, [%[a], #20]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -30337,9 +30303,9 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
         "adc	r10, r5, r10\n\t"
-        "#  A[3] * A[5]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[a], #20]\n\t"
+        "#  A[1] * A[7]\n\t"
+        "ldr	r6, [%[a], #4]\n\t"
+        "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
         "adcs	r9, r4, r9\n\t"
@@ -30347,7 +30313,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "str	r8, [sp, #32]\n\t"
         "#  A[2] * A[7]\n\t"
         "ldr	r6, [%[a], #8]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
         "adcs	r10, r4, r10\n\t"
@@ -30367,16 +30332,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adcs	r10, r4, r10\n\t"
         "adc	r14, r5, r14\n\t"
         "str	r9, [sp, #36]\n\t"
-        "#  A[3] * A[7]\n\t"
-        "ldr	r6, [%[a], #12]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
+        "#  A[4] * A[6]\n\t"
+        "ldr	r7, [%[a], #24]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
         "adc	r8, r5, #0\n\t"
-        "#  A[4] * A[6]\n\t"
-        "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[a], #24]\n\t"
+        "#  A[3] * A[7]\n\t"
+        "ldr	r6, [%[a], #12]\n\t"
+        "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r10, r3, r10\n\t"
         "adcs	r14, r4, r14\n\t"
@@ -30384,7 +30348,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "str	r10, [sp, #40]\n\t"
         "#  A[4] * A[7]\n\t"
         "ldr	r6, [%[a], #16]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r14, r3, r14\n\t"
         "adcs	r8, r4, r8\n\t"
@@ -30398,7 +30361,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adc	r9, r5, r9\n\t"
         "str	r14, [sp, #44]\n\t"
         "#  A[5] * A[7]\n\t"
-        "ldr	r6, [%[a], #20]\n\t"
         "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r8, r3, r8\n\t"
@@ -30407,7 +30369,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "str	r8, [sp, #48]\n\t"
         "#  A[6] * A[7]\n\t"
         "ldr	r6, [%[a], #24]\n\t"
-        "ldr	r7, [%[a], #28]\n\t"
         "umull	r3, r4, r6, r7\n\t"
         "adds	r9, r3, r9\n\t"
         "adc	r10, r4, r10\n\t"
@@ -30537,16 +30498,16 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "ldr	r9, [sp, #20]\n\t"
         "ldr	r10, [sp, #24]\n\t"
         "ldr	r14, [sp, #28]\n\t"
-        "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
+        "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
         "#    - a[0] << 224\n\t"
-        "#   + (a[0]-a[1] * 2) << (6 * 32)\n\t"
+        "#   + (a[0..1] * 2) << (6 * 32)\n\t"
         "adds	r10, r10, r4\n\t"
         "adc	r14, r14, r5\n\t"
         "adds	r10, r10, r4\n\t"
         "adc	r14, r14, r5\n\t"
         "#   - a[0] << (7 * 32)\n\t"
         "sub	r14, r14, r4\n\t"
-        "#   + a[0]-a[4] << (3 * 32)\n\t"
+        "#   + a[0..4] << (3 * 32)\n\t"
         "mov	%[a], r7\n\t"
         "mov	r12, r8\n\t"
         "adds	r7, r7, r4\n\t"
@@ -30554,9 +30515,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "adcs	r9, r9, r6\n\t"
         "adcs	r10, r10, %[a]\n\t"
         "adc	r14, r14, r12\n\t"
-        "str	r4, [sp, #0]\n\t"
-        "str	r5, [sp, #4]\n\t"
-        "str	r6, [sp, #8]\n\t"
         "str	r7, [sp, #12]\n\t"
         "str	r8, [sp, #16]\n\t"
         "str	r9, [sp, #20]\n\t"
@@ -30672,38 +30630,28 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
         "sbcs	r5, r5, r10\n\t"
         "sbcs	r6, r6, r14\n\t"
         "sbc	r7, r7, #0\n\t"
-        "str	r3, [sp, #44]\n\t"
-        "str	r4, [sp, #48]\n\t"
-        "str	r5, [sp, #52]\n\t"
-        "str	r6, [sp, #56]\n\t"
-        "str	r7, [sp, #60]\n\t"
         "# mask m and sub from result if overflow\n\t"
         "sub	r12, %[a], r12\n\t"
         "and	%[a], r12, #1\n\t"
-        "ldr	r3, [sp, #32]\n\t"
-        "ldr	r4, [sp, #36]\n\t"
-        "ldr	r5, [sp, #40]\n\t"
-        "ldr	r6, [sp, #44]\n\t"
-        "ldr	r7, [sp, #48]\n\t"
-        "ldr	r8, [sp, #52]\n\t"
-        "ldr	r9, [sp, #56]\n\t"
-        "ldr	r10, [sp, #60]\n\t"
-        "subs	r3, r3, r12\n\t"
-        "sbcs	r4, r4, r12\n\t"
-        "sbcs	r5, r5, r12\n\t"
-        "sbcs	r6, r6, #0\n\t"
-        "sbcs	r7, r7, #0\n\t"
-        "sbcs	r8, r8, #0\n\t"
-        "sbcs	r9, r9, %[a]\n\t"
-        "sbc	r10, r10, r12\n\t"
-        "str	r3, [%[r], #0]\n\t"
-        "str	r4, [%[r], #4]\n\t"
-        "str	r5, [%[r], #8]\n\t"
-        "str	r6, [%[r], #12]\n\t"
-        "str	r7, [%[r], #16]\n\t"
-        "str	r8, [%[r], #20]\n\t"
-        "str	r9, [%[r], #24]\n\t"
-        "str	r10, [%[r], #28]\n\t"
+        "ldr	r8, [sp, #32]\n\t"
+        "ldr	r9, [sp, #36]\n\t"
+        "ldr	r10, [sp, #40]\n\t"
+        "subs	r8, r8, r12\n\t"
+        "sbcs	r9, r9, r12\n\t"
+        "sbcs	r10, r10, r12\n\t"
+        "sbcs	r3, r3, #0\n\t"
+        "sbcs	r4, r4, #0\n\t"
+        "sbcs	r5, r5, #0\n\t"
+        "sbcs	r6, r6, %[a]\n\t"
+        "sbc	r7, r7, r12\n\t"
+        "str	r8, [%[r], #0]\n\t"
+        "str	r9, [%[r], #4]\n\t"
+        "str	r10, [%[r], #8]\n\t"
+        "str	r3, [%[r], #12]\n\t"
+        "str	r4, [%[r], #16]\n\t"
+        "str	r5, [%[r], #20]\n\t"
+        "str	r6, [%[r], #24]\n\t"
+        "str	r7, [%[r], #28]\n\t"
         "add	sp, sp, #68\n\t"
         : [a] "+r" (a)
         : [r] "r" (r)