SP ARM32 ASM: Improve performance of P-256 mont mul/sqr

This commit is contained in:
Sean Parkinson
2022-06-15 16:39:13 +10:00
parent af4fff80db
commit e073500e8e

View File

@ -29560,7 +29560,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"umull r8, r9, r6, r7\n\t"
"str r8, [sp, #0]\n\t"
"# A[0] * B[1]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
@ -29573,9 +29572,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, #0\n\t"
"str r9, [sp, #4]\n\t"
"# A[0] * B[2]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #8]\n\t"
"# A[2] * B[0]\n\t"
"ldr r6, [%[a], #8]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adc r14, r4, r14\n\t"
@ -29586,16 +29584,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t"
"# A[2] * B[0]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #0]\n\t"
"# A[0] * B[2]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #8]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"str r10, [sp, #8]\n\t"
"# A[0] * B[3]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
@ -29623,16 +29620,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t"
"str r14, [sp, #12]\n\t"
"# A[0] * B[4]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #16]\n\t"
"# A[4] * B[0]\n\t"
"ldr r6, [%[a], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t"
"# A[1] * B[3]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #12]\n\t"
"# A[3] * B[1]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -29644,23 +29640,22 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[3] * B[1]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #4]\n\t"
"# A[1] * B[3]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[4] * B[0]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #0]\n\t"
"# A[0] * B[4]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"str r8, [sp, #16]\n\t"
"# A[0] * B[5]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
@ -29702,30 +29697,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t"
"str r9, [sp, #20]\n\t"
"# A[0] * B[6]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #24]\n\t"
"# A[6] * B[0]\n\t"
"ldr r6, [%[a], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t"
"# A[1] * B[5]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[2] * B[4]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[3] * B[3]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #12]\n\t"
"# A[5] * B[1]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
@ -29737,23 +29717,36 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[5] * B[1]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #4]\n\t"
"# A[3] * B[3]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[6] * B[0]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #0]\n\t"
"# A[2] * B[4]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[1] * B[5]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[0] * B[6]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"str r10, [sp, #24]\n\t"
"# A[0] * B[7]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
@ -29809,30 +29802,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t"
"str r14, [sp, #28]\n\t"
"# A[1] * B[7]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #28]\n\t"
"# A[7] * B[1]\n\t"
"ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t"
"# A[2] * B[6]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[3] * B[5]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[4] * B[4]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #16]\n\t"
"# A[6] * B[2]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #8]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -29844,16 +29822,30 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[6] * B[2]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #8]\n\t"
"# A[4] * B[4]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[7] * B[1]\n\t"
"ldr r6, [%[a], #28]\n\t"
"ldr r7, [%[b], #4]\n\t"
"# A[3] * B[5]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[2] * B[6]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[1] * B[7]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -29861,7 +29853,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #32]\n\t"
"# A[2] * B[7]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t"
@ -29902,16 +29893,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t"
"str r9, [sp, #36]\n\t"
"# A[3] * B[7]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #28]\n\t"
"# A[7] * B[3]\n\t"
"ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t"
"# A[4] * B[6]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #24]\n\t"
"# A[6] * B[4]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
@ -29923,16 +29913,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[6] * B[4]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #16]\n\t"
"# A[4] * B[6]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[7] * B[3]\n\t"
"ldr r6, [%[a], #28]\n\t"
"ldr r7, [%[b], #12]\n\t"
"# A[3] * B[7]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
@ -29940,7 +29930,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"str r10, [sp, #40]\n\t"
"# A[4] * B[7]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
"adcs r8, r4, r8\n\t"
@ -29967,9 +29956,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t"
"str r14, [sp, #44]\n\t"
"# A[5] * B[7]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #28]\n\t"
"# A[7] * B[5]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -29981,16 +29969,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[7] * B[5]\n\t"
"ldr r6, [%[a], #28]\n\t"
"ldr r7, [%[b], #20]\n\t"
"# A[5] * B[7]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[6] * B[7]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t"
@ -30003,7 +29990,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t"
"# A[7] * B[7]\n\t"
"ldr r6, [%[a], #28]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
@ -30021,16 +30007,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"ldr r9, [sp, #20]\n\t"
"ldr r10, [sp, #24]\n\t"
"ldr r14, [sp, #28]\n\t"
"# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
"# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
"# - a[0] << 224\n\t"
"# + (a[0]-a[1] * 2) << (6 * 32)\n\t"
"# + (a[0..1] * 2) << (6 * 32)\n\t"
"adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t"
"adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t"
"# - a[0] << (7 * 32)\n\t"
"sub r14, r14, r4\n\t"
"# + a[0]-a[4] << (3 * 32)\n\t"
"# + a[0..4] << (3 * 32)\n\t"
"mov %[a], r7\n\t"
"mov %[b], r8\n\t"
"adds r7, r7, r4\n\t"
@ -30038,9 +30024,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r9, r9, r6\n\t"
"adcs r10, r10, %[a]\n\t"
"adc r14, r14, %[b]\n\t"
"str r4, [sp, #0]\n\t"
"str r5, [sp, #4]\n\t"
"str r6, [sp, #8]\n\t"
"str r7, [sp, #12]\n\t"
"str r8, [sp, #16]\n\t"
"str r9, [sp, #20]\n\t"
@ -30156,38 +30139,28 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"sbcs r5, r5, r10\n\t"
"sbcs r6, r6, r14\n\t"
"sbc r7, r7, #0\n\t"
"str r3, [sp, #44]\n\t"
"str r4, [sp, #48]\n\t"
"str r5, [sp, #52]\n\t"
"str r6, [sp, #56]\n\t"
"str r7, [sp, #60]\n\t"
"# mask m and sub from result if overflow\n\t"
"sub %[b], %[a], %[b]\n\t"
"and %[a], %[b], #1\n\t"
"ldr r3, [sp, #32]\n\t"
"ldr r4, [sp, #36]\n\t"
"ldr r5, [sp, #40]\n\t"
"ldr r6, [sp, #44]\n\t"
"ldr r7, [sp, #48]\n\t"
"ldr r8, [sp, #52]\n\t"
"ldr r9, [sp, #56]\n\t"
"ldr r10, [sp, #60]\n\t"
"subs r3, r3, %[b]\n\t"
"sbcs r4, r4, %[b]\n\t"
"sbcs r5, r5, %[b]\n\t"
"sbcs r6, r6, #0\n\t"
"sbcs r7, r7, #0\n\t"
"sbcs r8, r8, #0\n\t"
"sbcs r9, r9, %[a]\n\t"
"sbc r10, r10, %[b]\n\t"
"str r3, [%[r], #0]\n\t"
"str r4, [%[r], #4]\n\t"
"str r5, [%[r], #8]\n\t"
"str r6, [%[r], #12]\n\t"
"str r7, [%[r], #16]\n\t"
"str r8, [%[r], #20]\n\t"
"str r9, [%[r], #24]\n\t"
"str r10, [%[r], #28]\n\t"
"ldr r8, [sp, #32]\n\t"
"ldr r9, [sp, #36]\n\t"
"ldr r10, [sp, #40]\n\t"
"subs r8, r8, %[b]\n\t"
"sbcs r9, r9, %[b]\n\t"
"sbcs r10, r10, %[b]\n\t"
"sbcs r3, r3, #0\n\t"
"sbcs r4, r4, #0\n\t"
"sbcs r5, r5, #0\n\t"
"sbcs r6, r6, %[a]\n\t"
"sbc r7, r7, %[b]\n\t"
"str r8, [%[r], #0]\n\t"
"str r9, [%[r], #4]\n\t"
"str r10, [%[r], #8]\n\t"
"str r3, [%[r], #12]\n\t"
"str r4, [%[r], #16]\n\t"
"str r5, [%[r], #20]\n\t"
"str r6, [%[r], #24]\n\t"
"str r7, [%[r], #28]\n\t"
"add sp, sp, #68\n\t"
: [a] "+r" (a), [b] "+r" (b)
: [r] "r" (r)
@ -30217,14 +30190,12 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"umull r9, r10, r6, r7\n\t"
"str r9, [sp, #4]\n\t"
"# A[0] * A[2]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #8]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adc r14, r4, #0\n\t"
"str r10, [sp, #8]\n\t"
"# A[0] * A[3]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
@ -30237,22 +30208,20 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t"
"adc r9, r5, #0\n\t"
"str r14, [sp, #12]\n\t"
"# A[0] * A[4]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #16]\n\t"
"# A[1] * A[3]\n\t"
"ldr r7, [%[a], #12]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adc r9, r4, r9\n\t"
"# A[1] * A[3]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[a], #12]\n\t"
"# A[0] * A[4]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t"
"str r8, [sp, #16]\n\t"
"# A[0] * A[5]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
@ -30272,9 +30241,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t"
"str r9, [sp, #20]\n\t"
"# A[0] * A[6]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #24]\n\t"
"# A[2] * A[4]\n\t"
"ldr r7, [%[a], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
@ -30286,16 +30254,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[2] * A[4]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[a], #16]\n\t"
"# A[0] * A[6]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"str r10, [sp, #24]\n\t"
"# A[0] * A[7]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
@ -30323,9 +30290,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t"
"str r14, [sp, #28]\n\t"
"# A[1] * A[7]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[a], #28]\n\t"
"# A[3] * A[5]\n\t"
"ldr r7, [%[a], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -30337,9 +30303,9 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[3] * A[5]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[a], #20]\n\t"
"# A[1] * A[7]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
@ -30347,7 +30313,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #32]\n\t"
"# A[2] * A[7]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t"
@ -30367,16 +30332,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t"
"str r9, [sp, #36]\n\t"
"# A[3] * A[7]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[a], #28]\n\t"
"# A[4] * A[6]\n\t"
"ldr r7, [%[a], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t"
"# A[4] * A[6]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[a], #24]\n\t"
"# A[3] * A[7]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
@ -30384,7 +30348,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r10, [sp, #40]\n\t"
"# A[4] * A[7]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t"
"adcs r8, r4, r8\n\t"
@ -30398,7 +30361,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adc r9, r5, r9\n\t"
"str r14, [sp, #44]\n\t"
"# A[5] * A[7]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
@ -30407,7 +30369,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #48]\n\t"
"# A[6] * A[7]\n\t"
"ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t"
"adc r10, r4, r10\n\t"
@ -30537,16 +30498,16 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"ldr r9, [sp, #20]\n\t"
"ldr r10, [sp, #24]\n\t"
"ldr r14, [sp, #28]\n\t"
"# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t"
"# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
"# - a[0] << 224\n\t"
"# + (a[0]-a[1] * 2) << (6 * 32)\n\t"
"# + (a[0..1] * 2) << (6 * 32)\n\t"
"adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t"
"adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t"
"# - a[0] << (7 * 32)\n\t"
"sub r14, r14, r4\n\t"
"# + a[0]-a[4] << (3 * 32)\n\t"
"# + a[0..4] << (3 * 32)\n\t"
"mov %[a], r7\n\t"
"mov r12, r8\n\t"
"adds r7, r7, r4\n\t"
@ -30554,9 +30515,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r9, r9, r6\n\t"
"adcs r10, r10, %[a]\n\t"
"adc r14, r14, r12\n\t"
"str r4, [sp, #0]\n\t"
"str r5, [sp, #4]\n\t"
"str r6, [sp, #8]\n\t"
"str r7, [sp, #12]\n\t"
"str r8, [sp, #16]\n\t"
"str r9, [sp, #20]\n\t"
@ -30672,38 +30630,28 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"sbcs r5, r5, r10\n\t"
"sbcs r6, r6, r14\n\t"
"sbc r7, r7, #0\n\t"
"str r3, [sp, #44]\n\t"
"str r4, [sp, #48]\n\t"
"str r5, [sp, #52]\n\t"
"str r6, [sp, #56]\n\t"
"str r7, [sp, #60]\n\t"
"# mask m and sub from result if overflow\n\t"
"sub r12, %[a], r12\n\t"
"and %[a], r12, #1\n\t"
"ldr r3, [sp, #32]\n\t"
"ldr r4, [sp, #36]\n\t"
"ldr r5, [sp, #40]\n\t"
"ldr r6, [sp, #44]\n\t"
"ldr r7, [sp, #48]\n\t"
"ldr r8, [sp, #52]\n\t"
"ldr r9, [sp, #56]\n\t"
"ldr r10, [sp, #60]\n\t"
"subs r3, r3, r12\n\t"
"sbcs r4, r4, r12\n\t"
"sbcs r5, r5, r12\n\t"
"sbcs r6, r6, #0\n\t"
"sbcs r7, r7, #0\n\t"
"sbcs r8, r8, #0\n\t"
"sbcs r9, r9, %[a]\n\t"
"sbc r10, r10, r12\n\t"
"str r3, [%[r], #0]\n\t"
"str r4, [%[r], #4]\n\t"
"str r5, [%[r], #8]\n\t"
"str r6, [%[r], #12]\n\t"
"str r7, [%[r], #16]\n\t"
"str r8, [%[r], #20]\n\t"
"str r9, [%[r], #24]\n\t"
"str r10, [%[r], #28]\n\t"
"ldr r8, [sp, #32]\n\t"
"ldr r9, [sp, #36]\n\t"
"ldr r10, [sp, #40]\n\t"
"subs r8, r8, r12\n\t"
"sbcs r9, r9, r12\n\t"
"sbcs r10, r10, r12\n\t"
"sbcs r3, r3, #0\n\t"
"sbcs r4, r4, #0\n\t"
"sbcs r5, r5, #0\n\t"
"sbcs r6, r6, %[a]\n\t"
"sbc r7, r7, r12\n\t"
"str r8, [%[r], #0]\n\t"
"str r9, [%[r], #4]\n\t"
"str r10, [%[r], #8]\n\t"
"str r3, [%[r], #12]\n\t"
"str r4, [%[r], #16]\n\t"
"str r5, [%[r], #20]\n\t"
"str r6, [%[r], #24]\n\t"
"str r7, [%[r], #28]\n\t"
"add sp, sp, #68\n\t"
: [a] "+r" (a)
: [r] "r" (r)