SP ARM32 ASM: Improve performance of P-256 Montgomery multiplication/squaring (mont mul/sqr)

This commit is contained in:
Sean Parkinson
2022-06-15 16:39:13 +10:00
parent af4fff80db
commit e073500e8e

View File

@ -29560,7 +29560,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"umull r8, r9, r6, r7\n\t" "umull r8, r9, r6, r7\n\t"
"str r8, [sp, #0]\n\t" "str r8, [sp, #0]\n\t"
"# A[0] * B[1]\n\t" "# A[0] * B[1]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #4]\n\t" "ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
@ -29573,9 +29572,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, #0\n\t" "adc r14, r5, #0\n\t"
"str r9, [sp, #4]\n\t" "str r9, [sp, #4]\n\t"
"# A[0] * B[2]\n\t" "# A[2] * B[0]\n\t"
"ldr r6, [%[a], #0]\n\t" "ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #8]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adc r14, r4, r14\n\t" "adc r14, r4, r14\n\t"
@ -29586,16 +29584,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t" "adc r8, r5, #0\n\t"
"# A[2] * B[0]\n\t" "# A[0] * B[2]\n\t"
"ldr r6, [%[a], #8]\n\t" "ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #0]\n\t" "ldr r7, [%[b], #8]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"str r10, [sp, #8]\n\t" "str r10, [sp, #8]\n\t"
"# A[0] * B[3]\n\t" "# A[0] * B[3]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #12]\n\t" "ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
@ -29623,16 +29620,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t" "adc r9, r5, r9\n\t"
"str r14, [sp, #12]\n\t" "str r14, [sp, #12]\n\t"
"# A[0] * B[4]\n\t" "# A[4] * B[0]\n\t"
"ldr r6, [%[a], #0]\n\t" "ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t" "adc r10, r5, #0\n\t"
"# A[1] * B[3]\n\t" "# A[3] * B[1]\n\t"
"ldr r6, [%[a], #4]\n\t" "ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #12]\n\t" "ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -29644,23 +29640,22 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[3] * B[1]\n\t" "# A[1] * B[3]\n\t"
"ldr r6, [%[a], #12]\n\t" "ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #4]\n\t" "ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[4] * B[0]\n\t" "# A[0] * B[4]\n\t"
"ldr r6, [%[a], #16]\n\t" "ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #0]\n\t" "ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"str r8, [sp, #16]\n\t" "str r8, [sp, #16]\n\t"
"# A[0] * B[5]\n\t" "# A[0] * B[5]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #20]\n\t" "ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
@ -29702,30 +29697,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t" "adc r14, r5, r14\n\t"
"str r9, [sp, #20]\n\t" "str r9, [sp, #20]\n\t"
"# A[0] * B[6]\n\t" "# A[6] * B[0]\n\t"
"ldr r6, [%[a], #0]\n\t" "ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t" "adc r8, r5, #0\n\t"
"# A[1] * B[5]\n\t" "# A[5] * B[1]\n\t"
"ldr r6, [%[a], #4]\n\t" "ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #20]\n\t" "ldr r7, [%[b], #4]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[2] * B[4]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[3] * B[3]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
@ -29737,23 +29717,36 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"# A[5] * B[1]\n\t" "# A[3] * B[3]\n\t"
"ldr r6, [%[a], #20]\n\t" "ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #4]\n\t" "ldr r7, [%[b], #12]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"# A[6] * B[0]\n\t" "# A[2] * B[4]\n\t"
"ldr r6, [%[a], #24]\n\t" "ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #0]\n\t" "ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[1] * B[5]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t"
"# A[0] * B[6]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"str r10, [sp, #24]\n\t" "str r10, [sp, #24]\n\t"
"# A[0] * B[7]\n\t" "# A[0] * B[7]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[b], #28]\n\t" "ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
@ -29809,30 +29802,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t" "adc r9, r5, r9\n\t"
"str r14, [sp, #28]\n\t" "str r14, [sp, #28]\n\t"
"# A[1] * B[7]\n\t" "# A[7] * B[1]\n\t"
"ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #4]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t" "adc r10, r5, #0\n\t"
"# A[2] * B[6]\n\t" "# A[6] * B[2]\n\t"
"ldr r6, [%[a], #8]\n\t" "ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #24]\n\t" "ldr r7, [%[b], #8]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[3] * B[5]\n\t"
"ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[4] * B[4]\n\t"
"ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -29844,16 +29822,30 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[6] * B[2]\n\t" "# A[4] * B[4]\n\t"
"ldr r6, [%[a], #24]\n\t" "ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #8]\n\t" "ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[7] * B[1]\n\t" "# A[3] * B[5]\n\t"
"ldr r6, [%[a], #28]\n\t" "ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #4]\n\t" "ldr r7, [%[b], #20]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[2] * B[6]\n\t"
"ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t"
"# A[1] * B[7]\n\t"
"ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -29861,7 +29853,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #32]\n\t" "str r8, [sp, #32]\n\t"
"# A[2] * B[7]\n\t" "# A[2] * B[7]\n\t"
"ldr r6, [%[a], #8]\n\t" "ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
@ -29902,16 +29893,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t" "adc r14, r5, r14\n\t"
"str r9, [sp, #36]\n\t" "str r9, [sp, #36]\n\t"
"# A[3] * B[7]\n\t" "# A[7] * B[3]\n\t"
"ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #12]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t" "adc r8, r5, #0\n\t"
"# A[4] * B[6]\n\t" "# A[6] * B[4]\n\t"
"ldr r6, [%[a], #16]\n\t" "ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #24]\n\t" "ldr r7, [%[b], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
@ -29923,16 +29913,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"# A[6] * B[4]\n\t" "# A[4] * B[6]\n\t"
"ldr r6, [%[a], #24]\n\t" "ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #16]\n\t" "ldr r7, [%[b], #24]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"# A[7] * B[3]\n\t" "# A[3] * B[7]\n\t"
"ldr r6, [%[a], #28]\n\t" "ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[b], #12]\n\t" "ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
@ -29940,7 +29930,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"str r10, [sp, #40]\n\t" "str r10, [sp, #40]\n\t"
"# A[4] * B[7]\n\t" "# A[4] * B[7]\n\t"
"ldr r6, [%[a], #16]\n\t" "ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
@ -29967,9 +29956,8 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t" "adc r9, r5, r9\n\t"
"str r14, [sp, #44]\n\t" "str r14, [sp, #44]\n\t"
"# A[5] * B[7]\n\t" "# A[7] * B[5]\n\t"
"ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #20]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -29981,16 +29969,15 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[7] * B[5]\n\t" "# A[5] * B[7]\n\t"
"ldr r6, [%[a], #28]\n\t" "ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[b], #20]\n\t" "ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[6] * B[7]\n\t" "# A[6] * B[7]\n\t"
"ldr r6, [%[a], #24]\n\t" "ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
@ -30003,7 +29990,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t" "adc r14, r5, r14\n\t"
"# A[7] * B[7]\n\t" "# A[7] * B[7]\n\t"
"ldr r6, [%[a], #28]\n\t"
"ldr r7, [%[b], #28]\n\t" "ldr r7, [%[b], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
@ -30021,16 +30007,16 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"ldr r9, [sp, #20]\n\t" "ldr r9, [sp, #20]\n\t"
"ldr r10, [sp, #24]\n\t" "ldr r10, [sp, #24]\n\t"
"ldr r14, [sp, #28]\n\t" "ldr r14, [sp, #28]\n\t"
"# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
"# - a[0] << 224\n\t" "# - a[0] << 224\n\t"
"# + (a[0]-a[1] * 2) << (6 * 32)\n\t" "# + (a[0..1] * 2) << (6 * 32)\n\t"
"adds r10, r10, r4\n\t" "adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t" "adc r14, r14, r5\n\t"
"adds r10, r10, r4\n\t" "adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t" "adc r14, r14, r5\n\t"
"# - a[0] << (7 * 32)\n\t" "# - a[0] << (7 * 32)\n\t"
"sub r14, r14, r4\n\t" "sub r14, r14, r4\n\t"
"# + a[0]-a[4] << (3 * 32)\n\t" "# + a[0..4] << (3 * 32)\n\t"
"mov %[a], r7\n\t" "mov %[a], r7\n\t"
"mov %[b], r8\n\t" "mov %[b], r8\n\t"
"adds r7, r7, r4\n\t" "adds r7, r7, r4\n\t"
@ -30038,9 +30024,6 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"adcs r9, r9, r6\n\t" "adcs r9, r9, r6\n\t"
"adcs r10, r10, %[a]\n\t" "adcs r10, r10, %[a]\n\t"
"adc r14, r14, %[b]\n\t" "adc r14, r14, %[b]\n\t"
"str r4, [sp, #0]\n\t"
"str r5, [sp, #4]\n\t"
"str r6, [sp, #8]\n\t"
"str r7, [sp, #12]\n\t" "str r7, [sp, #12]\n\t"
"str r8, [sp, #16]\n\t" "str r8, [sp, #16]\n\t"
"str r9, [sp, #20]\n\t" "str r9, [sp, #20]\n\t"
@ -30156,38 +30139,28 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const
"sbcs r5, r5, r10\n\t" "sbcs r5, r5, r10\n\t"
"sbcs r6, r6, r14\n\t" "sbcs r6, r6, r14\n\t"
"sbc r7, r7, #0\n\t" "sbc r7, r7, #0\n\t"
"str r3, [sp, #44]\n\t"
"str r4, [sp, #48]\n\t"
"str r5, [sp, #52]\n\t"
"str r6, [sp, #56]\n\t"
"str r7, [sp, #60]\n\t"
"# mask m and sub from result if overflow\n\t" "# mask m and sub from result if overflow\n\t"
"sub %[b], %[a], %[b]\n\t" "sub %[b], %[a], %[b]\n\t"
"and %[a], %[b], #1\n\t" "and %[a], %[b], #1\n\t"
"ldr r3, [sp, #32]\n\t" "ldr r8, [sp, #32]\n\t"
"ldr r4, [sp, #36]\n\t" "ldr r9, [sp, #36]\n\t"
"ldr r5, [sp, #40]\n\t" "ldr r10, [sp, #40]\n\t"
"ldr r6, [sp, #44]\n\t" "subs r8, r8, %[b]\n\t"
"ldr r7, [sp, #48]\n\t" "sbcs r9, r9, %[b]\n\t"
"ldr r8, [sp, #52]\n\t" "sbcs r10, r10, %[b]\n\t"
"ldr r9, [sp, #56]\n\t" "sbcs r3, r3, #0\n\t"
"ldr r10, [sp, #60]\n\t" "sbcs r4, r4, #0\n\t"
"subs r3, r3, %[b]\n\t" "sbcs r5, r5, #0\n\t"
"sbcs r4, r4, %[b]\n\t" "sbcs r6, r6, %[a]\n\t"
"sbcs r5, r5, %[b]\n\t" "sbc r7, r7, %[b]\n\t"
"sbcs r6, r6, #0\n\t" "str r8, [%[r], #0]\n\t"
"sbcs r7, r7, #0\n\t" "str r9, [%[r], #4]\n\t"
"sbcs r8, r8, #0\n\t" "str r10, [%[r], #8]\n\t"
"sbcs r9, r9, %[a]\n\t" "str r3, [%[r], #12]\n\t"
"sbc r10, r10, %[b]\n\t" "str r4, [%[r], #16]\n\t"
"str r3, [%[r], #0]\n\t" "str r5, [%[r], #20]\n\t"
"str r4, [%[r], #4]\n\t" "str r6, [%[r], #24]\n\t"
"str r5, [%[r], #8]\n\t" "str r7, [%[r], #28]\n\t"
"str r6, [%[r], #12]\n\t"
"str r7, [%[r], #16]\n\t"
"str r8, [%[r], #20]\n\t"
"str r9, [%[r], #24]\n\t"
"str r10, [%[r], #28]\n\t"
"add sp, sp, #68\n\t" "add sp, sp, #68\n\t"
: [a] "+r" (a), [b] "+r" (b) : [a] "+r" (a), [b] "+r" (b)
: [r] "r" (r) : [r] "r" (r)
@ -30217,14 +30190,12 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"umull r9, r10, r6, r7\n\t" "umull r9, r10, r6, r7\n\t"
"str r9, [sp, #4]\n\t" "str r9, [sp, #4]\n\t"
"# A[0] * A[2]\n\t" "# A[0] * A[2]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #8]\n\t" "ldr r7, [%[a], #8]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adc r14, r4, #0\n\t" "adc r14, r4, #0\n\t"
"str r10, [sp, #8]\n\t" "str r10, [sp, #8]\n\t"
"# A[0] * A[3]\n\t" "# A[0] * A[3]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #12]\n\t" "ldr r7, [%[a], #12]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
@ -30237,22 +30208,20 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
"adc r9, r5, #0\n\t" "adc r9, r5, #0\n\t"
"str r14, [sp, #12]\n\t" "str r14, [sp, #12]\n\t"
"# A[0] * A[4]\n\t" "# A[1] * A[3]\n\t"
"ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #12]\n\t"
"ldr r7, [%[a], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adc r9, r4, r9\n\t" "adc r9, r4, r9\n\t"
"# A[1] * A[3]\n\t" "# A[0] * A[4]\n\t"
"ldr r6, [%[a], #4]\n\t" "ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #12]\n\t" "ldr r7, [%[a], #16]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, #0\n\t" "adc r10, r5, #0\n\t"
"str r8, [sp, #16]\n\t" "str r8, [sp, #16]\n\t"
"# A[0] * A[5]\n\t" "# A[0] * A[5]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #20]\n\t" "ldr r7, [%[a], #20]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
@ -30272,9 +30241,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t" "adc r14, r5, r14\n\t"
"str r9, [sp, #20]\n\t" "str r9, [sp, #20]\n\t"
"# A[0] * A[6]\n\t" "# A[2] * A[4]\n\t"
"ldr r6, [%[a], #0]\n\t" "ldr r7, [%[a], #16]\n\t"
"ldr r7, [%[a], #24]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
@ -30286,16 +30254,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"# A[2] * A[4]\n\t" "# A[0] * A[6]\n\t"
"ldr r6, [%[a], #8]\n\t" "ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #16]\n\t" "ldr r7, [%[a], #24]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, r8\n\t" "adc r8, r5, r8\n\t"
"str r10, [sp, #24]\n\t" "str r10, [sp, #24]\n\t"
"# A[0] * A[7]\n\t" "# A[0] * A[7]\n\t"
"ldr r6, [%[a], #0]\n\t"
"ldr r7, [%[a], #28]\n\t" "ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
@ -30323,9 +30290,8 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
"adc r9, r5, r9\n\t" "adc r9, r5, r9\n\t"
"str r14, [sp, #28]\n\t" "str r14, [sp, #28]\n\t"
"# A[1] * A[7]\n\t" "# A[3] * A[5]\n\t"
"ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #20]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -30337,9 +30303,9 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
"adc r10, r5, r10\n\t" "adc r10, r5, r10\n\t"
"# A[3] * A[5]\n\t" "# A[1] * A[7]\n\t"
"ldr r6, [%[a], #12]\n\t" "ldr r6, [%[a], #4]\n\t"
"ldr r7, [%[a], #20]\n\t" "ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
"adcs r9, r4, r9\n\t" "adcs r9, r4, r9\n\t"
@ -30347,7 +30313,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #32]\n\t" "str r8, [sp, #32]\n\t"
"# A[2] * A[7]\n\t" "# A[2] * A[7]\n\t"
"ldr r6, [%[a], #8]\n\t" "ldr r6, [%[a], #8]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
@ -30367,16 +30332,15 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r10, r4, r10\n\t" "adcs r10, r4, r10\n\t"
"adc r14, r5, r14\n\t" "adc r14, r5, r14\n\t"
"str r9, [sp, #36]\n\t" "str r9, [sp, #36]\n\t"
"# A[3] * A[7]\n\t" "# A[4] * A[6]\n\t"
"ldr r6, [%[a], #12]\n\t" "ldr r7, [%[a], #24]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
"adc r8, r5, #0\n\t" "adc r8, r5, #0\n\t"
"# A[4] * A[6]\n\t" "# A[3] * A[7]\n\t"
"ldr r6, [%[a], #16]\n\t" "ldr r6, [%[a], #12]\n\t"
"ldr r7, [%[a], #24]\n\t" "ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r10, r3, r10\n\t" "adds r10, r3, r10\n\t"
"adcs r14, r4, r14\n\t" "adcs r14, r4, r14\n\t"
@ -30384,7 +30348,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r10, [sp, #40]\n\t" "str r10, [sp, #40]\n\t"
"# A[4] * A[7]\n\t" "# A[4] * A[7]\n\t"
"ldr r6, [%[a], #16]\n\t" "ldr r6, [%[a], #16]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r14, r3, r14\n\t" "adds r14, r3, r14\n\t"
"adcs r8, r4, r8\n\t" "adcs r8, r4, r8\n\t"
@ -30398,7 +30361,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adc r9, r5, r9\n\t" "adc r9, r5, r9\n\t"
"str r14, [sp, #44]\n\t" "str r14, [sp, #44]\n\t"
"# A[5] * A[7]\n\t" "# A[5] * A[7]\n\t"
"ldr r6, [%[a], #20]\n\t"
"ldr r7, [%[a], #28]\n\t" "ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r8, r3, r8\n\t" "adds r8, r3, r8\n\t"
@ -30407,7 +30369,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"str r8, [sp, #48]\n\t" "str r8, [sp, #48]\n\t"
"# A[6] * A[7]\n\t" "# A[6] * A[7]\n\t"
"ldr r6, [%[a], #24]\n\t" "ldr r6, [%[a], #24]\n\t"
"ldr r7, [%[a], #28]\n\t"
"umull r3, r4, r6, r7\n\t" "umull r3, r4, r6, r7\n\t"
"adds r9, r3, r9\n\t" "adds r9, r3, r9\n\t"
"adc r10, r4, r10\n\t" "adc r10, r4, r10\n\t"
@ -30537,16 +30498,16 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"ldr r9, [sp, #20]\n\t" "ldr r9, [sp, #20]\n\t"
"ldr r10, [sp, #24]\n\t" "ldr r10, [sp, #24]\n\t"
"ldr r14, [sp, #28]\n\t" "ldr r14, [sp, #28]\n\t"
"# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" "# mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192\n\t"
"# - a[0] << 224\n\t" "# - a[0] << 224\n\t"
"# + (a[0]-a[1] * 2) << (6 * 32)\n\t" "# + (a[0..1] * 2) << (6 * 32)\n\t"
"adds r10, r10, r4\n\t" "adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t" "adc r14, r14, r5\n\t"
"adds r10, r10, r4\n\t" "adds r10, r10, r4\n\t"
"adc r14, r14, r5\n\t" "adc r14, r14, r5\n\t"
"# - a[0] << (7 * 32)\n\t" "# - a[0] << (7 * 32)\n\t"
"sub r14, r14, r4\n\t" "sub r14, r14, r4\n\t"
"# + a[0]-a[4] << (3 * 32)\n\t" "# + a[0..4] << (3 * 32)\n\t"
"mov %[a], r7\n\t" "mov %[a], r7\n\t"
"mov r12, r8\n\t" "mov r12, r8\n\t"
"adds r7, r7, r4\n\t" "adds r7, r7, r4\n\t"
@ -30554,9 +30515,6 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"adcs r9, r9, r6\n\t" "adcs r9, r9, r6\n\t"
"adcs r10, r10, %[a]\n\t" "adcs r10, r10, %[a]\n\t"
"adc r14, r14, r12\n\t" "adc r14, r14, r12\n\t"
"str r4, [sp, #0]\n\t"
"str r5, [sp, #4]\n\t"
"str r6, [sp, #8]\n\t"
"str r7, [sp, #12]\n\t" "str r7, [sp, #12]\n\t"
"str r8, [sp, #16]\n\t" "str r8, [sp, #16]\n\t"
"str r9, [sp, #20]\n\t" "str r9, [sp, #20]\n\t"
@ -30672,38 +30630,28 @@ SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const
"sbcs r5, r5, r10\n\t" "sbcs r5, r5, r10\n\t"
"sbcs r6, r6, r14\n\t" "sbcs r6, r6, r14\n\t"
"sbc r7, r7, #0\n\t" "sbc r7, r7, #0\n\t"
"str r3, [sp, #44]\n\t"
"str r4, [sp, #48]\n\t"
"str r5, [sp, #52]\n\t"
"str r6, [sp, #56]\n\t"
"str r7, [sp, #60]\n\t"
"# mask m and sub from result if overflow\n\t" "# mask m and sub from result if overflow\n\t"
"sub r12, %[a], r12\n\t" "sub r12, %[a], r12\n\t"
"and %[a], r12, #1\n\t" "and %[a], r12, #1\n\t"
"ldr r3, [sp, #32]\n\t" "ldr r8, [sp, #32]\n\t"
"ldr r4, [sp, #36]\n\t" "ldr r9, [sp, #36]\n\t"
"ldr r5, [sp, #40]\n\t" "ldr r10, [sp, #40]\n\t"
"ldr r6, [sp, #44]\n\t" "subs r8, r8, r12\n\t"
"ldr r7, [sp, #48]\n\t" "sbcs r9, r9, r12\n\t"
"ldr r8, [sp, #52]\n\t" "sbcs r10, r10, r12\n\t"
"ldr r9, [sp, #56]\n\t" "sbcs r3, r3, #0\n\t"
"ldr r10, [sp, #60]\n\t" "sbcs r4, r4, #0\n\t"
"subs r3, r3, r12\n\t" "sbcs r5, r5, #0\n\t"
"sbcs r4, r4, r12\n\t" "sbcs r6, r6, %[a]\n\t"
"sbcs r5, r5, r12\n\t" "sbc r7, r7, r12\n\t"
"sbcs r6, r6, #0\n\t" "str r8, [%[r], #0]\n\t"
"sbcs r7, r7, #0\n\t" "str r9, [%[r], #4]\n\t"
"sbcs r8, r8, #0\n\t" "str r10, [%[r], #8]\n\t"
"sbcs r9, r9, %[a]\n\t" "str r3, [%[r], #12]\n\t"
"sbc r10, r10, r12\n\t" "str r4, [%[r], #16]\n\t"
"str r3, [%[r], #0]\n\t" "str r5, [%[r], #20]\n\t"
"str r4, [%[r], #4]\n\t" "str r6, [%[r], #24]\n\t"
"str r5, [%[r], #8]\n\t" "str r7, [%[r], #28]\n\t"
"str r6, [%[r], #12]\n\t"
"str r7, [%[r], #16]\n\t"
"str r8, [%[r], #20]\n\t"
"str r9, [%[r], #24]\n\t"
"str r10, [%[r], #28]\n\t"
"add sp, sp, #68\n\t" "add sp, sp, #68\n\t"
: [a] "+r" (a) : [a] "+r" (a)
: [r] "r" (r) : [r] "r" (r)