diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index b03de8ab4..e5214c3e3 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -13803,65 +13803,229 @@ static void sp_256_point_free_8(sp_point_256* p, int clear, void* heap) */ static int sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a, const sp_digit* m) { - int64_t t[8]; - int64_t a64[8]; - int64_t o; + (void)m; - (void)m; - - a64[0] = a[0]; - a64[1] = a[1]; - a64[2] = a[2]; - a64[3] = a[3]; - a64[4] = a[4]; - a64[5] = a[5]; - a64[6] = a[6]; - a64[7] = a[7]; - - /* 1 1 0 -1 -1 -1 -1 0 */ - t[0] = 0 + a64[0] + a64[1] - a64[3] - a64[4] - a64[5] - a64[6]; - /* 0 1 1 0 -1 -1 -1 -1 */ - t[1] = 0 + a64[1] + a64[2] - a64[4] - a64[5] - a64[6] - a64[7]; - /* 0 0 1 1 0 -1 -1 -1 */ - t[2] = 0 + a64[2] + a64[3] - a64[5] - a64[6] - a64[7]; - /* -1 -1 0 2 2 1 0 -1 */ - t[3] = 0 - a64[0] - a64[1] + 2 * a64[3] + 2 * a64[4] + a64[5] - a64[7]; - /* 0 -1 -1 0 2 2 1 0 */ - t[4] = 0 - a64[1] - a64[2] + 2 * a64[4] + 2 * a64[5] + a64[6]; - /* 0 0 -1 -1 0 2 2 1 */ - t[5] = 0 - a64[2] - a64[3] + 2 * a64[5] + 2 * a64[6] + a64[7]; - /* -1 -1 0 0 0 1 3 2 */ - t[6] = 0 - a64[0] - a64[1] + a64[5] + 3 * a64[6] + 2 * a64[7]; - /* 1 0 -1 -1 -1 -1 0 3 */ - t[7] = 0 + a64[0] - a64[2] - a64[3] - a64[4] - a64[5] + 3 * a64[7]; - - t[1] += t[0] >> 32; t[0] &= 0xffffffff; - t[2] += t[1] >> 32; t[1] &= 0xffffffff; - t[3] += t[2] >> 32; t[2] &= 0xffffffff; - t[4] += t[3] >> 32; t[3] &= 0xffffffff; - t[5] += t[4] >> 32; t[4] &= 0xffffffff; - t[6] += t[5] >> 32; t[5] &= 0xffffffff; - t[7] += t[6] >> 32; t[6] &= 0xffffffff; - o = t[7] >> 32; t[7] &= 0xffffffff; - t[0] += o; - t[3] -= o; - t[6] -= o; - t[7] += o; - t[1] += t[0] >> 32; t[0] &= 0xffffffff; - t[2] += t[1] >> 32; t[1] &= 0xffffffff; - t[3] += t[2] >> 32; t[2] &= 0xffffffff; - t[4] += t[3] >> 32; t[3] &= 0xffffffff; - t[5] += t[4] >> 32; t[4] &= 0xffffffff; - t[6] += t[5] >> 32; t[5] &= 0xffffffff; - t[7] += t[6] >> 32; t[6] &= 0xffffffff; - r[0] = t[0]; - r[1] = t[1]; - r[2] = t[2]; - r[3] = t[3]; - r[4] = t[4]; - r[5] = t[5]; - r[6] = t[6]; - r[7] = t[7]; + __asm__ __volatile__ ( + "sub sp, sp, #24\n\t" + "ldr r2, [%[a], #0]\n\t" + "ldr r3, [%[a], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #20]\n\t" + "ldr r9, [%[a], #24]\n\t" + "ldr r10, [%[a], #28]\n\t" + "# Clear overflow and underflow\n\t" + "mov r14, #0\n\t" + "mov r12, #0\n\t" + "# t[0] = 1 1 0 -1 -1 -1 -1 0\n\t" + "adds r11, r2, r3\n\t" + "adc r14, r14, #0\n\t" + "subs r11, r11, r5\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r6\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r8\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r9\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[0]\n\t" + "str r11, [sp, #0]\n\t" + "neg r12, r12\n\t" + "mov r11, #0\n\t" + "# t[1] = 0 1 1 0 -1 -1 -1 -1\n\t" + "adds r14, r14, r3\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r4\n\t" + "adc r11, r11, #0\n\t" + "subs r14, r14, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r6\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r8\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r9\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r10\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[1]\n\t" + "str r14, [sp, #4]\n\t" + "neg r12, r12\n\t" + "mov r14, #0\n\t" + "# t[2] = 0 0 1 1 0 -1 -1 -1\n\t" + "adds r11, r11, r4\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r5\n\t" + "adc r14, r14, #0\n\t" + "subs r11, r11, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r8\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r9\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r10\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[2]\n\t" + "str r11, [sp, #8]\n\t" + "neg r12, r12\n\t" + "mov r11, #0\n\t" + "# t[3] = -1 -1 0 2 2 1 0 -1\n\t" + "adds r14, r14, r5\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r5\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r6\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r6\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r8\n\t" + "adc r11, r11, #0\n\t" + "subs r14, r14, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r2\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r3\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r10\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[3]\n\t" + "str r14, [sp, #12]\n\t" + "neg r12, r12\n\t" + "mov r14, #0\n\t" + "# t[4] = 0 -1 -1 0 2 2 1 0\n\t" + "adds r11, r11, r6\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r6\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r8\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r8\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r9\n\t" + "adc r14, r14, #0\n\t" + "subs r11, r11, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r3\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r4\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[4]\n\t" + "str r11, [sp, #16]\n\t" + "neg r12, r12\n\t" + "mov r11, #0\n\t" + "# t[5] = 0 0 -1 -1 0 2 2 1\n\t" + "adds r14, r14, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r9\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r9\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r10\n\t" + "adc r11, r11, #0\n\t" + "subs r14, r14, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r4\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r5\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[5]\n\t" + "str r14, [sp, #20]\n\t" + "neg r12, r12\n\t" + "mov r14, #0\n\t" + "# t[6] = -1 -1 0 0 0 1 3 2\n\t" + "adds r11, r11, r8\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r9\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r9\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r9\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r10\n\t" + "adc r14, r14, #0\n\t" + "adds r11, r11, r10\n\t" + "adc r14, r14, #0\n\t" + "subs r11, r11, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r2\n\t" + "sbc r12, r12, #0\n\t" + "subs r11, r11, r3\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[6]\n\t" + "mov r9, r11\n\t" + "neg r12, r12\n\t" + "mov r11, #0\n\t" + "# t[7] = 1 0 -1 -1 -1 -1 0 3\n\t" + "adds r14, r14, r2\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r10\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r10\n\t" + "adc r11, r11, #0\n\t" + "adds r14, r14, r10\n\t" + "adc r11, r11, #0\n\t" + "subs r14, r14, r12\n\t" + "mov r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r4\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r5\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r6\n\t" + "sbc r12, r12, #0\n\t" + "subs r14, r14, r8\n\t" + "sbc r12, r12, #0\n\t" + "# Store t[7]\n\t" + "# Load intermediate\n\t" + "ldr r2, [sp, #0]\n\t" + "ldr r3, [sp, #4]\n\t" + "ldr r4, [sp, #8]\n\t" + "ldr r5, [sp, #12]\n\t" + "ldr r6, [sp, #16]\n\t" + "ldr r8, [sp, #20]\n\t" + "neg r12, r12\n\t" + "# Add overflow\n\t" + "# Subtract underflow - add neg underflow\n\t" + "adds r2, r2, r11\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adds r5, r5, r12\n\t" + "adcs r6, r6, #0\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, r12\n\t" + "adc r14, r14, r11\n\t" + "# Subtract overflow\n\t" + "# Add underflow - subtract neg underflow\n\t" + "subs r2, r2, r12\n\t" + "sbcs r3, r3, #0\n\t" + "sbcs r4, r4, #0\n\t" + "subs r5, r5, r11\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, r11\n\t" + "sbc r14, r14, r12\n\t" + "# Store result\n\t" + "str r2, [%[r], #0]\n\t" + "str r3, [%[r], #4]\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "str r6, [%[r], #16]\n\t" + "str r8, [%[r], #20]\n\t" + "str r9, [%[r], #24]\n\t" + "str r14, [%[r], #28]\n\t" + "add sp, sp, #24\n\t" + : + : [r] "r" (r), [a] "r" (a) + : "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r12" + ); return MP_OKAY; } @@ -14050,765 +14214,6 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm) return err; } -/* Multiply a and b into r. (r = a * b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit tmp[8]; - - __asm__ __volatile__ ( - /* A[0] * B[0] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[1] */ - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - /* A[1] * B[0] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[1] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[0] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[1] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[0] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[1] * B[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[2] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[1] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[0] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[2] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[1] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[0] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[3] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[2] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[1] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[0] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[1] * B[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[3] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[5] * B[2] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[1] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[0] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * B[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[4] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[3] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[2] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[1] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * B[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[4] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[3] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[2] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * B[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[5] * B[5] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[4] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[3] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * B[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[5] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[4] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * B[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[6] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[5] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * B[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[6] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * B[7] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" - : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8" - ); -} - -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. - */ -SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov r5, #32\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" - "blt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" - ); - - return c; -} - -/* Reduce the number back to 256 bits using Montgomery reduction. - * - * a A single precision number to reduce in place. - * m The single precision number representing the modulus. - * mp The digit representing the negative inverse of m mod 2^n. - */ -SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, - sp_digit mp) -{ - (void)mp; - (void)m; - - __asm__ __volatile__ ( - "mov r2, #0\n\t" - "mov r1, #0\n\t" - /* i = 0 */ - "mov r9, r2\n\t" - "\n1:\n\t" - "mov r4, #0\n\t" - /* mu = a[i] * 1 (mp) = a[i] */ - "ldr r3, [%[a]]\n\t" - /* a[i] += -1 * mu = -1 * a[i] => a[i] = 0 no carry */ - /* a[i+1] += -1 * mu */ - "ldr r6, [%[a], #4]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #4]\n\t" - /* a[i+2] += -1 * mu */ - "ldr r6, [%[a], #8]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #8]\n\t" - /* a[i+3] += 0 * mu */ - "ldr r6, [%[a], #12]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r3\n\t" - "adc r5, r5, r2\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #12]\n\t" - /* a[i+4] += 0 * mu */ - "ldr r6, [%[a], #16]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #16]\n\t" - /* a[i+5] += 0 * mu */ - "ldr r6, [%[a], #20]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #20]\n\t" - /* a[i+6] += 1 * mu */ - "ldr r6, [%[a], #24]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r3\n\t" - "adc r4, r4, r2\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #24]\n\t" - /* a[i+7] += -1 * mu */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[a], #32]\n\t" - "adds r5, r1, r3\n\t" - "mov r1, #0\n\t" - "adc r1, r1, r2\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r2\n\t" - "sbc r1, r1, r2\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r1, r1, r2\n\t" - "str r4, [%[a], #28]\n\t" - "str r5, [%[a], #32]\n\t" - /* i += 1 */ - "add r9, r9, #1\n\t" - "add %[a], %[a], #4\n\t" - "mov r6, #8\n\t" - "cmp r9, r6\n\t" - "blt 1b\n\t" - "sub %[a], %[a], #32\n\t" - "mov r3, r1\n\t" - "sub r1, r1, #1\n\t" - "mvn r1, r1\n\t" - "ldr r4, [%[a],#32]\n\t" - "ldr r5, [%[a],#36]\n\t" - "ldr r6, [%[a],#40]\n\t" - "ldr r8, [%[a],#44]\n\t" - "subs r4, r4, r1\n\t" - "sbcs r5, r5, r1\n\t" - "sbcs r6, r6, r1\n\t" - "sbcs r8, r8, r2\n\t" - "str r4, [%[a],#0]\n\t" - "str r5, [%[a],#4]\n\t" - "str r6, [%[a],#8]\n\t" - "str r8, [%[a],#12]\n\t" - "ldr r4, [%[a],#48]\n\t" - "ldr r5, [%[a],#52]\n\t" - "ldr r6, [%[a],#56]\n\t" - "ldr r8, [%[a],#60]\n\t" - "sbcs r4, r4, r2\n\t" - "sbcs r5, r5, r2\n\t" - "sbcs r6, r6, r3\n\t" - "sbc r8, r8, r1\n\t" - "str r4, [%[a],#16]\n\t" - "str r5, [%[a],#20]\n\t" - "str r6, [%[a],#24]\n\t" - "str r8, [%[a],#28]\n\t" - : [a] "+r" (a) - : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r9" - ); - - - (void)m; - (void)mp; -} - -/* Reduce the number back to 256 bits using Montgomery reduction. - * - * a A single precision number to reduce in place. - * m The single precision number representing the modulus. - * mp The digit representing the negative inverse of m mod 2^n. - */ -SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, - sp_digit mp) -{ - sp_digit ca = 0; - - __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #32\n\t" - "\n1:\n\t" - /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #24\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" - "blt 2b\n\t" - /* a[i+6] += m[6] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+7] += m[7] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[7] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[7] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #24\n\t" - "cmp r10, r11\n\t" - "blt 1b\n\t" - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); - - sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - ca); -} - /* Multiply two Montogmery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -14818,375 +14223,1170 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b, +SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { - sp_256_mul_8(r, a, b); - sp_256_mont_reduce_8(r, m, mp); -} + (void)mp; + (void)m; -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. - */ -SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) -{ - sp_digit tmp[8]; __asm__ __volatile__ ( - /* A[0] * A[0] */ + "sub sp, sp, #68\n\t" + "mov r5, #0\n\t" + "# A[0] * B[0]\n\t" "ldr r6, [%[a], #0]\n\t" - "umull r3, r4, r6, r6\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[1] */ - "ldr r8, [%[a], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #12]\n\t" + "ldr r8, [%[b], #0]\n\t" "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[4] */ + "str r9, [sp, #0]\n\t" + "# A[0] * B[1]\n\t" "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[3] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adc r11, r4, #0\n\t" + "# A[1] * B[0]\n\t" "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[2] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "str r10, [sp, #4]\n\t" + "# A[0] * B[2]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adc r14, r4, r14\n\t" + "# A[1] * B[1]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, #0\n\t" + "# A[2] * B[0]\n\t" "ldr r6, [%[a], #8]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[5] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "str r11, [sp, #8]\n\t" + "# A[0] * B[3]\n\t" "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[4] */ + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "# A[1] * B[2]\n\t" "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[3] */ + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[2] * B[1]\n\t" "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[3] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[3] * B[0]\n\t" "ldr r6, [%[a], #12]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[7] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "str r14, [sp, #12]\n\t" + "# A[0] * B[4]\n\t" "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[6] */ + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "# A[1] * B[3]\n\t" "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[5] */ + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[2] * B[2]\n\t" "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[4] */ + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[3] * B[1]\n\t" "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[4] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[4] * B[0]\n\t" "ldr r6, [%[a], #16]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * A[7] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "str r9, [sp, #16]\n\t" + "# A[0] * B[5]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "# A[1] * B[4]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[2] * B[3]\n\t" "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[3] * A[6] */ + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[3] * B[2]\n\t" "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[5] */ + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[4] * B[1]\n\t" "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[5] * A[5] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[5] * B[0]\n\t" "ldr r6, [%[a], #20]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * A[7] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "str r10, [sp, #20]\n\t" + "# A[0] * B[6]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, #0\n\t" + "# A[1] * B[5]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[2] * B[4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[3] * B[3]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[4] * B[2]\n\t" "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * A[6] */ + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[5] * B[1]\n\t" "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * A[6] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[6] * B[0]\n\t" "ldr r6, [%[a], #24]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * A[7] */ + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "str r11, [sp, #24]\n\t" + "# A[0] * B[7]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "# A[1] * B[6]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[2] * B[5]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[3] * B[4]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[4] * B[3]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[5] * B[2]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[6] * B[1]\n\t" "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * A[7] */ + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[7] * B[0]\n\t" "ldr r6, [%[a], #28]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" - : - : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "str r14, [sp, #28]\n\t" + "# A[1] * B[7]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "# A[2] * B[6]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[3] * B[5]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[4] * B[4]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[5] * B[3]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[6] * B[2]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[7] * B[1]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "str r9, [sp, #32]\n\t" + "# A[2] * B[7]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "# A[3] * B[6]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[4] * B[5]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[5] * B[4]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[6] * B[3]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[7] * B[2]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "str r10, [sp, #36]\n\t" + "# A[3] * B[7]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, #0\n\t" + "# A[4] * B[6]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[5] * B[5]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[6] * B[4]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[7] * B[3]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "str r11, [sp, #40]\n\t" + "# A[4] * B[7]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "# A[5] * B[6]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[6] * B[5]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[7] * B[4]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "str r14, [sp, #44]\n\t" + "# A[5] * B[7]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "# A[6] * B[6]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[7] * B[5]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[6] * B[7]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "# A[7] * B[6]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[7] * B[7]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adc r14, r4, r14\n\t" + "str r9, [sp, #48]\n\t" + "str r10, [sp, #52]\n\t" + "str r11, [sp, #56]\n\t" + "str r14, [sp, #60]\n\t" + "# Start Reduction\n\t" + "ldr r4, [sp, #0]\n\t" + "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r8, [sp, #12]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r10, [sp, #20]\n\t" + "ldr r11, [sp, #24]\n\t" + "ldr r14, [sp, #28]\n\t" + "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" + "# - a[0] << 224\n\t" + "# + (a[0]-a[1] * 2) << (6 * 32)\n\t" + "adds r11, r11, r4\n\t" + "adc r14, r14, r5\n\t" + "adds r11, r11, r4\n\t" + "adc r14, r14, r5\n\t" + "# - a[0] << (7 * 32)\n\t" + "sub r14, r14, r4\n\t" + "# + a[0]-a[4] << (3 * 32)\n\t" + "mov %[a], r8\n\t" + "mov %[b], r9\n\t" + "adds r8, r8, r4\n\t" + "adcs r9, r9, r5\n\t" + "adcs r10, r10, r6\n\t" + "adcs r11, r11, %[a]\n\t" + "adc r14, r14, %[b]\n\t" + "str r4, [sp, #0]\n\t" + "str r5, [sp, #4]\n\t" + "str r6, [sp, #8]\n\t" + "str r8, [sp, #12]\n\t" + "str r9, [sp, #16]\n\t" + "str r10, [sp, #20]\n\t" + "# a += mu * m\n\t" + "# += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1)\n\t" + "mov %[a], #0\n\t" + "# a[6] += t[0] + t[3]\n\t" + "ldr r3, [sp, #24]\n\t" + "adds r3, r3, r4\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r8\n\t" + "adc %[b], %[b], #0\n\t" + "str r11, [sp, #24]\n\t" + "# a[7] += t[1] + t[4]\n\t" + "ldr r3, [sp, #28]\n\t" + "adds r3, r3, %[b]\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r5\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r9\n\t" + "adc %[b], %[b], #0\n\t" + "str r14, [sp, #28]\n\t" + "str r3, [sp, #64]\n\t" + "# a[8] += t[0] + t[2] + t[5]\n\t" + "ldr r3, [sp, #32]\n\t" + "adds r3, r3, %[b]\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r4\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r6\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r10\n\t" + "adc %[b], %[b], #0\n\t" + "str r3, [sp, #32]\n\t" + "# a[9] += t[1] + t[3] + t[6]\n\t" + "# a[10] += t[2] + t[4] + t[7]\n\t" + "ldr r3, [sp, #36]\n\t" + "ldr r4, [sp, #40]\n\t" + "adds r3, r3, %[b]\n\t" + "adcs r4, r4, #0\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r11\n\t" + "adcs r4, r4, r14\n\t" + "adc %[b], %[b], #0\n\t" + "str r3, [sp, #36]\n\t" + "str r4, [sp, #40]\n\t" + "# a[11] += t[3] + t[5]\n\t" + "# a[12] += t[4] + t[6]\n\t" + "# a[13] += t[5] + t[7]\n\t" + "# a[14] += t[6]\n\t" + "ldr r3, [sp, #44]\n\t" + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #52]\n\t" + "ldr r6, [sp, #56]\n\t" + "adds r3, r3, %[b]\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adcs r6, r6, r11\n\t" + "adc %[b], %[b], #0\n\t" + "adds r3, r3, r10\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r14\n\t" + "adcs r6, r6, #0\n\t" + "adc %[b], %[b], #0\n\t" + "str r3, [sp, #44]\n\t" + "str r4, [sp, #48]\n\t" + "str r5, [sp, #52]\n\t" + "str r6, [sp, #56]\n\t" + "# a[15] += t[7]\n\t" + "ldr r3, [sp, #60]\n\t" + "adds r3, r3, %[b]\n\t" + "adc %[b], %[a], #0\n\t" + "adds r3, r3, r14\n\t" + "adc %[b], %[b], #0\n\t" + "str r3, [sp, #60]\n\t" + "ldr r3, [sp, #64]\n\t" + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #36]\n\t" + "ldr r6, [sp, #40]\n\t" + "ldr r9, [sp, #0]\n\t" + "ldr r10, [sp, #4]\n\t" + "ldr r11, [sp, #8]\n\t" + "ldr r14, [sp, #12]\n\t" + "subs r3, r3, r9\n\t" + "sbcs r4, r4, r10\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r14\n\t" + "str r4, [sp, #32]\n\t" + "str r5, [sp, #36]\n\t" + "str r6, [sp, #40]\n\t" + "ldr r3, [sp, #44]\n\t" + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #52]\n\t" + "ldr r6, [sp, #56]\n\t" + "ldr r8, [sp, #60]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r10, [sp, #20]\n\t" + "ldr r11, [sp, #24]\n\t" + "ldr r14, [sp, #28]\n\t" + "sbcs r3, r3, r9\n\t" + "sbcs r4, r4, r10\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r14\n\t" + "sbc r8, r8, #0\n\t" + "str r3, [sp, #44]\n\t" + "str r4, [sp, #48]\n\t" + "str r5, [sp, #52]\n\t" + "str r6, [sp, #56]\n\t" + "str r8, [sp, #60]\n\t" + "# mask m and sub from result if overflow\n\t" + "sub %[b], %[a], %[b]\n\t" + "and %[a], %[b], #1\n\t" + "ldr r3, [sp, #32]\n\t" + "ldr r4, [sp, #36]\n\t" + "ldr r5, [sp, #40]\n\t" + "ldr r6, [sp, #44]\n\t" + "ldr r8, [sp, #48]\n\t" + "ldr r9, [sp, #52]\n\t" + "ldr r10, [sp, #56]\n\t" + "ldr r11, [sp, #60]\n\t" + "subs r3, r3, %[b]\n\t" + "sbcs r4, r4, %[b]\n\t" + "sbcs r5, r5, %[b]\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, %[a]\n\t" + "sbc r11, r11, %[b]\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "str r8, [%[r], #16]\n\t" + "str r9, [%[r], #20]\n\t" + "str r10, [%[r], #24]\n\t" + "str r11, [%[r], #28]\n\t" + "add sp, sp, #68\n\t" + : [a] "+r" (a), [b] "+r" (b) + : [r] "r" (r) + : "memory", "r9", "r10", "r11", "r14", "r3", "r4", "r5", "r6", "r8" ); } -/* Square the Montgomery form number. (r = a * a mod m) +/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) * * r Result of squaring. * a Number to square in Montogmery form. * m Modulus (prime). * mp Montogmery mulitplier. */ -static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const sp_digit* m, +SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { - sp_256_sqr_8(r, a); - sp_256_mont_reduce_8(r, m, mp); + (void)mp; + (void)m; + + __asm__ __volatile__ ( + "sub sp, sp, #68\n\t" + "mov r5, #0\n\t" + "# A[0] * A[1]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #4]\n\t" + "umull r10, r11, r6, r8\n\t" + "str r10, [sp, #4]\n\t" + "# A[0] * A[2]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adc r14, r4, #0\n\t" + "str r11, [sp, #8]\n\t" + "# A[0] * A[3]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adc r9, r4, #0\n\t" + "# A[1] * A[2]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "str r14, [sp, #12]\n\t" + "# A[0] * A[4]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adc r10, r4, r10\n\t" + "# A[1] * A[3]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "str r9, [sp, #16]\n\t" + "# A[0] * A[5]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adc r11, r4, r11\n\t" + "# A[1] * A[4]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "# A[2] * A[3]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "str r10, [sp, #20]\n\t" + "# A[0] * A[6]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, #0\n\t" + "# A[1] * A[5]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "# A[2] * A[4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "str r11, [sp, #24]\n\t" + "# A[0] * A[7]\n\t" + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "# A[1] * A[6]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[2] * A[5]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "# A[3] * A[4]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "str r14, [sp, #28]\n\t" + "# A[1] * A[7]\n\t" + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "# A[2] * A[6]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "# A[3] * A[5]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, r11\n\t" + "str r9, [sp, #32]\n\t" + "# A[2] * A[7]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, #0\n\t" + "# A[3] * A[6]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "# A[4] * A[5]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adcs r11, r4, r11\n\t" + "adc r14, r5, r14\n\t" + "str r10, [sp, #36]\n\t" + "# A[3] * A[7]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, #0\n\t" + "# A[4] * A[6]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r11, r3, r11\n\t" + "adcs r14, r4, r14\n\t" + "adc r9, r5, r9\n\t" + "str r11, [sp, #40]\n\t" + "# A[4] * A[7]\n\t" + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, #0\n\t" + "# A[5] * A[6]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r14, r3, r14\n\t" + "adcs r9, r4, r9\n\t" + "adc r10, r5, r10\n\t" + "str r14, [sp, #44]\n\t" + "# A[5] * A[7]\n\t" + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r9, r3, r9\n\t" + "adcs r10, r4, r10\n\t" + "adc r11, r5, #0\n\t" + "str r9, [sp, #48]\n\t" + "# A[6] * A[7]\n\t" + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r3, r4, r6, r8\n\t" + "adds r10, r3, r10\n\t" + "adc r11, r4, r11\n\t" + "str r10, [sp, #52]\n\t" + "str r11, [sp, #56]\n\t" + "# Double\n\t" + "ldr r4, [sp, #4]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r8, [sp, #12]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r10, [sp, #20]\n\t" + "ldr r11, [sp, #24]\n\t" + "ldr r14, [sp, #28]\n\t" + "ldr r12, [sp, #32]\n\t" + "ldr r3, [sp, #36]\n\t" + "adds r4, r4, r4\n\t" + "adcs r6, r6, r6\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adcs r11, r11, r11\n\t" + "adcs r14, r14, r14\n\t" + "adcs r12, r12, r12\n\t" + "adcs r3, r3, r3\n\t" + "str r4, [sp, #4]\n\t" + "str r6, [sp, #8]\n\t" + "str r8, [sp, #12]\n\t" + "str r9, [sp, #16]\n\t" + "str r10, [sp, #20]\n\t" + "str r11, [sp, #24]\n\t" + "str r14, [sp, #28]\n\t" + "str r12, [sp, #32]\n\t" + "str r3, [sp, #36]\n\t" + "ldr r4, [sp, #40]\n\t" + "ldr r6, [sp, #44]\n\t" + "ldr r8, [sp, #48]\n\t" + "ldr r9, [sp, #52]\n\t" + "ldr r10, [sp, #56]\n\t" + "adcs r4, r4, r4\n\t" + "adcs r6, r6, r6\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "str r4, [sp, #40]\n\t" + "str r6, [sp, #44]\n\t" + "str r8, [sp, #48]\n\t" + "str r9, [sp, #52]\n\t" + "str r10, [sp, #56]\n\t" + "adc r11, r5, #0\n\t" + "str r11, [sp, #60]\n\t" + "ldr r4, [sp, #4]\n\t" + "ldr r5, [sp, #8]\n\t" + "ldr r12, [sp, #12]\n\t" + "# A[0] * A[0]\n\t" + "ldr r6, [%[a], #0]\n\t" + "umull r9, r10, r6, r6\n\t" + "# A[1] * A[1]\n\t" + "ldr r6, [%[a], #4]\n\t" + "umull r11, r14, r6, r6\n\t" + "adds r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adcs r14, r14, r12\n\t" + "str r9, [sp, #0]\n\t" + "str r10, [sp, #4]\n\t" + "str r11, [sp, #8]\n\t" + "str r14, [sp, #12]\n\t" + "ldr r3, [sp, #16]\n\t" + "ldr r4, [sp, #20]\n\t" + "ldr r5, [sp, #24]\n\t" + "ldr r12, [sp, #28]\n\t" + "# A[2] * A[2]\n\t" + "ldr r6, [%[a], #8]\n\t" + "umull r9, r10, r6, r6\n\t" + "# A[3] * A[3]\n\t" + "ldr r6, [%[a], #12]\n\t" + "umull r11, r14, r6, r6\n\t" + "adcs r9, r9, r3\n\t" + "adcs r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adcs r14, r14, r12\n\t" + "str r9, [sp, #16]\n\t" + "str r10, [sp, #20]\n\t" + "str r11, [sp, #24]\n\t" + "str r14, [sp, #28]\n\t" + "ldr r3, [sp, #32]\n\t" + "ldr r4, [sp, #36]\n\t" + "ldr r5, [sp, #40]\n\t" + "ldr r12, [sp, #44]\n\t" + "# A[4] * A[4]\n\t" + "ldr r6, [%[a], #16]\n\t" + "umull r9, r10, r6, r6\n\t" + "# A[5] * A[5]\n\t" + "ldr r6, [%[a], #20]\n\t" + "umull r11, r14, r6, r6\n\t" + "adcs r9, r9, r3\n\t" + "adcs r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adcs r14, r14, r12\n\t" + "str r9, [sp, #32]\n\t" + "str r10, [sp, #36]\n\t" + "str r11, [sp, #40]\n\t" + "str r14, [sp, #44]\n\t" + "ldr r3, [sp, #48]\n\t" + "ldr r4, [sp, #52]\n\t" + "ldr r5, [sp, #56]\n\t" + "ldr r12, [sp, #60]\n\t" + "# A[6] * A[6]\n\t" + "ldr r6, [%[a], #24]\n\t" + "umull r9, r10, r6, r6\n\t" + "# A[7] * A[7]\n\t" + "ldr r6, [%[a], #28]\n\t" + "umull r11, r14, r6, r6\n\t" + "adcs r9, r9, r3\n\t" + "adcs r10, r10, r4\n\t" + "adcs r11, r11, r5\n\t" + "adc r14, r14, r12\n\t" + "str r9, [sp, #48]\n\t" + "str r10, [sp, #52]\n\t" + "str r11, [sp, #56]\n\t" + "str r14, [sp, #60]\n\t" + "# Start Reduction\n\t" + "ldr r4, [sp, #0]\n\t" + "ldr r5, [sp, #4]\n\t" + "ldr r6, [sp, #8]\n\t" + "ldr r8, [sp, #12]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r10, [sp, #20]\n\t" + "ldr r11, [sp, #24]\n\t" + "ldr r14, [sp, #28]\n\t" + "# mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192\n\t" + "# - a[0] << 224\n\t" + "# + (a[0]-a[1] * 2) << (6 * 32)\n\t" + "adds r11, r11, r4\n\t" + "adc r14, r14, r5\n\t" + "adds r11, r11, r4\n\t" + "adc r14, r14, r5\n\t" + "# - a[0] << (7 * 32)\n\t" + "sub r14, r14, r4\n\t" + "# + a[0]-a[4] << (3 * 32)\n\t" + "mov %[a], r8\n\t" + "mov r12, r9\n\t" + "adds r8, r8, r4\n\t" + "adcs r9, r9, r5\n\t" + "adcs r10, r10, r6\n\t" + "adcs r11, r11, %[a]\n\t" + "adc r14, r14, r12\n\t" + "str r4, [sp, #0]\n\t" + "str r5, [sp, #4]\n\t" + "str r6, [sp, #8]\n\t" + "str r8, [sp, #12]\n\t" + "str r9, [sp, #16]\n\t" + "str r10, [sp, #20]\n\t" + "# a += mu * m\n\t" + "# += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1)\n\t" + "mov %[a], #0\n\t" + "# a[6] += t[0] + t[3]\n\t" + "ldr r3, [sp, #24]\n\t" + "adds r3, r3, r4\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r8\n\t" + "adc r12, r12, #0\n\t" + "str r11, [sp, #24]\n\t" + "# a[7] += t[1] + t[4]\n\t" + "ldr r3, [sp, #28]\n\t" + "adds r3, r3, r12\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r5\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r9\n\t" + "adc r12, r12, #0\n\t" + "str r14, [sp, #28]\n\t" + "str r3, [sp, #64]\n\t" + "# a[8] += t[0] + t[2] + t[5]\n\t" + "ldr r3, [sp, #32]\n\t" + "adds r3, r3, r12\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r4\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r6\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r10\n\t" + "adc r12, r12, #0\n\t" + "str r3, [sp, #32]\n\t" + "# a[9] += t[1] + t[3] + t[6]\n\t" + "# a[10] += t[2] + t[4] + t[7]\n\t" + "ldr r3, [sp, #36]\n\t" + "ldr r4, [sp, #40]\n\t" + "adds r3, r3, r12\n\t" + "adcs r4, r4, #0\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r11\n\t" + "adcs r4, r4, r14\n\t" + "adc r12, r12, #0\n\t" + "str r3, [sp, #36]\n\t" + "str r4, [sp, #40]\n\t" + "# a[11] += t[3] + t[5]\n\t" + "# a[12] += t[4] + t[6]\n\t" + "# a[13] += t[5] + t[7]\n\t" + "# a[14] += t[6]\n\t" + "ldr r3, [sp, #44]\n\t" + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #52]\n\t" + "ldr r6, [sp, #56]\n\t" + "adds r3, r3, r12\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adcs r6, r6, r11\n\t" + "adc r12, r12, #0\n\t" + "adds r3, r3, r10\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r14\n\t" + "adcs r6, r6, #0\n\t" + "adc r12, r12, #0\n\t" + "str r3, [sp, #44]\n\t" + "str r4, [sp, #48]\n\t" + "str r5, [sp, #52]\n\t" + "str r6, [sp, #56]\n\t" + "# a[15] += t[7]\n\t" + "ldr r3, [sp, #60]\n\t" + "adds r3, r3, r12\n\t" + "adc r12, %[a], #0\n\t" + "adds r3, r3, r14\n\t" + "adc r12, r12, #0\n\t" + "str r3, [sp, #60]\n\t" + "ldr r3, [sp, #64]\n\t" + "ldr r4, [sp, #32]\n\t" + "ldr r5, [sp, #36]\n\t" + "ldr r6, [sp, #40]\n\t" + "ldr r9, [sp, #0]\n\t" + "ldr r10, [sp, #4]\n\t" + "ldr r11, [sp, #8]\n\t" + "ldr r14, [sp, #12]\n\t" + "subs r3, r3, r9\n\t" + "sbcs r4, r4, r10\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r14\n\t" + "str r4, [sp, #32]\n\t" + "str r5, [sp, #36]\n\t" + "str r6, [sp, #40]\n\t" + "ldr r3, [sp, #44]\n\t" + "ldr r4, [sp, #48]\n\t" + "ldr r5, [sp, #52]\n\t" + "ldr r6, [sp, #56]\n\t" + "ldr r8, [sp, #60]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r10, [sp, #20]\n\t" + "ldr r11, [sp, #24]\n\t" + "ldr r14, [sp, #28]\n\t" + "sbcs r3, r3, r9\n\t" + "sbcs r4, r4, r10\n\t" + "sbcs r5, r5, r11\n\t" + "sbcs r6, r6, r14\n\t" + "sbc r8, r8, #0\n\t" + "str r3, [sp, #44]\n\t" + "str r4, [sp, #48]\n\t" + "str r5, [sp, #52]\n\t" + "str r6, [sp, #56]\n\t" + "str r8, [sp, #60]\n\t" + "# mask m and sub from result if overflow\n\t" + "sub r12, %[a], r12\n\t" + "and %[a], r12, #1\n\t" + "ldr r3, [sp, #32]\n\t" + "ldr r4, [sp, #36]\n\t" + "ldr r5, [sp, #40]\n\t" + "ldr r6, [sp, #44]\n\t" + "ldr r8, [sp, #48]\n\t" + "ldr r9, [sp, #52]\n\t" + "ldr r10, [sp, #56]\n\t" + "ldr r11, [sp, #60]\n\t" + "subs r3, r3, r12\n\t" + "sbcs r4, r4, r12\n\t" + "sbcs r5, r5, r12\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, %[a]\n\t" + "sbc r11, r11, r12\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "str r8, [%[r], #16]\n\t" + "str r9, [%[r], #20]\n\t" + "str r10, [%[r], #24]\n\t" + "str r11, [%[r], #28]\n\t" + "add sp, sp, #68\n\t" + : [a] "+r" (a) + : [r] "r" (r) + : "memory", "r9", "r10", "r11", "r14", "r3", "r4", "r5", "r6", "r8", "r12" + ); } #if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY) @@ -15334,6 +15534,257 @@ SP_NOINLINE static int32_t sp_256_cmp_8(const sp_digit* a, const sp_digit* b) */ #define sp_256_norm_8(a) +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, + const sp_digit* b, sp_digit m) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r5, #32\n\t" + "mov r9, r5\n\t" + "mov r8, #0\n\t" + "\n1:\n\t" + "ldr r6, [%[b], r8]\n\t" + "and r6, r6, %[m]\n\t" + "mov r5, #0\n\t" + "subs r5, r5, %[c]\n\t" + "ldr r5, [%[a], r8]\n\t" + "sbcs r5, r5, r6\n\t" + "sbcs %[c], %[c], %[c]\n\t" + "str r5, [%[r], r8]\n\t" + "add r8, r8, #4\n\t" + "cmp r8, r9\n\t" + "blt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "r5", "r6", "r8", "r9" + ); + + return c; +} + +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, + sp_digit mp) +{ + (void)mp; + (void)m; + + __asm__ __volatile__ ( + "mov r2, #0\n\t" + "mov r1, #0\n\t" + /* i = 0 */ + "mov r9, r2\n\t" + "\n1:\n\t" + "mov r4, #0\n\t" + /* mu = a[i] * 1 (mp) = a[i] */ + "ldr r3, [%[a]]\n\t" + /* a[i] += -1 * mu = -1 * a[i] => a[i] = 0 no carry */ + /* a[i+1] += -1 * mu */ + "ldr r6, [%[a], #4]\n\t" + "mov r5, #0\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r2\n\t" + "str r4, [%[a], #4]\n\t" + /* a[i+2] += -1 * mu */ + "ldr r6, [%[a], #8]\n\t" + "mov r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adc r4, r4, r2\n\t" + "str r5, [%[a], #8]\n\t" + /* a[i+3] += 0 * mu */ + "ldr r6, [%[a], #12]\n\t" + "mov r5, #0\n\t" + "adds r4, r4, r3\n\t" + "adc r5, r5, r2\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r2\n\t" + "str r4, [%[a], #12]\n\t" + /* a[i+4] += 0 * mu */ + "ldr r6, [%[a], #16]\n\t" + "mov r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adc r4, r4, r2\n\t" + "str r5, [%[a], #16]\n\t" + /* a[i+5] += 0 * mu */ + "ldr r6, [%[a], #20]\n\t" + "mov r5, #0\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r2\n\t" + "str r4, [%[a], #20]\n\t" + /* a[i+6] += 1 * mu */ + "ldr r6, [%[a], #24]\n\t" + "mov r4, #0\n\t" + "adds r5, r5, r3\n\t" + "adc r4, r4, r2\n\t" + "adds r5, r5, r6\n\t" + "adc r4, r4, r2\n\t" + "str r5, [%[a], #24]\n\t" + /* a[i+7] += -1 * mu */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[a], #32]\n\t" + "adds r5, r1, r3\n\t" + "mov r1, #0\n\t" + "adc r1, r1, r2\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, r2\n\t" + "sbc r1, r1, r2\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r1, r1, r2\n\t" + "str r4, [%[a], #28]\n\t" + "str r5, [%[a], #32]\n\t" + /* i += 1 */ + "add r9, r9, #1\n\t" + "add %[a], %[a], #4\n\t" + "mov r6, #8\n\t" + "cmp r9, r6\n\t" + "blt 1b\n\t" + "sub %[a], %[a], #32\n\t" + "mov r3, r1\n\t" + "sub r1, r1, #1\n\t" + "mvn r1, r1\n\t" + "ldr r4, [%[a],#32]\n\t" + "ldr r5, [%[a],#36]\n\t" + "ldr r6, [%[a],#40]\n\t" + "ldr r8, [%[a],#44]\n\t" + "ldr r9, [%[a],#48]\n\t" + "ldr r10, [%[a],#52]\n\t" + "ldr r11, [%[a],#56]\n\t" + "ldr r14, [%[a],#60]\n\t" + "subs r4, r4, r1\n\t" + "sbcs r5, r5, r1\n\t" + "sbcs r6, r6, r1\n\t" + "sbcs r8, r8, r2\n\t" + "sbcs r9, r9, r2\n\t" + "sbcs r10, r10, r2\n\t" + "sbcs r11, r11, r3\n\t" + "sbc r14, r14, r1\n\t" + "str r4, [%[a],#0]\n\t" + "str r5, [%[a],#4]\n\t" + "str r6, [%[a],#8]\n\t" + "str r8, [%[a],#12]\n\t" + "str r9, [%[a],#16]\n\t" + "str r10, [%[a],#20]\n\t" + "str r11, [%[a],#24]\n\t" + "str r14, [%[a],#28]\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14" + ); + + + (void)m; + (void)mp; +} + +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, + sp_digit mp) +{ + sp_digit ca = 0; + + __asm__ __volatile__ ( + "mov r9, %[mp]\n\t" + "mov r12, %[m]\n\t" + "mov r10, %[a]\n\t" + "mov r4, #0\n\t" + "add r11, r10, #32\n\t" + "\n1:\n\t" + /* mu = a[i] * mp */ + "mov %[mp], r9\n\t" + "ldr %[a], [r10]\n\t" + "mul %[mp], %[mp], %[a]\n\t" + "mov %[m], r12\n\t" + "add r14, r10, #24\n\t" + "\n2:\n\t" + /* a[i+j] += m[j] * mu */ + "ldr %[a], [r10]\n\t" + "mov r5, #0\n\t" + /* Multiply m[j] and mu - Start */ + "ldr r8, [%[m]], #4\n\t" + "umull r6, r8, %[mp], r8\n\t" + "adds %[a], %[a], r6\n\t" + "adc r5, r5, r8\n\t" + /* Multiply m[j] and mu - Done */ + "adds r4, r4, %[a]\n\t" + "adc r5, r5, #0\n\t" + "str r4, [r10], #4\n\t" + /* a[i+j+1] += m[j+1] * mu */ + "ldr %[a], [r10]\n\t" + "mov r4, #0\n\t" + /* Multiply m[j] and mu - Start */ + "ldr r8, [%[m]], #4\n\t" + "umull r6, r8, %[mp], r8\n\t" + "adds %[a], %[a], r6\n\t" + "adc r4, r4, r8\n\t" + /* Multiply m[j] and mu - Done */ + "adds r5, r5, %[a]\n\t" + "adc r4, r4, #0\n\t" + "str r5, [r10], #4\n\t" + "cmp r10, r14\n\t" + "blt 2b\n\t" + /* a[i+6] += m[6] * mu */ + "ldr %[a], [r10]\n\t" + "mov r5, #0\n\t" + /* Multiply m[j] and mu - Start */ + "ldr r8, [%[m]], #4\n\t" + "umull r6, r8, %[mp], r8\n\t" + "adds %[a], %[a], r6\n\t" + "adc r5, r5, r8\n\t" + /* Multiply m[j] and mu - Done */ + "adds r4, r4, %[a]\n\t" + "adc r5, r5, #0\n\t" + "str r4, [r10], #4\n\t" + /* a[i+7] += m[7] * mu */ + "mov r4, %[ca]\n\t" + "mov %[ca], #0\n\t" + /* Multiply m[7] and mu - Start */ + "ldr r8, [%[m]]\n\t" + "umull r6, r8, %[mp], r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc %[ca], %[ca], #0\n\t" + /* Multiply m[7] and mu - Done */ + "ldr r6, [r10]\n\t" + "ldr r8, [r10, #4]\n\t" + "adds r6, r6, r5\n\t" + "adcs r8, r8, r4\n\t" + "adc %[ca], %[ca], #0\n\t" + "str r6, [r10]\n\t" + "str r8, [r10, #4]\n\t" + /* Next word in a */ + "sub r10, r10, #24\n\t" + "cmp r10, r11\n\t" + "blt 1b\n\t" + "mov %[a], r10\n\t" + "mov %[m], r12\n\t" + : [ca] "+r" (ca), [a] "+r" (a) + : [m] "r" (m), [mp] "r" (mp) + : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + ); + + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - ca); +} + /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -15471,71 +15922,61 @@ SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const (void)m; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[b],#0]\n\t" - "ldr r8, [%[b],#4]\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "ldr r4, [%[a],#8]\n\t" - "ldr r5, [%[a],#12]\n\t" - "ldr r6, [%[b],#8]\n\t" - "ldr r8, [%[b],#12]\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "str r4, [%[r],#8]\n\t" - "str r5, [%[r],#12]\n\t" - "ldr r4, [%[a],#16]\n\t" - "ldr r5, [%[a],#20]\n\t" - "ldr r6, [%[b],#16]\n\t" - "ldr r8, [%[b],#20]\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "mov r9, r4\n\t" - "mov r10, r5\n\t" - "ldr r4, [%[a],#24]\n\t" - "ldr r5, [%[a],#28]\n\t" - "ldr r6, [%[b],#24]\n\t" - "ldr r8, [%[b],#28]\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "mov r11, r4\n\t" - "mov r12, r5\n\t" - "adc r3, r3, r3\n\t" - "mov r6, r3\n\t" - "sub r3, r3, #1\n\t" - "mvn r3, r3\n\t" - "mov r8, #0\n\t" - "ldr r4, [%[r],#0]\n\t" - "ldr r5, [%[r],#4]\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r3\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "ldr r4, [%[r],#8]\n\t" - "ldr r5, [%[r],#12]\n\t" - "sbcs r4, r4, r3\n\t" - "sbcs r5, r5, r8\n\t" - "str r4, [%[r],#8]\n\t" - "str r5, [%[r],#12]\n\t" - "mov r4, r9\n\t" - "mov r5, r10\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r8\n\t" - "str r4, [%[r],#16]\n\t" - "str r5, [%[r],#20]\n\t" - "mov r4, r11\n\t" - "mov r5, r12\n\t" - "sbcs r4, r4, r6\n\t" - "sbc r5, r5, r3\n\t" - "str r4, [%[r],#24]\n\t" - "str r5, [%[r],#28]\n\t" + "mov r12, #0\n\t" + "ldr r4, [%[a],#0]\n\t" + "ldr r5, [%[a],#4]\n\t" + "ldr r6, [%[a],#8]\n\t" + "ldr r8, [%[a],#12]\n\t" + "ldr r9, [%[b],#0]\n\t" + "ldr r10, [%[b],#4]\n\t" + "ldr r11, [%[b],#8]\n\t" + "ldr r14, [%[b],#12]\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adcs r6, r6, r11\n\t" + "adcs r8, r8, r14\n\t" + "str r4, [%[r],#0]\n\t" + "str r5, [%[r],#4]\n\t" + "str r6, [%[r],#8]\n\t" + "str r8, [%[r],#12]\n\t" + "ldr r4, [%[a],#16]\n\t" + "ldr r5, [%[a],#20]\n\t" + "ldr r6, [%[a],#24]\n\t" + "ldr r8, [%[a],#28]\n\t" + "ldr r9, [%[b],#16]\n\t" + "ldr r10, [%[b],#20]\n\t" + "ldr r11, [%[b],#24]\n\t" + "ldr r14, [%[b],#28]\n\t" + "adcs r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adcs r6, r6, r11\n\t" + "adcs r8, r8, r14\n\t" + "adc r3, r12, #0\n\t" + "sub r3, r12, r3\n\t" + "and r12, r3, #1\n\t" + "ldr r9, [%[r],#0]\n\t" + "ldr r10, [%[r],#4]\n\t" + "ldr r11, [%[r],#8]\n\t" + "ldr r14, [%[r],#12]\n\t" + "subs r9, r9, r3\n\t" + "sbcs r10, r10, r3\n\t" + "sbcs r11, r11, r3\n\t" + "sbcs r14, r14, #0\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, r12\n\t" + "sbc r8, r8, r3\n\t" + "str r9, [%[r],#0]\n\t" + "str r10, [%[r],#4]\n\t" + "str r11, [%[r],#8]\n\t" + "str r14, [%[r],#12]\n\t" + "str r4, [%[r],#16]\n\t" + "str r5, [%[r],#20]\n\t" + "str r6, [%[r],#24]\n\t" + "str r8, [%[r],#28]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" ); } @@ -15550,63 +15991,45 @@ SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a, const (void)m; __asm__ __volatile__ ( - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[a],#8]\n\t" - "ldr r8, [%[a],#12]\n\t" - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "str r6, [%[r],#8]\n\t" - "str r8, [%[r],#12]\n\t" - "ldr r4, [%[a],#16]\n\t" - "ldr r5, [%[a],#20]\n\t" - "ldr r6, [%[a],#24]\n\t" - "ldr r8, [%[a],#28]\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "mov r9, r4\n\t" - "mov r10, r5\n\t" - "mov r11, r6\n\t" - "mov r12, r8\n\t" - "mov r3, #0\n\t" - "mov r8, #0\n\t" - "adc r3, r3, r3\n\t" - "mov r2, r3\n\t" - "sub r3, r3, #1\n\t" - "mvn r3, r3\n\t" - "ldr r4, [%[r],#0]\n\t" - "ldr r5, [%[r],#4]\n\t" - "ldr r6, [%[r],#8]\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r3\n\t" - "sbcs r6, r6, r3\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "str r6, [%[r],#8]\n\t" - "ldr r4, [%[r],#12]\n\t" - "mov r5, r9\n\t" - "mov r6, r10\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r8\n\t" - "sbcs r6, r6, r8\n\t" - "str r4, [%[r],#12]\n\t" - "str r5, [%[r],#16]\n\t" - "str r6, [%[r],#20]\n\t" - "mov r4, r11\n\t" - "mov r5, r12\n\t" - "sbcs r4, r4, r2\n\t" - "sbc r5, r5, r3\n\t" - "str r4, [%[r],#24]\n\t" - "str r5, [%[r],#28]\n\t" + "mov r12, #0\n\t" + "ldr r4, [%[a],#0]\n\t" + "ldr r5, [%[a],#4]\n\t" + "ldr r6, [%[a],#8]\n\t" + "ldr r8, [%[a],#12]\n\t" + "ldr r9, [%[a],#16]\n\t" + "ldr r10, [%[a],#20]\n\t" + "ldr r11, [%[a],#24]\n\t" + "ldr r14, [%[a],#28]\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adcs r11, r11, r11\n\t" + "adcs r14, r14, r14\n\t" + "adc r3, r12, #0\n\t" + "sub r3, r12, r3\n\t" + "and r12, r3, #1\n\t" + "subs r4, r4, r3\n\t" + "sbcs r5, r5, r3\n\t" + "sbcs r6, r6, r3\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, #0\n\t" + "sbcs r11, r11, r12\n\t" + "sbc r14, r14, r3\n\t" + "str r4, [%[r],#0]\n\t" + "str r5, [%[r],#4]\n\t" + "str r6, [%[r],#8]\n\t" + "str r8, [%[r],#12]\n\t" + "str r9, [%[r],#16]\n\t" + "str r10, [%[r],#20]\n\t" + "str r11, [%[r],#24]\n\t" + "str r14, [%[r],#28]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "r3", "r2", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" ); } @@ -15708,68 +16131,60 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const (void)m; __asm__ __volatile__ ( - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[b],#0]\n\t" - "ldr r8, [%[b],#4]\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "ldr r4, [%[a],#8]\n\t" - "ldr r5, [%[a],#12]\n\t" - "ldr r6, [%[b],#8]\n\t" - "ldr r8, [%[b],#12]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "str r4, [%[r],#8]\n\t" - "str r5, [%[r],#12]\n\t" - "ldr r4, [%[a],#16]\n\t" - "ldr r5, [%[a],#20]\n\t" - "ldr r6, [%[b],#16]\n\t" - "ldr r8, [%[b],#20]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "mov r9, r4\n\t" - "mov r10, r5\n\t" - "ldr r4, [%[a],#24]\n\t" - "ldr r5, [%[a],#28]\n\t" - "ldr r6, [%[b],#24]\n\t" - "ldr r8, [%[b],#28]\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "mov r11, r4\n\t" - "mov r12, r5\n\t" - "sbc r3, r3, r3\n\t" - "lsr r8, r3, #31\n\t" - "mov r6, #0\n\t" - "ldr r4, [%[r],#0]\n\t" - "ldr r5, [%[r],#4]\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r3\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "ldr r4, [%[r],#8]\n\t" - "ldr r5, [%[r],#12]\n\t" - "adcs r4, r4, r3\n\t" - "adcs r5, r5, r6\n\t" - "str r4, [%[r],#8]\n\t" - "str r5, [%[r],#12]\n\t" - "mov r4, r9\n\t" - "mov r5, r10\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r6\n\t" - "str r4, [%[r],#16]\n\t" - "str r5, [%[r],#20]\n\t" - "mov r4, r11\n\t" - "mov r5, r12\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, r3\n\t" - "str r4, [%[r],#24]\n\t" - "str r5, [%[r],#28]\n\t" + "mov r12, #0\n\t" + "ldr r4, [%[a],#0]\n\t" + "ldr r5, [%[a],#4]\n\t" + "ldr r6, [%[a],#8]\n\t" + "ldr r8, [%[a],#12]\n\t" + "ldr r9, [%[b],#0]\n\t" + "ldr r10, [%[b],#4]\n\t" + "ldr r11, [%[b],#8]\n\t" + "ldr r14, [%[b],#12]\n\t" + "subs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r8, r8, r14\n\t" + "str r4, [%[r],#0]\n\t" + "str r5, [%[r],#4]\n\t" + "str r6, [%[r],#8]\n\t" + "str r8, [%[r],#12]\n\t" + "ldr r4, [%[a],#16]\n\t" + "ldr r5, [%[a],#20]\n\t" + "ldr r6, [%[a],#24]\n\t" + "ldr r8, [%[a],#28]\n\t" + "ldr r9, [%[b],#16]\n\t" + "ldr r10, [%[b],#20]\n\t" + "ldr r11, [%[b],#24]\n\t" + "ldr r14, [%[b],#28]\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r8, r8, r14\n\t" + "sbc r3, r12, #0\n\t" + "and r12, r3, #1\n\t" + "ldr r9, [%[r],#0]\n\t" + "ldr r10, [%[r],#4]\n\t" + "ldr r11, [%[r],#8]\n\t" + "ldr r14, [%[r],#12]\n\t" + "adds r9, r9, r3\n\t" + "adcs r10, r10, r3\n\t" + "adcs r11, r11, r3\n\t" + "adcs r14, r14, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, r12\n\t" + "adc r8, r8, r3\n\t" + "str r9, [%[r],#0]\n\t" + "str r10, [%[r],#4]\n\t" + "str r11, [%[r],#8]\n\t" + "str r14, [%[r],#12]\n\t" + "str r4, [%[r],#16]\n\t" + "str r5, [%[r],#20]\n\t" + "str r6, [%[r],#24]\n\t" + "str r8, [%[r],#28]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" ); } @@ -18842,6 +19257,514 @@ int sp_ecc_secret_gen_256(mp_int* priv, ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit tmp[8]; + + __asm__ __volatile__ ( + /* A[0] * B[0] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r3, r4, r6, r8\n\t" + "mov r5, #0\n\t" + "str r3, [%[tmp], #0]\n\t" + "mov r3, #0\n\t" + /* A[0] * B[1] */ + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r8\n\t" + /* A[1] * B[0] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[tmp], #4]\n\t" + "mov r4, #0\n\t" + /* A[0] * B[2] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[1] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[0] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[tmp], #8]\n\t" + "mov r5, #0\n\t" + /* A[0] * B[3] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[2] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[1] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[0] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[tmp], #12]\n\t" + "mov r3, #0\n\t" + /* A[0] * B[4] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[1] * B[3] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[2] * B[2] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[3] * B[1] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[4] * B[0] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[tmp], #16]\n\t" + "mov r4, #0\n\t" + /* A[0] * B[5] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * B[4] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[3] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[2] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[1] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[0] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[tmp], #20]\n\t" + "mov r5, #0\n\t" + /* A[0] * B[6] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[1] * B[5] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[2] * B[4] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[3] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[2] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[1] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[0] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[tmp], #24]\n\t" + "mov r3, #0\n\t" + /* A[0] * B[7] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[1] * B[6] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[2] * B[5] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[3] * B[4] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[4] * B[3] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[5] * B[2] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[6] * B[1] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[7] * B[0] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #0]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[tmp], #28]\n\t" + "mov r4, #0\n\t" + /* A[1] * B[7] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[2] * B[6] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[3] * B[5] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[4] * B[4] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[3] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[2] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[1] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[r], #32]\n\t" + "mov r5, #0\n\t" + /* A[2] * B[7] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[3] * B[6] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[4] * B[5] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[5] * B[4] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[3] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[2] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[r], #36]\n\t" + "mov r3, #0\n\t" + /* A[3] * B[7] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[4] * B[6] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[5] * B[5] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[6] * B[4] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[7] * B[3] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[r], #40]\n\t" + "mov r4, #0\n\t" + /* A[4] * B[7] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * B[6] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[6] * B[5] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[7] * B[4] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[r], #44]\n\t" + "mov r5, #0\n\t" + /* A[5] * B[7] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * B[6] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[7] * B[5] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[r], #48]\n\t" + "mov r3, #0\n\t" + /* A[6] * B[7] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + /* A[7] * B[6] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[r], #52]\n\t" + "mov r4, #0\n\t" + /* A[7] * B[7] */ + "ldr r6, [%[a], #28]\n\t" + "ldr r8, [%[b], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r8\n\t" + "str r5, [%[r], #56]\n\t" + "str r3, [%[r], #60]\n\t" + /* Transfer tmp to r */ + "ldr r3, [%[tmp], #0]\n\t" + "ldr r4, [%[tmp], #4]\n\t" + "ldr r5, [%[tmp], #8]\n\t" + "ldr r6, [%[tmp], #12]\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[tmp], #16]\n\t" + "ldr r4, [%[tmp], #20]\n\t" + "ldr r5, [%[tmp], #24]\n\t" + "ldr r6, [%[tmp], #28]\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) + : "memory", "r3", "r4", "r5", "r6", "r8" + ); +} + #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #ifdef WOLFSSL_SP_SMALL @@ -19092,6 +20015,356 @@ static WC_INLINE int sp_256_mod_8(sp_digit* r, const sp_digit* a, const sp_digit #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) +{ + sp_digit tmp[8]; + __asm__ __volatile__ ( + /* A[0] * A[0] */ + "ldr r6, [%[a], #0]\n\t" + "umull r3, r4, r6, r6\n\t" + "mov r5, #0\n\t" + "str r3, [%[tmp], #0]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[1] */ + "ldr r8, [%[a], #4]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[tmp], #4]\n\t" + "mov r4, #0\n\t" + /* A[0] * A[2] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[1] * A[1] */ + "ldr r6, [%[a], #4]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[tmp], #8]\n\t" + "mov r5, #0\n\t" + /* A[0] * A[3] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[2] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #8]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc r5, r5, r11\n\t" + "str r3, [%[tmp], #12]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[4] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[3] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[2] */ + "ldr r6, [%[a], #8]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[tmp], #16]\n\t" + "mov r4, #0\n\t" + /* A[0] * A[5] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[4] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[3] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #12]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r5, r5, r9\n\t" + "adcs r3, r3, r10\n\t" + "adc r4, r4, r11\n\t" + "str r5, [%[tmp], #20]\n\t" + "mov r5, #0\n\t" + /* A[0] * A[6] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[5] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[4] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[3] */ + "ldr r6, [%[a], #12]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc r5, r5, r11\n\t" + "str r3, [%[tmp], #24]\n\t" + "mov r3, #0\n\t" + /* A[0] * A[7] */ + "ldr r6, [%[a], #0]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[1] * A[6] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[2] * A[5] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[4] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[tmp], #28]\n\t" + "mov r4, #0\n\t" + /* A[1] * A[7] */ + "ldr r6, [%[a], #4]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[2] * A[6] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[3] * A[5] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[4] * A[4] */ + "ldr r6, [%[a], #16]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r5, r5, r9\n\t" + "adcs r3, r3, r10\n\t" + "adc r4, r4, r11\n\t" + "str r5, [%[r], #32]\n\t" + "mov r5, #0\n\t" + /* A[2] * A[7] */ + "ldr r6, [%[a], #8]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[3] * A[6] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[4] * A[5] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r3, r3, r9\n\t" + "adcs r4, r4, r10\n\t" + "adc r5, r5, r11\n\t" + "str r3, [%[r], #36]\n\t" + "mov r3, #0\n\t" + /* A[3] * A[7] */ + "ldr r6, [%[a], #12]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r9, r10, r6, r8\n\t" + "mov r11, #0\n\t" + /* A[4] * A[6] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r9, r9, r6\n\t" + "adcs r10, r10, r8\n\t" + "adc r11, r11, #0\n\t" + /* A[5] * A[5] */ + "ldr r6, [%[a], #20]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r11, r11\n\t" + "adds r4, r4, r9\n\t" + "adcs r5, r5, r10\n\t" + "adc r3, r3, r11\n\t" + "str r4, [%[r], #40]\n\t" + "mov r4, #0\n\t" + /* A[4] * A[7] */ + "ldr r6, [%[a], #16]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + /* A[5] * A[6] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #24]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "adds r5, r5, r6\n\t" + "adcs r3, r3, r8\n\t" + "adc r4, r4, #0\n\t" + "str r5, [%[r], #44]\n\t" + "mov r5, #0\n\t" + /* A[5] * A[7] */ + "ldr r6, [%[a], #20]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + /* A[6] * A[6] */ + "ldr r6, [%[a], #24]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r3, r3, r6\n\t" + "adcs r4, r4, r8\n\t" + "adc r5, r5, #0\n\t" + "str r3, [%[r], #48]\n\t" + "mov r3, #0\n\t" + /* A[6] * A[7] */ + "ldr r6, [%[a], #24]\n\t" + "ldr r8, [%[a], #28]\n\t" + "umull r6, r8, r6, r8\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r5, r8\n\t" + "adc r3, r3, #0\n\t" + "str r4, [%[r], #52]\n\t" + "mov r4, #0\n\t" + /* A[7] * A[7] */ + "ldr r6, [%[a], #28]\n\t" + "umull r6, r8, r6, r6\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r8\n\t" + "str r5, [%[r], #56]\n\t" + "str r3, [%[r], #60]\n\t" + /* Transfer tmp to r */ + "ldr r3, [%[tmp], #0]\n\t" + "ldr r4, [%[tmp], #4]\n\t" + "ldr r5, [%[tmp], #8]\n\t" + "ldr r6, [%[tmp], #12]\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[tmp], #16]\n\t" + "ldr r4, [%[tmp], #20]\n\t" + "ldr r5, [%[tmp], #24]\n\t" + "ldr r6, [%[tmp], #28]\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + : + : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) + : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" + ); +} + #ifdef WOLFSSL_SP_SMALL /* Order-2 for the P256 curve. */ static const uint32_t p256_order_minus_2[8] = {