diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c
index 74eb0da87..b3e6e0284 100644
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -215,6 +215,14 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -249,6 +257,7 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -314,6 +323,18 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -359,10 +380,77 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -384,6 +472,7 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -1902,6 +1991,14 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -1936,6 +2033,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -2004,6 +2102,18 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -2049,10 +2159,77 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -2074,6 +2251,7 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -2255,6 +2433,14 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -2289,6 +2475,7 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -2355,6 +2542,18 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -2400,10 +2599,77 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -2425,6 +2691,7 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -2580,6 +2847,12 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
 "mov r4, r5\n\t"
 "mov r5, #0\n\t"
 "# Multiply m[j] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add %[a], r6\n\t"
+ "adc r5, r7\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -2608,6 +2881,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
 "lsl r6, r6, #16\n\t"
 "add %[a], r6\n\t"
 "adc r5, r7\n\t"
+#endif
 "# Multiply m[j] and mu - Done\n\t"
 "add r4, %[a]\n\t"
 "adc r5, %[ca]\n\t"
@@ -2625,6 +2899,13 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
 "mov r4, r12\n\t"
 "mov %[a], #0\n\t"
 "# Multiply m[31] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add r5, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc %[a], %[ca]\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -2657,6 +2938,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
 "add r5, r6\n\t"
 "adc r4, r7\n\t"
 "adc %[a], %[ca]\n\t"
+#endif
 "# Multiply m[31] and mu - Done\n\t"
 "mov %[ca], %[a]\n\t"
 "mov %[a], r10\n\t"
@@ -2737,6 +3019,13 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
 "mov %[r], #0\n\t"
 "mov r5, #0\n\t"
 "# A[] * B\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, %[b]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsl r6, r6, #16\n\t"
 "lsl r7, %[b], #16\n\t"
@@ -2767,6 +3056,7 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# A[] * B - Done\n\t"
 "mov %[r], r8\n\t"
 "str r3, [%[r]]\n\t"
@@ -2829,6 +3119,9 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "add %[r], %[r]\n\t"
 "add %[r], #1\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -2850,6 +3143,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "sub %[d1], r4\n\t"
@@ -2859,6 +3153,9 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "mov r5, %[d1]\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -2880,6 +3177,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -2888,6 +3186,9 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "mov r5, r6\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -2909,6 +3210,7 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -3388,6 +3690,12 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, sp_digit* m,
 "mov r4, r5\n\t"
 "mov r5, #0\n\t"
 "# Multiply m[j] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add %[a], r6\n\t"
+ "adc r5, r7\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -3416,6 +3724,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, sp_digit* m,
 "lsl r6, r6, #16\n\t"
 "add %[a], r6\n\t"
 "adc r5, r7\n\t"
+#endif
 "# Multiply m[j] and mu - Done\n\t"
 "add r4, %[a]\n\t"
 "adc r5, %[ca]\n\t"
@@ -3433,6 +3742,13 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, sp_digit* m,
 "mov r4, r12\n\t"
 "mov %[a], #0\n\t"
 "# Multiply m[63] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add r5, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc %[a], %[ca]\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -3465,6 +3781,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, sp_digit* m,
 "add r5, r6\n\t"
 "adc r4, r7\n\t"
 "adc %[a], %[ca]\n\t"
+#endif
 "# Multiply m[63] and mu - Done\n\t"
 "mov %[ca], %[a]\n\t"
 "mov %[a], r10\n\t"
@@ -3547,6 +3864,13 @@ SP_NOINLINE static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a,
 "mov %[r], #0\n\t"
 "mov r5, #0\n\t"
 "# A[] * B\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, %[b]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsl r6, r6, #16\n\t"
 "lsl r7, %[b], #16\n\t"
@@ -3577,6 +3901,7 @@ SP_NOINLINE static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# A[] * B - Done\n\t"
 "mov %[r], r8\n\t"
 "str r3, [%[r]]\n\t"
@@ -3639,6 +3964,9 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "add %[r], %[r]\n\t"
 "add %[r], #1\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -3660,6 +3988,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "sub %[d1], r4\n\t"
@@ -3669,6 +3998,9 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "mov r5, %[d1]\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -3690,6 +4022,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -3698,6 +4031,9 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "mov r5, r6\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -3719,6 +4055,7 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -4754,6 +5091,14 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -4788,6 +5133,7 @@ SP_NOINLINE static void sp_3072_mul_8(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -4853,6 +5199,18 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -4898,10 +5256,77 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -4923,6 +5348,7 @@ SP_NOINLINE static void sp_3072_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -5296,7 +5722,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sub r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #0]\n\t"
- "str r5, [%[r], #0]\n\t"
+ "str r5, [%[r], #4]\n\t"
 "ldr r4, [%[a], #8]\n\t"
 "ldr r5, [%[a], #12]\n\t"
 "ldr r6, [%[b], #8]\n\t"
@@ -5304,7 +5730,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #8]\n\t"
- "str r5, [%[r], #8]\n\t"
+ "str r5, [%[r], #12]\n\t"
 "ldr r4, [%[a], #16]\n\t"
 "ldr r5, [%[a], #20]\n\t"
 "ldr r6, [%[b], #16]\n\t"
@@ -5312,7 +5738,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #16]\n\t"
- "str r5, [%[r], #16]\n\t"
+ "str r5, [%[r], #20]\n\t"
 "ldr r4, [%[a], #24]\n\t"
 "ldr r5, [%[a], #28]\n\t"
 "ldr r6, [%[b], #24]\n\t"
@@ -5320,7 +5746,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #24]\n\t"
- "str r5, [%[r], #24]\n\t"
+ "str r5, [%[r], #28]\n\t"
 "ldr r4, [%[a], #32]\n\t"
 "ldr r5, [%[a], #36]\n\t"
 "ldr r6, [%[b], #32]\n\t"
@@ -5328,7 +5754,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #32]\n\t"
- "str r5, [%[r], #32]\n\t"
+ "str r5, [%[r], #36]\n\t"
 "ldr r4, [%[a], #40]\n\t"
 "ldr r5, [%[a], #44]\n\t"
 "ldr r6, [%[b], #40]\n\t"
@@ -5336,7 +5762,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #40]\n\t"
- "str r5, [%[r], #40]\n\t"
+ "str r5, [%[r], #44]\n\t"
 "ldr r4, [%[a], #48]\n\t"
 "ldr r5, [%[a], #52]\n\t"
 "ldr r6, [%[b], #48]\n\t"
@@ -5344,7 +5770,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #48]\n\t"
- "str r5, [%[r], #48]\n\t"
+ "str r5, [%[r], #52]\n\t"
 "ldr r4, [%[a], #56]\n\t"
 "ldr r5, [%[a], #60]\n\t"
 "ldr r6, [%[b], #56]\n\t"
@@ -5352,7 +5778,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #56]\n\t"
- "str r5, [%[r], #56]\n\t"
+ "str r5, [%[r], #60]\n\t"
 "ldr r4, [%[a], #64]\n\t"
 "ldr r5, [%[a], #68]\n\t"
 "ldr r6, [%[b], #64]\n\t"
@@ -5360,7 +5786,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #64]\n\t"
- "str r5, [%[r], #64]\n\t"
+ "str r5, [%[r], #68]\n\t"
 "ldr r4, [%[a], #72]\n\t"
 "ldr r5, [%[a], #76]\n\t"
 "ldr r6, [%[b], #72]\n\t"
@@ -5368,7 +5794,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #72]\n\t"
- "str r5, [%[r], #72]\n\t"
+ "str r5, [%[r], #76]\n\t"
 "ldr r4, [%[a], #80]\n\t"
 "ldr r5, [%[a], #84]\n\t"
 "ldr r6, [%[b], #80]\n\t"
@@ -5376,7 +5802,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #80]\n\t"
- "str r5, [%[r], #80]\n\t"
+ "str r5, [%[r], #84]\n\t"
 "ldr r4, [%[a], #88]\n\t"
 "ldr r5, [%[a], #92]\n\t"
 "ldr r6, [%[b], #88]\n\t"
@@ -5384,7 +5810,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #88]\n\t"
- "str r5, [%[r], #88]\n\t"
+ "str r5, [%[r], #92]\n\t"
 "ldr r4, [%[a], #96]\n\t"
 "ldr r5, [%[a], #100]\n\t"
 "ldr r6, [%[b], #96]\n\t"
@@ -5392,7 +5818,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #96]\n\t"
- "str r5, [%[r], #96]\n\t"
+ "str r5, [%[r], #100]\n\t"
 "ldr r4, [%[a], #104]\n\t"
 "ldr r5, [%[a], #108]\n\t"
 "ldr r6, [%[b], #104]\n\t"
@@ -5400,7 +5826,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #104]\n\t"
- "str r5, [%[r], #104]\n\t"
+ "str r5, [%[r], #108]\n\t"
 "ldr r4, [%[a], #112]\n\t"
 "ldr r5, [%[a], #116]\n\t"
 "ldr r6, [%[b], #112]\n\t"
@@ -5408,7 +5834,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #112]\n\t"
- "str r5, [%[r], #112]\n\t"
+ "str r5, [%[r], #116]\n\t"
 "ldr r4, [%[a], #120]\n\t"
 "ldr r5, [%[a], #124]\n\t"
 "ldr r6, [%[b], #120]\n\t"
@@ -5416,7 +5842,7 @@ SP_NOINLINE static sp_digit sp_3072_sub_32(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #120]\n\t"
- "str r5, [%[r], #120]\n\t"
+ "str r5, [%[r], #124]\n\t"
 "sbc %[c], %[c]\n\t"
 : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
 :
@@ -6938,6 +7364,14 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -6972,6 +7406,7 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -7042,6 +7477,18 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -7087,10 +7534,77 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -7112,6 +7626,7 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -7256,6 +7771,14 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -7290,6 +7813,7 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -7359,6 +7883,18 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -7404,10 +7940,77 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -7429,6 +8032,7 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -7846,6 +8450,12 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m,
 "mov r4, r5\n\t"
 "mov r5, #0\n\t"
 "# Multiply m[j] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add %[a], r6\n\t"
+ "adc r5, r7\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -7874,6 +8484,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m,
 "lsl r6, r6, #16\n\t"
 "add %[a], r6\n\t"
 "adc r5, r7\n\t"
+#endif
 "# Multiply m[j] and mu - Done\n\t"
 "add r4, %[a]\n\t"
 "adc r5, %[ca]\n\t"
@@ -7891,6 +8502,13 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m,
 "mov r4, r12\n\t"
 "mov %[a], #0\n\t"
 "# Multiply m[47] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add r5, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc %[a], %[ca]\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -7923,6 +8541,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m,
 "add r5, r6\n\t"
 "adc r4, r7\n\t"
 "adc %[a], %[ca]\n\t"
+#endif
 "# Multiply m[47] and mu - Done\n\t"
 "mov %[ca], %[a]\n\t"
 "mov %[a], r10\n\t"
@@ -8003,6 +8622,13 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
 "mov %[r], #0\n\t"
 "mov r5, #0\n\t"
 "# A[] * B\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, %[b]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsl r6, r6, #16\n\t"
 "lsl r7, %[b], #16\n\t"
@@ -8033,6 +8659,7 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# A[] * B - Done\n\t"
 "mov %[r], r8\n\t"
 "str r3, [%[r]]\n\t"
@@ -8095,6 +8722,9 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "add %[r], %[r]\n\t"
 "add %[r], #1\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8116,6 +8746,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "sub %[d1], r4\n\t"
@@ -8125,6 +8756,9 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "mov r5, %[d1]\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8146,6 +8780,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -8154,6 +8789,9 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "mov r5, r6\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8175,6 +8813,7 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -8655,6 +9294,12 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, sp_digit* m,
 "mov r4, r5\n\t"
 "mov r5, #0\n\t"
 "# Multiply m[j] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add %[a], r6\n\t"
+ "adc r5, r7\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -8683,6 +9328,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, sp_digit* m,
 "lsl r6, r6, #16\n\t"
 "add %[a], r6\n\t"
 "adc r5, r7\n\t"
+#endif
 "# Multiply m[j] and mu - Done\n\t"
 "add r4, %[a]\n\t"
 "adc r5, %[ca]\n\t"
@@ -8702,6 +9348,13 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, sp_digit* m,
 "mov r4, r12\n\t"
 "mov %[a], #0\n\t"
 "# Multiply m[95] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add r5, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc %[a], %[ca]\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -8734,6 +9387,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, sp_digit* m,
 "add r5, r6\n\t"
 "adc r4, r7\n\t"
 "adc %[a], %[ca]\n\t"
+#endif
 "# Multiply m[95] and mu - Done\n\t"
 "mov %[ca], %[a]\n\t"
 "mov %[a], r10\n\t"
@@ -8818,6 +9472,13 @@ SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a,
 "mov %[r], #0\n\t"
 "mov r5, #0\n\t"
 "# A[] * B\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, %[b]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsl r6, r6, #16\n\t"
 "lsl r7, %[b], #16\n\t"
@@ -8848,6 +9509,7 @@ SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# A[] * B - Done\n\t"
 "mov %[r], r8\n\t"
 "str r3, [%[r]]\n\t"
@@ -8910,6 +9572,9 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "add %[r], %[r]\n\t"
 "add %[r], #1\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8931,6 +9596,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "sub %[d1], r4\n\t"
@@ -8940,6 +9606,9 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "mov r5, %[d1]\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8961,6 +9630,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -8969,6 +9639,9 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "mov r5, r6\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -8990,6 +9663,7 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -10452,6 +11126,12 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, sp_digit* m,
 "mov r4, r5\n\t"
 "mov r5, #0\n\t"
 "# Multiply m[j] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add %[a], r6\n\t"
+ "adc r5, r7\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -10480,6 +11160,7 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, sp_digit* m,
 "lsl r6, r6, #16\n\t"
 "add %[a], r6\n\t"
 "adc r5, r7\n\t"
+#endif
 "# Multiply m[j] and mu - Done\n\t"
 "add r4, %[a]\n\t"
 "adc r5, %[ca]\n\t"
@@ -10497,6 +11178,13 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, sp_digit* m,
 "mov r4, r12\n\t"
 "mov %[a], #0\n\t"
 "# Multiply m[7] and mu - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r7, [%[m]]\n\t"
+ "umull r6, r7, %[mp], r7\n\t"
+ "add r5, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc %[a], %[ca]\n\t"
+#else
 "ldr r7, [%[m]]\n\t"
 "lsl r6, %[mp], #16\n\t"
 "lsl r7, r7, #16\n\t"
@@ -10529,6 +11217,7 @@ SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, sp_digit* m,
 "add r5, r6\n\t"
 "adc r4, r7\n\t"
 "adc %[a], %[ca]\n\t"
+#endif
 "# Multiply m[7] and mu - Done\n\t"
 "mov %[ca], %[a]\n\t"
 "mov %[a], r10\n\t"
@@ -10594,6 +11283,14 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
 "add %[b], r10\n\t"
 "\n2:\n\t"
 "# Multiply Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [%[b]]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [%[b]]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -10628,6 +11325,7 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Multiply Done\n\t"
 "add %[a], #4\n\t"
 "sub %[b], #4\n\t"
@@ -10709,6 +11407,18 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
 "cmp r2, %[a]\n\t"
 "beq 4f\n\t"
 "# Multiply * 2: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "ldr r7, [r2]\n\t"
 "lsl r6, r6, #16\n\t"
@@ -10754,10 +11464,77 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
+#else
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "umull r6, r7, r6, r7\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r6, r6, #16\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r7\n\t"
+ "adc r4, %[r]\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r6, [%[a]]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsr r6, r6, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r7, r6\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "ldr r7, [r2]\n\t"
+ "lsl r7, r7, #16\n\t"
+ "lsr r7, r7, #16\n\t"
+ "mul r6, r7\n\t"
+ "lsr r7, r6, #16\n\t"
+ "lsl r6, r6, #16\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#endif
+#endif
 "# Multiply * 2: Done\n\t"
 "bal 5f\n\t"
 "\n4:\n\t"
 "# Square: Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, r6\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsr r7, r6, #16\n\t"
 "lsl r6, r6, #16\n\t"
@@ -10779,6 +11556,7 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# Square: Done\n\t"
 "\n5:\n\t"
 "add %[a], #4\n\t"
@@ -11637,7 +12415,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
 "sub r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #0]\n\t"
- "str r5, [%[r], #0]\n\t"
+ "str r5, [%[r], #4]\n\t"
 "ldr r4, [%[a], #8]\n\t"
 "ldr r5, [%[a], #12]\n\t"
 "ldr r6, [%[b], #8]\n\t"
@@ -11645,7 +12423,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #8]\n\t"
- "str r5, [%[r], #8]\n\t"
+ "str r5, [%[r], #12]\n\t"
 "ldr r4, [%[a], #16]\n\t"
 "ldr r5, [%[a], #20]\n\t"
 "ldr r6, [%[b], #16]\n\t"
@@ -11653,7 +12431,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #16]\n\t"
- "str r5, [%[r], #16]\n\t"
+ "str r5, [%[r], #20]\n\t"
 "ldr r4, [%[a], #24]\n\t"
 "ldr r5, [%[a], #28]\n\t"
 "ldr r6, [%[b], #24]\n\t"
@@ -11661,7 +12439,7 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a,
 "sbc r4, r6\n\t"
 "sbc r5, r7\n\t"
 "str r4, [%[r], #24]\n\t"
- "str r5, [%[r], #24]\n\t"
+ "str r5, [%[r], #28]\n\t"
 "sbc %[c], %[c]\n\t"
 : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
 :
@@ -14812,6 +15590,13 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
 "mov %[r], #0\n\t"
 "mov r5, #0\n\t"
 "# A[] * B\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "ldr r6, [%[a]]\n\t"
+ "umull r6, r7, r6, %[b]\n\t"
+ "add r3, r6\n\t"
+ "adc r4, r7\n\t"
+ "adc r5, %[r]\n\t"
+#else
 "ldr r6, [%[a]]\n\t"
 "lsl r6, r6, #16\n\t"
 "lsl r7, %[b], #16\n\t"
@@ -14842,6 +15627,7 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a,
 "add r3, r6\n\t"
 "adc r4, r7\n\t"
 "adc r5, %[r]\n\t"
+#endif
 "# A[] * B - Done\n\t"
 "mov %[r], r8\n\t"
 "str r3, [%[r]]\n\t"
@@ -14904,6 +15690,9 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "add %[r], %[r]\n\t"
 "add %[r], #1\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -14925,6 +15714,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "sub %[d1], r4\n\t"
@@ -14934,6 +15724,9 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "mov r5, %[d1]\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -14955,6 +15748,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
@@ -14963,6 +15757,9 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "mov r5, r6\n\t"
 "add %[r], r5\n\t"
 "# r * div - Start\n\t"
+#ifdef WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M
+ "umull r4, r5, %[r], %[div]\n\t"
+#else
 "lsl %[d1], %[r], #16\n\t"
 "lsl r4, %[div], #16\n\t"
 "lsr %[d1], %[d1], #16\n\t"
@@ -14984,6 +15781,7 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0,
 "lsl %[d1], %[d1], #16\n\t"
 "add r4, %[d1]\n\t"
 "adc r5, r6\n\t"
+#endif
 "# r * div - Done\n\t"
 "mov %[d1], r8\n\t"
 "mov r6, r9\n\t"
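
Note: the change above is mechanical across all of the mul/sqr/mul_d/mont_reduce/div_word routines. Thumb-1 (ARMv6-M) has no UMULL and its MUL is 32x32->32, so the generic paths kept in the #else branches assemble each 32x32->64 product from 16x16 partial products with lsl/lsr/mul; on Cortex-M parts with Thumb-2 (WOLFSSL_SP_ARM_THUMB_ASM_CORTEX_M) a single UMULL produces both result words. A minimal C sketch of the equivalence, for illustration only (the function names below are not part of the patch):

#include <assert.h>
#include <stdint.h>

/* What the #else (Thumb-1) path computes: a 32x32->64 multiply built
 * from 16-bit halves. Each 16x16 product fits in 32 bits, which is
 * why the Thumb-1 "mul" suffices for the pieces; the asm then splits
 * the shifted middle products into low/high words and accumulates
 * them with add/adc, exactly as modeled by the shifts here. */
static uint64_t mul32_thumb1_model(uint32_t a, uint32_t b)
{
    uint32_t al = a & 0xffffU, ah = a >> 16;
    uint32_t bl = b & 0xffffU, bh = b >> 16;
    uint64_t t;

    t  = (uint64_t)(al * bl);          /* "mul r7, r6": low x low   */
    t += (uint64_t)(al * bh) << 16;    /* low x high, shifted up    */
    t += (uint64_t)(ah * bl) << 16;    /* high x low, shifted up    */
    t += (uint64_t)(ah * bh) << 32;    /* high x high               */
    return t;
}

/* What the #ifdef (Cortex-M) path computes: UMULL writes the low and
 * high words of the full product in one instruction. */
static uint64_t mul32_umull_model(uint32_t a, uint32_t b)
{
    return (uint64_t)a * (uint64_t)b;
}

int main(void)
{
    assert(mul32_thumb1_model(0xffffffffU, 0xffffffffU) ==
           mul32_umull_model(0xffffffffU, 0xffffffffU));
    assert(mul32_thumb1_model(0x12345678U, 0x9abcdef0U) ==
           mul32_umull_model(0x12345678U, 0x9abcdef0U));
    return 0;
}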
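Separately, the str changes in sp_3072_sub_32() and sp_256_sub_8() fix stores that wrote both halves of each two-word step to the same offset, so every odd result word was dropped. A hedged C model of the corrected pattern (the helper name is illustrative, not from the patch):

#include <stdint.h>

/* Model of the corrected subtract loop: words are subtracted with
 * borrow ("sub"/"sbc") and each result word must land at its own
 * offset -- r[i] then r[i+1] -- which is what the "str r5" operands
 * now do (#4, #12, #20, ... instead of repeating #0, #8, #16, ...). */
static uint32_t sub_n_model(uint32_t* r, const uint32_t* a,
                            const uint32_t* b, int n)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)d;          /* each word gets its own slot  */
        borrow = (d >> 32) & 1U;     /* propagated like the sbc flag */
    }
    /* All-ones on final borrow, mirroring "sbc %[c], %[c]". */
    return (uint32_t)(0U - (uint32_t)borrow);
}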