From 9ffc300641102b5741df63669f140220b74898f4 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 6 May 2022 16:02:18 +1000 Subject: [PATCH] SP: Div fixes for constant time Implement div words for different platforms. Use implementation not using a divide instruction unless SP_DIV_WORD_USE_DIV or we know there is a div instruction. Fix Thumb implementation of div word. --- wolfcrypt/src/sp_armthumb.c | 1092 ++++++++++++++++++++++++++++++++++- wolfcrypt/src/sp_c32.c | 661 ++++++++++++++++++++- wolfcrypt/src/sp_c64.c | 773 ++++++++++++++++++++++++- 3 files changed, 2490 insertions(+), 36 deletions(-) diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index feaa4e37d..cc439130f 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -23653,6 +23653,122 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -23849,7 +23965,9 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig */ static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_2048_div_32(a, m, NULL, r); + int ret; + ret = sp_2048_div_32(a, m, NULL, r); + return ret; } #ifdef WOLFSSL_SP_SMALL @@ -27014,6 +27132,122 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -27114,7 +27348,9 @@ static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, s */ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_2048_div_64_cond(a, m, NULL, r); + int ret; + ret = sp_2048_div_64_cond(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -27305,7 +27541,9 @@ static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_dig */ static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_2048_div_64(a, m, NULL, r); + int ret; + ret = sp_2048_div_64(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ @@ -74857,6 +75095,122 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -75053,7 +75407,9 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig */ static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_3072_div_48(a, m, NULL, r); + int ret; + ret = sp_3072_div_48(a, m, NULL, r); + return ret; } #ifdef WOLFSSL_SP_SMALL @@ -79044,6 +79400,122 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -79144,7 +79616,9 @@ static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, s */ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_3072_div_96_cond(a, m, NULL, r); + int ret; + ret = sp_3072_div_96_cond(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -79340,7 +79814,9 @@ static WC_INLINE int sp_3072_div_96(const sp_digit* a, const sp_digit* d, sp_dig */ static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_3072_div_96(a, m, NULL, r); + int ret; + ret = sp_3072_div_96(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ @@ -91457,6 +91933,122 @@ SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -91557,7 +92149,9 @@ static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, */ static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_4096_div_128_cond(a, m, NULL, r); + int ret; + ret = sp_4096_div_128_cond(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) @@ -91754,7 +92348,9 @@ static WC_INLINE int sp_4096_div_128(const sp_digit* a, const sp_digit* d, sp_di */ static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_4096_div_128(a, m, NULL, r); + int ret; + ret = sp_4096_div_128(a, m, NULL, r); + return ret; } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ @@ -104414,6 +105010,122 @@ SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -104527,7 +105239,9 @@ static WC_INLINE int sp_256_div_8(const sp_digit* a, const sp_digit* d, sp_digit */ static WC_INLINE int sp_256_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_256_div_8(a, m, NULL, r); + int ret; + ret = sp_256_div_8(a, m, NULL, r); + return ret; } #endif @@ -114925,6 +115639,122 @@ SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -115042,7 +115872,9 @@ static WC_INLINE int sp_384_div_12(const sp_digit* a, const sp_digit* d, sp_digi */ static WC_INLINE int sp_384_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_384_div_12(a, m, NULL, r); + int ret; + ret = sp_384_div_12(a, m, NULL, r); + return ret; } #endif @@ -130819,6 +131651,122 @@ SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -130947,7 +131895,9 @@ static WC_INLINE int sp_521_div_17(const sp_digit* a, const sp_digit* d, sp_digi */ static WC_INLINE int sp_521_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_521_div_17(a, m, NULL, r); + int ret; + ret = sp_521_div_17(a, m, NULL, r); + return ret; } #endif @@ -200553,6 +201503,122 @@ SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, "sub r4, %[d1], r4\n\t" #endif #endif +#ifdef WOLFSSL_KEIL + "sbcs r6, r6, r5\n\t" +#elif defined(__clang__) + "sbcs r6, r5\n\t" +#else + "sbc r6, r5\n\t" +#endif + "movs r5, r6\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r3, r3, r5\n\t" +#else + "add r3, r3, r5\n\t" +#endif + "# r * div - Start\n\t" + "uxth %[d1], r3\n\t" + "uxth r4, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls r4, %[d1], r4\n\t" +#elif defined(__clang__) + "muls r4, %[d1]\n\t" +#else + "mul r4, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[div], #16\n\t" +#else + "lsr r6, %[div], #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, %[d1], #16\n\t" +#else + "lsr r5, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r7\n\t" +#elif defined(__clang__) + "adcs r5, r7\n\t" +#else + "adc r5, r7\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs %[d1], r3, #16\n\t" +#else + "lsr %[d1], r3, #16\n\t" +#endif +#ifdef WOLFSSL_KEIL + "muls r6, %[d1], r6\n\t" +#elif defined(__clang__) + "muls r6, %[d1]\n\t" +#else + "mul r6, %[d1]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r5, r5, r6\n\t" +#else + "add r5, r5, r6\n\t" +#endif + "uxth r6, %[div]\n\t" +#ifdef WOLFSSL_KEIL + "muls %[d1], r6, %[d1]\n\t" +#elif defined(__clang__) + "muls %[d1], r6\n\t" +#else + "mul %[d1], r6\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r6, %[d1], #16\n\t" +#else + "lsr r6, %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls %[d1], %[d1], #16\n\t" +#else + "lsl %[d1], %[d1], #16\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "adds r4, r4, %[d1]\n\t" +#else + "add r4, r4, %[d1]\n\t" +#endif +#ifdef WOLFSSL_KEIL + "adcs r5, r5, r6\n\t" +#elif defined(__clang__) + "adcs r5, r6\n\t" +#else + "adc r5, r6\n\t" +#endif + "# r * div - Done\n\t" + "mov %[d1], r8\n\t" + "mov r6, r9\n\t" +#ifdef WOLFSSL_KEIL + "subs r4, %[d1], r4\n\t" +#else +#ifdef __clang__ + "subs r4, %[d1], r4\n\t" +#else + "sub r4, %[d1], r4\n\t" +#endif +#endif #ifdef WOLFSSL_KEIL "sbcs r6, r6, r5\n\t" #elif defined(__clang__) @@ -200779,7 +201845,9 @@ static WC_INLINE int sp_1024_div_32(const sp_digit* a, const sp_digit* d, sp_dig */ static WC_INLINE int sp_1024_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m) { - return sp_1024_div_32(a, m, NULL, r); + int ret; + ret = sp_1024_div_32(a, m, NULL, r); + return ret; } /* Multiply a number by Montgomery normalizer mod modulus (prime). diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 9e6ffe405..b674a53c9 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -1923,6 +1923,51 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -1944,6 +1989,15 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_36(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -2010,7 +2064,7 @@ static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, } t1[36 - 1] += t1[36 - 2] >> 29; t1[36 - 2] &= 0x1fffffff; - r1 = t1[36 - 1] / dv; + r1 = sp_2048_word_div_word_36(t1[36 - 1], dv); sp_2048_mul_d_36(t2, sd, r1); sp_2048_sub_36(t1, t1, t2); @@ -2936,6 +2990,51 @@ static WC_INLINE sp_digit sp_2048_div_word_72(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -2957,6 +3056,15 @@ static WC_INLINE sp_digit sp_2048_div_word_72(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_72(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -3022,7 +3130,7 @@ static int sp_2048_div_72(const sp_digit* a, const sp_digit* d, } t1[71 - 1] += t1[71 - 2] >> 29; t1[71 - 2] &= 0x1fffffff; - r1 = t1[71 - 1] / dv; + r1 = sp_2048_word_div_word_72(t1[71 - 1], dv); sp_2048_mul_d_72(t2, sd, r1); sp_2048_sub_72(t1, t1, t2); @@ -5507,6 +5615,51 @@ static WC_INLINE sp_digit sp_3072_div_word_53(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -5528,6 +5681,15 @@ static WC_INLINE sp_digit sp_3072_div_word_53(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_53(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -5594,7 +5756,7 @@ static int sp_3072_div_53(const sp_digit* a, const sp_digit* d, } t1[53 - 1] += t1[53 - 2] >> 29; t1[53 - 2] &= 0x1fffffff; - r1 = t1[53 - 1] / dv; + r1 = sp_3072_word_div_word_53(t1[53 - 1], dv); sp_3072_mul_d_53(t2, sd, r1); sp_3072_sub_53(t1, t1, t2); @@ -6297,6 +6459,51 @@ static WC_INLINE sp_digit sp_3072_div_word_106(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -6318,6 +6525,15 @@ static WC_INLINE sp_digit sp_3072_div_word_106(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_106(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -6384,7 +6600,7 @@ static int sp_3072_div_106(const sp_digit* a, const sp_digit* d, } t1[106 - 1] += t1[106 - 2] >> 29; t1[106 - 2] &= 0x1fffffff; - r1 = t1[106 - 1] / dv; + r1 = sp_3072_word_div_word_106(t1[106 - 1], dv); sp_3072_mul_d_106(t2, sd, r1); sp_3072_sub_106(t1, t1, t2); @@ -9432,6 +9648,51 @@ static WC_INLINE sp_digit sp_3072_div_word_56(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 28) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 28; + sp_digit t0 = (sp_digit)d & 0xfffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 26; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 27); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 28); + m = d - ((sp_int64)r * div); + r += (m >> 56) - (sp_digit)(d >> 56); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 28); + m = d - ((sp_int64)r * div); + r += (m >> 56) - (sp_digit)(d >> 56); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 28) + d0; sp_digit r = 0; @@ -9453,6 +9714,15 @@ static WC_INLINE sp_digit sp_3072_div_word_56(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_56(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -9518,7 +9788,7 @@ static int sp_3072_div_56(const sp_digit* a, const sp_digit* d, } t1[55 - 1] += t1[55 - 2] >> 28; t1[55 - 2] &= 0xfffffff; - r1 = t1[55 - 1] / dv; + r1 = sp_3072_word_div_word_56(t1[55 - 1], dv); sp_3072_mul_d_56(t2, sd, r1); sp_3072_sub_56(t1, t1, t2); @@ -10301,6 +10571,51 @@ static WC_INLINE sp_digit sp_3072_div_word_112(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 28) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 28; + sp_digit t0 = (sp_digit)d & 0xfffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 26; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 27); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 28); + m = d - ((sp_int64)r * div); + r += (m >> 56) - (sp_digit)(d >> 56); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 28); + m = d - ((sp_int64)r * div); + r += (m >> 56) - (sp_digit)(d >> 56); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 28) + d0; sp_digit r = 0; @@ -10322,6 +10637,15 @@ static WC_INLINE sp_digit sp_3072_div_word_112(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_112(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -10387,7 +10711,7 @@ static int sp_3072_div_112(const sp_digit* a, const sp_digit* d, } t1[110 - 1] += t1[110 - 2] >> 28; t1[110 - 2] &= 0xfffffff; - r1 = t1[110 - 1] / dv; + r1 = sp_3072_word_div_word_112(t1[110 - 1], dv); sp_3072_mul_d_112(t2, sd, r1); sp_3072_sub_112(t1, t1, t2); @@ -12954,6 +13278,51 @@ static WC_INLINE sp_digit sp_4096_div_word_71(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -12975,6 +13344,15 @@ static WC_INLINE sp_digit sp_4096_div_word_71(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_71(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -13041,7 +13419,7 @@ static int sp_4096_div_71(const sp_digit* a, const sp_digit* d, } t1[71 - 1] += t1[71 - 2] >> 29; t1[71 - 2] &= 0x1fffffff; - r1 = t1[71 - 1] / dv; + r1 = sp_4096_word_div_word_71(t1[71 - 1], dv); sp_4096_mul_d_71(t2, sd, r1); sp_4096_sub_71(t1, t1, t2); @@ -13745,6 +14123,51 @@ static WC_INLINE sp_digit sp_4096_div_word_142(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 29) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 29; + sp_digit t0 = (sp_digit)d & 0x1fffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 27; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 28); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 29); + m = d - ((sp_int64)r * div); + r += (m >> 58) - (sp_digit)(d >> 58); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 29) + d0; sp_digit r = 0; @@ -13766,6 +14189,15 @@ static WC_INLINE sp_digit sp_4096_div_word_142(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_142(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -13832,7 +14264,7 @@ static int sp_4096_div_142(const sp_digit* a, const sp_digit* d, } t1[142 - 1] += t1[142 - 2] >> 29; t1[142 - 2] &= 0x1fffffff; - r1 = t1[142 - 1] / dv; + r1 = sp_4096_word_div_word_142(t1[142 - 1], dv); sp_4096_mul_d_142(t2, sd, r1); sp_4096_sub_142(t1, t1, t2); @@ -16759,6 +17191,51 @@ static WC_INLINE sp_digit sp_4096_div_word_81(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 26) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 26; + sp_digit t0 = (sp_digit)d & 0x3ffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 24; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 25); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 26); + m = d - ((sp_int64)r * div); + r += (m >> 52) - (sp_digit)(d >> 52); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 26); + m = d - ((sp_int64)r * div); + r += (m >> 52) - (sp_digit)(d >> 52); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 26) + d0; sp_digit r = 0; @@ -16780,6 +17257,15 @@ static WC_INLINE sp_digit sp_4096_div_word_81(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_81(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -16845,7 +17331,7 @@ static int sp_4096_div_81(const sp_digit* a, const sp_digit* d, } t1[79 - 1] += t1[79 - 2] >> 26; t1[79 - 2] &= 0x3ffffff; - r1 = t1[79 - 1] / dv; + r1 = sp_4096_word_div_word_81(t1[79 - 1], dv); sp_4096_mul_d_81(t2, sd, r1); sp_4096_sub_81(t1, t1, t2); @@ -17614,6 +18100,51 @@ static WC_INLINE sp_digit sp_4096_div_word_162(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 26) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 26; + sp_digit t0 = (sp_digit)d & 0x3ffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 24; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 25); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 26); + m = d - ((sp_int64)r * div); + r += (m >> 52) - (sp_digit)(d >> 52); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 26); + m = d - ((sp_int64)r * div); + r += (m >> 52) - (sp_digit)(d >> 52); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 26) + d0; sp_digit r = 0; @@ -17635,6 +18166,15 @@ static WC_INLINE sp_digit sp_4096_div_word_162(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_162(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -17700,7 +18240,7 @@ static int sp_4096_div_162(const sp_digit* a, const sp_digit* d, } t1[158 - 1] += t1[158 - 2] >> 26; t1[158 - 2] &= 0x3ffffff; - r1 = t1[158 - 1] / dv; + r1 = sp_4096_word_div_word_162(t1[158 - 1], dv); sp_4096_mul_d_162(t2, sd, r1); sp_4096_sub_162(t1, t1, t2); @@ -40524,6 +41064,51 @@ static WC_INLINE sp_digit sp_521_div_word_21(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 25) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 25; + sp_digit t0 = (sp_digit)d & 0x1ffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 23; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 24); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 25); + m = d - ((sp_int64)r * div); + r += (m >> 50) - (sp_digit)(d >> 50); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 25); + m = d - ((sp_int64)r * div); + r += (m >> 50) - (sp_digit)(d >> 50); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 25) + d0; sp_digit r = 0; @@ -43142,6 +43727,51 @@ static WC_INLINE sp_digit sp_1024_div_word_42(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int64 d = ((sp_int64)d1 << 25) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 25; + sp_digit t0 = (sp_digit)d & 0x1ffffff; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int64 m; + + r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + t1 -= dv & (0 - r); + for (i = 23; i >= 1; i--) { + t1 += t1 + ((sp_uint32)t0 >> 24); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 25); + m = d - ((sp_int64)r * div); + r += (m >> 50) - (sp_digit)(d >> 50); + + m = d - ((sp_int64)r * div); + r += (sp_digit)(m >> 25); + m = d - ((sp_int64)r * div); + r += (m >> 50) - (sp_digit)(d >> 50); + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + + m = d - ((sp_int64)r * div); + sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31); + r += sign * t2; + return r; #else sp_int64 d = ((sp_int64)d1 << 25) + d0; sp_digit r = 0; @@ -43163,6 +43793,15 @@ static WC_INLINE sp_digit sp_1024_div_word_42(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_1024_word_div_word_42(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint32)(div - d) >> 31); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -43228,7 +43867,7 @@ static int sp_1024_div_42(const sp_digit* a, const sp_digit* d, } t1[41 - 1] += t1[41 - 2] >> 25; t1[41 - 2] &= 0x1ffffff; - r1 = t1[41 - 1] / dv; + r1 = sp_1024_word_div_word_42(t1[41 - 1], dv); sp_1024_mul_d_42(t2, sd, r1); sp_1024_sub_42(t1, t1, t2); diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 141b9e6d8..bf99db658 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -843,6 +843,51 @@ static WC_INLINE sp_digit sp_2048_div_word_17(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 61) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 61; + sp_digit t0 = (sp_digit)d & 0x1fffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 59; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 60); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 61); + m = d - ((sp_int128)r * div); + r += (m >> 122) - (sp_digit)(d >> 122); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 61); + m = d - ((sp_int128)r * div); + r += (m >> 122) - (sp_digit)(d >> 122); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 61) + d0; sp_digit r = 0; @@ -864,6 +909,15 @@ static WC_INLINE sp_digit sp_2048_div_word_17(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_17(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -930,7 +984,7 @@ static int sp_2048_div_17(const sp_digit* a, const sp_digit* d, } t1[17 - 1] += t1[17 - 2] >> 61; t1[17 - 2] &= 0x1fffffffffffffffL; - r1 = t1[17 - 1] / dv; + r1 = sp_2048_word_div_word_17(t1[17 - 1], dv); sp_2048_mul_d_17(t2, sd, r1); sp_2048_sub_17(t1, t1, t2); @@ -1619,6 +1673,51 @@ static WC_INLINE sp_digit sp_2048_div_word_34(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 61) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 61; + sp_digit t0 = (sp_digit)d & 0x1fffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 59; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 60); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 61); + m = d - ((sp_int128)r * div); + r += (m >> 122) - (sp_digit)(d >> 122); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 61); + m = d - ((sp_int128)r * div); + r += (m >> 122) - (sp_digit)(d >> 122); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 61) + d0; sp_digit r = 0; @@ -1640,6 +1739,15 @@ static WC_INLINE sp_digit sp_2048_div_word_34(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_34(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -1706,7 +1814,7 @@ static int sp_2048_div_34(const sp_digit* a, const sp_digit* d, } t1[34 - 1] += t1[34 - 2] >> 61; t1[34 - 2] &= 0x1fffffffffffffffL; - r1 = t1[34 - 1] / dv; + r1 = sp_2048_word_div_word_34(t1[34 - 1], dv); sp_2048_mul_d_34(t2, sd, r1); sp_2048_sub_34(t1, t1, t2); @@ -4269,6 +4377,51 @@ static WC_INLINE sp_digit sp_2048_div_word_18(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 57) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 57; + sp_digit t0 = (sp_digit)d & 0x1ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 55; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 56); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 57) + d0; sp_digit r = 0; @@ -4290,6 +4443,15 @@ static WC_INLINE sp_digit sp_2048_div_word_18(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_18(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -4356,7 +4518,7 @@ static int sp_2048_div_18(const sp_digit* a, const sp_digit* d, } t1[18 - 1] += t1[18 - 2] >> 57; t1[18 - 2] &= 0x1ffffffffffffffL; - r1 = t1[18 - 1] / dv; + r1 = sp_2048_word_div_word_18(t1[18 - 1], dv); sp_2048_mul_d_18(t2, sd, r1); sp_2048_sub_18(t1, t1, t2); @@ -5106,6 +5268,51 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 57) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 57; + sp_digit t0 = (sp_digit)d & 0x1ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 55; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 56); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 57) + d0; sp_digit r = 0; @@ -5127,6 +5334,15 @@ static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_2048_word_div_word_36(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -5193,7 +5409,7 @@ static int sp_2048_div_36(const sp_digit* a, const sp_digit* d, } t1[36 - 1] += t1[36 - 2] >> 57; t1[36 - 2] &= 0x1ffffffffffffffL; - r1 = t1[36 - 1] / dv; + r1 = sp_2048_word_div_word_36(t1[36 - 1], dv); sp_2048_mul_d_36(t2, sd, r1); sp_2048_sub_36(t1, t1, t2); @@ -7510,6 +7726,51 @@ static WC_INLINE sp_digit sp_3072_div_word_26(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 60) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 60; + sp_digit t0 = (sp_digit)d & 0xfffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 58; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 59); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 60); + m = d - ((sp_int128)r * div); + r += (m >> 120) - (sp_digit)(d >> 120); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 60); + m = d - ((sp_int128)r * div); + r += (m >> 120) - (sp_digit)(d >> 120); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 60) + d0; sp_digit r = 0; @@ -7531,6 +7792,15 @@ static WC_INLINE sp_digit sp_3072_div_word_26(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_26(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -7597,7 +7867,7 @@ static int sp_3072_div_26(const sp_digit* a, const sp_digit* d, } t1[26 - 1] += t1[26 - 2] >> 60; t1[26 - 2] &= 0xfffffffffffffffL; - r1 = t1[26 - 1] / dv; + r1 = sp_3072_word_div_word_26(t1[26 - 1], dv); sp_3072_mul_d_26(t2, sd, r1); sp_3072_sub_26(t1, t1, t2); @@ -8292,6 +8562,51 @@ static WC_INLINE sp_digit sp_3072_div_word_52(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 60) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 60; + sp_digit t0 = (sp_digit)d & 0xfffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 58; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 59); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 60); + m = d - ((sp_int128)r * div); + r += (m >> 120) - (sp_digit)(d >> 120); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 60); + m = d - ((sp_int128)r * div); + r += (m >> 120) - (sp_digit)(d >> 120); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 60) + d0; sp_digit r = 0; @@ -8313,6 +8628,15 @@ static WC_INLINE sp_digit sp_3072_div_word_52(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_52(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -8379,7 +8703,7 @@ static int sp_3072_div_52(const sp_digit* a, const sp_digit* d, } t1[52 - 1] += t1[52 - 2] >> 60; t1[52 - 2] &= 0xfffffffffffffffL; - r1 = t1[52 - 1] / dv; + r1 = sp_3072_word_div_word_52(t1[52 - 1], dv); sp_3072_mul_d_52(t2, sd, r1); sp_3072_sub_52(t1, t1, t2); @@ -11080,6 +11404,51 @@ static WC_INLINE sp_digit sp_3072_div_word_27(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 57) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 57; + sp_digit t0 = (sp_digit)d & 0x1ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 55; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 56); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 57) + d0; sp_digit r = 0; @@ -11101,6 +11470,15 @@ static WC_INLINE sp_digit sp_3072_div_word_27(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_27(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -11167,7 +11545,7 @@ static int sp_3072_div_27(const sp_digit* a, const sp_digit* d, } t1[27 - 1] += t1[27 - 2] >> 57; t1[27 - 2] &= 0x1ffffffffffffffL; - r1 = t1[27 - 1] / dv; + r1 = sp_3072_word_div_word_27(t1[27 - 1], dv); sp_3072_mul_d_27(t2, sd, r1); sp_3072_sub_27(t1, t1, t2); @@ -11928,6 +12306,51 @@ static WC_INLINE sp_digit sp_3072_div_word_54(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 57) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 57; + sp_digit t0 = (sp_digit)d & 0x1ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 55; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 56); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 57) + d0; sp_digit r = 0; @@ -11949,6 +12372,15 @@ static WC_INLINE sp_digit sp_3072_div_word_54(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_3072_word_div_word_54(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -12015,7 +12447,7 @@ static int sp_3072_div_54(const sp_digit* a, const sp_digit* d, } t1[54 - 1] += t1[54 - 2] >> 57; t1[54 - 2] &= 0x1ffffffffffffffL; - r1 = t1[54 - 1] / dv; + r1 = sp_3072_word_div_word_54(t1[54 - 1], dv); sp_3072_mul_d_54(t2, sd, r1); sp_3072_sub_54(t1, t1, t2); @@ -14374,6 +14806,51 @@ static WC_INLINE sp_digit sp_4096_div_word_35(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 59) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 59; + sp_digit t0 = (sp_digit)d & 0x7ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 57; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 58); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 59); + m = d - ((sp_int128)r * div); + r += (m >> 118) - (sp_digit)(d >> 118); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 59); + m = d - ((sp_int128)r * div); + r += (m >> 118) - (sp_digit)(d >> 118); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 59) + d0; sp_digit r = 0; @@ -14395,6 +14872,15 @@ static WC_INLINE sp_digit sp_4096_div_word_35(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_35(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -14461,7 +14947,7 @@ static int sp_4096_div_35(const sp_digit* a, const sp_digit* d, } t1[35 - 1] += t1[35 - 2] >> 59; t1[35 - 2] &= 0x7ffffffffffffffL; - r1 = t1[35 - 1] / dv; + r1 = sp_4096_word_div_word_35(t1[35 - 1], dv); sp_4096_mul_d_35(t2, sd, r1); sp_4096_sub_35(t1, t1, t2); @@ -15151,6 +15637,51 @@ static WC_INLINE sp_digit sp_4096_div_word_70(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 59) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 59; + sp_digit t0 = (sp_digit)d & 0x7ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 57; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 58); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 59); + m = d - ((sp_int128)r * div); + r += (m >> 118) - (sp_digit)(d >> 118); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 59); + m = d - ((sp_int128)r * div); + r += (m >> 118) - (sp_digit)(d >> 118); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 59) + d0; sp_digit r = 0; @@ -15172,6 +15703,15 @@ static WC_INLINE sp_digit sp_4096_div_word_70(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_70(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -15238,7 +15778,7 @@ static int sp_4096_div_70(const sp_digit* a, const sp_digit* d, } t1[70 - 1] += t1[70 - 2] >> 59; t1[70 - 2] &= 0x7ffffffffffffffL; - r1 = t1[70 - 1] / dv; + r1 = sp_4096_word_div_word_70(t1[70 - 1], dv); sp_4096_mul_d_70(t2, sd, r1); sp_4096_sub_70(t1, t1, t2); @@ -17994,6 +18534,51 @@ static WC_INLINE sp_digit sp_4096_div_word_39(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 53) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 53; + sp_digit t0 = (sp_digit)d & 0x1fffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 51; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 52); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 53); + m = d - ((sp_int128)r * div); + r += (m >> 106) - (sp_digit)(d >> 106); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 53); + m = d - ((sp_int128)r * div); + r += (m >> 106) - (sp_digit)(d >> 106); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 53) + d0; sp_digit r = 0; @@ -18015,6 +18600,15 @@ static WC_INLINE sp_digit sp_4096_div_word_39(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_39(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -18081,7 +18675,7 @@ static int sp_4096_div_39(const sp_digit* a, const sp_digit* d, } t1[39 - 1] += t1[39 - 2] >> 53; t1[39 - 2] &= 0x1fffffffffffffL; - r1 = t1[39 - 1] / dv; + r1 = sp_4096_word_div_word_39(t1[39 - 1], dv); sp_4096_mul_d_39(t2, sd, r1); sp_4096_sub_39(t1, t1, t2); @@ -18843,6 +19437,51 @@ static WC_INLINE sp_digit sp_4096_div_word_78(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 53) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 53; + sp_digit t0 = (sp_digit)d & 0x1fffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 51; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 52); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 53); + m = d - ((sp_int128)r * div); + r += (m >> 106) - (sp_digit)(d >> 106); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 53); + m = d - ((sp_int128)r * div); + r += (m >> 106) - (sp_digit)(d >> 106); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 53) + d0; sp_digit r = 0; @@ -18864,6 +19503,15 @@ static WC_INLINE sp_digit sp_4096_div_word_78(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_4096_word_div_word_78(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -18930,7 +19578,7 @@ static int sp_4096_div_78(const sp_digit* a, const sp_digit* d, } t1[78 - 1] += t1[78 - 2] >> 53; t1[78 - 2] &= 0x1fffffffffffffL; - r1 = t1[78 - 1] / dv; + r1 = sp_4096_word_div_word_78(t1[78 - 1], dv); sp_4096_mul_d_78(t2, sd, r1); sp_4096_sub_78(t1, t1, t2); @@ -40077,6 +40725,51 @@ static WC_INLINE sp_digit sp_521_div_word_9(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 58) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 58; + sp_digit t0 = (sp_digit)d & 0x3ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 56; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 57); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 58); + m = d - ((sp_int128)r * div); + r += (m >> 116) - (sp_digit)(d >> 116); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 58); + m = d - ((sp_int128)r * div); + r += (m >> 116) - (sp_digit)(d >> 116); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 58) + d0; sp_digit r = 0; @@ -42554,6 +43247,51 @@ static WC_INLINE sp_digit sp_1024_div_word_18(sp_digit d1, sp_digit d0, ); return (sp_digit)lo; +#elif !defined(__aarch64__) && !defined(SP_DIV_WORD_USE_DIV) + sp_int128 d = ((sp_int128)d1 << 57) + d0; + sp_digit dv = (div >> 1) + 1; + sp_digit t1 = d >> 57; + sp_digit t0 = (sp_digit)d & 0x1ffffffffffffffL; + sp_digit t2; + sp_digit sign; + sp_digit r; + int i; + sp_int128 m; + + r = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + t1 -= dv & (0 - r); + for (i = 55; i >= 1; i--) { + t1 += t1 + ((sp_uint64)t0 >> 56); + t0 <<= 1; + t2 = (sp_digit)(((sp_uint64)(dv - t1)) >> 63); + r += r + t2; + t1 -= dv & (0 - t2); + t1 += t2; + } + r += r + 1; + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + r += (sp_digit)(m >> 57); + m = d - ((sp_int128)r * div); + r += (m >> 114) - (sp_digit)(d >> 114); + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + + m = d - ((sp_int128)r * div); + sign = (sp_digit)(0 - ((sp_uint64)m >> 63)) * 2 + 1; + m *= sign; + t2 = (sp_digit)(((sp_uint64)(div - m)) >> 63); + r += sign * t2; + return r; #else sp_int128 d = ((sp_int128)d1 << 57) + d0; sp_digit r = 0; @@ -42575,6 +43313,15 @@ static WC_INLINE sp_digit sp_1024_div_word_18(sp_digit d1, sp_digit d0, return r; #endif } +static WC_INLINE sp_digit sp_1024_word_div_word_18(sp_digit d, sp_digit div) +{ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \ + defined(SP_DIV_WORD_USE_DIV) + return d / div; +#else + return (sp_digit)((sp_uint64)(div - d) >> 63); +#endif +} /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -42641,7 +43388,7 @@ static int sp_1024_div_18(const sp_digit* a, const sp_digit* d, } t1[18 - 1] += t1[18 - 2] >> 57; t1[18 - 2] &= 0x1ffffffffffffffL; - r1 = t1[18 - 1] / dv; + r1 = sp_1024_word_div_word_18(t1[18 - 1], dv); sp_1024_mul_d_18(t2, sd, r1); sp_1024_sub_18(t1, t1, t2);