From dbb03cb5a3ccb46ee437e0b70262099b7811706a Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Wed, 18 Aug 2021 12:59:44 +1000 Subject: [PATCH] SP RSA verify only: fix to compile Configurations: ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=2048 --enable-sp-math ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math-all ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=small2048 --enable-sp-math --enable-sp-asm ./configure --disable-asn --disable-filesystem --enable-cryptonly --disable-dh --disable-sha224 --disable-ecc CFLAGS=-DWOLFSSL_PUBLIC_MP --enable-rsavfy --enable-sp=2048 --enable-sp-math --enable-sp-asm --- configure.ac | 3 +- tests/api.c | 2 +- wolfcrypt/src/sp_arm32.c | 1722 ++++++++++++++-- wolfcrypt/src/sp_arm64.c | 3286 ++++++++++++++++--------------- wolfcrypt/src/sp_armthumb.c | 3108 +++++++++++++++++++++++++++-- wolfcrypt/src/sp_c32.c | 20 + wolfcrypt/src/sp_c64.c | 24 + wolfcrypt/src/sp_cortexm.c | 1744 ++++++++++++++-- wolfcrypt/src/sp_int.c | 45 +- wolfcrypt/src/sp_x86_64.c | 528 ++--- wolfcrypt/src/sp_x86_64_asm.S | 2442 +++++++++++------------ wolfcrypt/src/sp_x86_64_asm.asm | 2364 +++++++++++----------- wolfcrypt/test/test.c | 2 + wolfssl/wolfcrypt/sp_int.h | 3 +- 14 files changed, 10480 insertions(+), 4813 deletions(-) diff --git a/configure.ac b/configure.ac index 092cddad2..accb9ca4b 100644 --- a/configure.ac +++ b/configure.ac @@ -2911,7 +2911,8 @@ AC_ARG_ENABLE([asn], if test "$ENABLED_ASN" = "no" then - AM_CFLAGS="$AM_CFLAGS -DNO_ASN" + AM_CFLAGS="$AM_CFLAGS -DNO_ASN -DNO_ASN_CRYPT" + enable_pwdbased=no if test "$ENABLED_DH" = "no" && test "$ENABLED_ECC" = "no" then # DH and ECC need bigint diff --git a/tests/api.c b/tests/api.c index 522cd4e65..9870bfaa2 100644 --- a/tests/api.c +++ b/tests/api.c @@ -16865,7 +16865,7 @@ static int test_wc_RsaKeyToPublicDer (void) word32 derLen = 162; #else int bits = 2048; - word32 derLen = 290; + word32 derLen = 294; #endif XMEMSET(&rng, 0, sizeof(rng)); diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index bb866cd6a..f61c69bb8 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -221,12 +221,14 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_2048_norm_64(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. 
@@ -5012,7 +5014,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -5027,7 +5029,7 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -6034,7 +6036,328 @@ static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_64(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add r12, %[a], #256\n\t" + "\n1:\n\t" + "rsbs %[c], %[c], #0\n\t" + "ldr r4, [%[a]], #4\n\t" + "ldr r5, [%[a]], #4\n\t" + "ldr r6, [%[a]], #4\n\t" + "ldr r7, [%[a]], #4\n\t" + "ldr r8, [%[b]], #4\n\t" + "ldr r9, [%[b]], #4\n\t" + "ldr r10, [%[b]], #4\n\t" + "ldr r14, [%[b]], #4\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "sbcs r7, r7, r14\n\t" + "str r4, [%[r]], #4\n\t" + "str r5, [%[r]], #4\n\t" + "str r6, [%[r]], #4\n\t" + "str r7, [%[r]], #4\n\t" + "sbc %[c], r4, r4\n\t" + "cmp %[a], r12\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[a], #8]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "ldr r9, [%[b], #8]\n\t" + "ldr r10, [%[b], #12]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[a], #24]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r7, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "ldr r9, [%[b], #24]\n\t" + "ldr r10, [%[b], #28]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[a], #40]\n\t" + "ldr r6, [%[a], #44]\n\t" + "ldr r7, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "ldr r9, [%[b], #40]\n\t" + "ldr r10, [%[b], #44]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #32]\n\t" + "str r4, [%[r], #36]\n\t" + "str r5, [%[r], #40]\n\t" + "str r6, [%[r], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[a], #56]\n\t" + "ldr r6, [%[a], #60]\n\t" + "ldr r7, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "ldr r9, [%[b], #56]\n\t" + "ldr r10, [%[b], #60]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #48]\n\t" + "str r4, [%[r], #52]\n\t" + "str r5, [%[r], #56]\n\t" + "str r6, [%[r], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[a], #72]\n\t" + "ldr r6, [%[a], #76]\n\t" + "ldr r7, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "ldr r9, [%[b], #72]\n\t" + "ldr r10, [%[b], #76]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #64]\n\t" + "str r4, [%[r], #68]\n\t" + "str r5, [%[r], #72]\n\t" + "str r6, [%[r], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[a], #88]\n\t" + "ldr r6, [%[a], #92]\n\t" + "ldr r7, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "ldr r9, [%[b], #88]\n\t" + "ldr r10, [%[b], #92]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #80]\n\t" + "str r4, [%[r], #84]\n\t" + "str r5, [%[r], #88]\n\t" + "str r6, [%[r], #92]\n\t" + "ldr r3, [%[a], #96]\n\t" + "ldr r4, [%[a], #100]\n\t" + "ldr r5, [%[a], #104]\n\t" + "ldr r6, [%[a], #108]\n\t" + "ldr r7, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "ldr r9, [%[b], #104]\n\t" + "ldr r10, [%[b], #108]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #96]\n\t" + "str r4, [%[r], #100]\n\t" + "str r5, [%[r], #104]\n\t" + "str r6, [%[r], #108]\n\t" + "ldr r3, [%[a], #112]\n\t" + "ldr r4, [%[a], #116]\n\t" + "ldr r5, [%[a], #120]\n\t" + "ldr r6, [%[a], #124]\n\t" + "ldr r7, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "ldr r9, [%[b], #120]\n\t" + "ldr r10, [%[b], #124]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #112]\n\t" + "str r4, [%[r], 
#116]\n\t" + "str r5, [%[r], #120]\n\t" + "str r6, [%[r], #124]\n\t" + "ldr r3, [%[a], #128]\n\t" + "ldr r4, [%[a], #132]\n\t" + "ldr r5, [%[a], #136]\n\t" + "ldr r6, [%[a], #140]\n\t" + "ldr r7, [%[b], #128]\n\t" + "ldr r8, [%[b], #132]\n\t" + "ldr r9, [%[b], #136]\n\t" + "ldr r10, [%[b], #140]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #128]\n\t" + "str r4, [%[r], #132]\n\t" + "str r5, [%[r], #136]\n\t" + "str r6, [%[r], #140]\n\t" + "ldr r3, [%[a], #144]\n\t" + "ldr r4, [%[a], #148]\n\t" + "ldr r5, [%[a], #152]\n\t" + "ldr r6, [%[a], #156]\n\t" + "ldr r7, [%[b], #144]\n\t" + "ldr r8, [%[b], #148]\n\t" + "ldr r9, [%[b], #152]\n\t" + "ldr r10, [%[b], #156]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #144]\n\t" + "str r4, [%[r], #148]\n\t" + "str r5, [%[r], #152]\n\t" + "str r6, [%[r], #156]\n\t" + "ldr r3, [%[a], #160]\n\t" + "ldr r4, [%[a], #164]\n\t" + "ldr r5, [%[a], #168]\n\t" + "ldr r6, [%[a], #172]\n\t" + "ldr r7, [%[b], #160]\n\t" + "ldr r8, [%[b], #164]\n\t" + "ldr r9, [%[b], #168]\n\t" + "ldr r10, [%[b], #172]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #160]\n\t" + "str r4, [%[r], #164]\n\t" + "str r5, [%[r], #168]\n\t" + "str r6, [%[r], #172]\n\t" + "ldr r3, [%[a], #176]\n\t" + "ldr r4, [%[a], #180]\n\t" + "ldr r5, [%[a], #184]\n\t" + "ldr r6, [%[a], #188]\n\t" + "ldr r7, [%[b], #176]\n\t" + "ldr r8, [%[b], #180]\n\t" + "ldr r9, [%[b], #184]\n\t" + "ldr r10, [%[b], #188]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #176]\n\t" + "str r4, [%[r], #180]\n\t" + "str r5, [%[r], #184]\n\t" + "str r6, [%[r], #188]\n\t" + "ldr r3, [%[a], #192]\n\t" + "ldr r4, [%[a], #196]\n\t" + "ldr r5, [%[a], #200]\n\t" + "ldr r6, [%[a], #204]\n\t" + "ldr r7, [%[b], #192]\n\t" + "ldr r8, [%[b], #196]\n\t" + "ldr r9, [%[b], #200]\n\t" + "ldr r10, [%[b], #204]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #192]\n\t" + "str r4, [%[r], #196]\n\t" + "str r5, [%[r], #200]\n\t" + "str r6, [%[r], #204]\n\t" + "ldr r3, [%[a], #208]\n\t" + "ldr r4, [%[a], #212]\n\t" + "ldr r5, [%[a], #216]\n\t" + "ldr r6, [%[a], #220]\n\t" + "ldr r7, [%[b], #208]\n\t" + "ldr r8, [%[b], #212]\n\t" + "ldr r9, [%[b], #216]\n\t" + "ldr r10, [%[b], #220]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #208]\n\t" + "str r4, [%[r], #212]\n\t" + "str r5, [%[r], #216]\n\t" + "str r6, [%[r], #220]\n\t" + "ldr r3, [%[a], #224]\n\t" + "ldr r4, [%[a], #228]\n\t" + "ldr r5, [%[a], #232]\n\t" + "ldr r6, [%[a], #236]\n\t" + "ldr r7, [%[b], #224]\n\t" + "ldr r8, [%[b], #228]\n\t" + "ldr r9, [%[b], #232]\n\t" + "ldr r10, [%[b], #236]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #224]\n\t" + "str r4, [%[r], #228]\n\t" + "str r5, [%[r], #232]\n\t" + "str r6, [%[r], #236]\n\t" + "ldr r3, [%[a], #240]\n\t" + "ldr r4, [%[a], #244]\n\t" + "ldr r5, [%[a], #248]\n\t" + "ldr r6, [%[a], #252]\n\t" + "ldr r7, [%[b], #240]\n\t" + "ldr r8, [%[b], #244]\n\t" + "ldr r9, [%[b], #248]\n\t" + "ldr r10, [%[b], #252]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #240]\n\t" + 
"str r4, [%[r], #244]\n\t" + "str r5, [%[r], #248]\n\t" + "str r6, [%[r], #252]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -6097,6 +6420,69 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[128], t2[65]; + sp_digit div, r1; + int i; + + (void)m; + + + div = d[63]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); + for (i=63; i>=0; i--) { + sp_digit hi = t1[64 + i] - (t1[64 + i] == div); + r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + + sp_2048_mul_d_64(t2, d, r1); + t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); + t1[64 + i] -= t2[64]; + if (t1[64 + i] != 0) { + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + if (t1[64 + i] != 0) + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + } + } + + for (i = 63; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_2048_sub_64(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 64); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_64_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -6935,59 +7321,6 @@ static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_dig } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[128], t2[65]; - sp_digit div, r1; - int i; - - (void)m; - - - div = d[63]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); - - sp_2048_mul_d_64(t2, d, r1); - t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); - t1[64 + i] -= t2[64]; - if (t1[64 + i] != 0) { - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - if (t1[64 + i] != 0) - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - } - } - - r1 = sp_2048_cmp_64(t1, d) >= 0; - sp_2048_cond_sub_64(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. 
(r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_2048_div_64_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -7264,6 +7597,7 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -8702,12 +9036,14 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_3072_norm_96(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -15587,7 +15923,7 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -15602,7 +15938,7 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_96(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -17057,7 +17393,456 @@ static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_96(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add r12, %[a], #384\n\t" + "\n1:\n\t" + "rsbs %[c], %[c], #0\n\t" + "ldr r4, [%[a]], #4\n\t" + "ldr r5, [%[a]], #4\n\t" + "ldr r6, [%[a]], #4\n\t" + "ldr r7, [%[a]], #4\n\t" + "ldr r8, [%[b]], #4\n\t" + "ldr r9, [%[b]], #4\n\t" + "ldr r10, [%[b]], #4\n\t" + "ldr r14, [%[b]], #4\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "sbcs r7, r7, r14\n\t" + "str r4, [%[r]], #4\n\t" + "str r5, [%[r]], #4\n\t" + "str r6, [%[r]], #4\n\t" + "str r7, [%[r]], #4\n\t" + "sbc %[c], r4, r4\n\t" + "cmp %[a], r12\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. 
+ * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[a], #8]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "ldr r9, [%[b], #8]\n\t" + "ldr r10, [%[b], #12]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[a], #24]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r7, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "ldr r9, [%[b], #24]\n\t" + "ldr r10, [%[b], #28]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[a], #40]\n\t" + "ldr r6, [%[a], #44]\n\t" + "ldr r7, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "ldr r9, [%[b], #40]\n\t" + "ldr r10, [%[b], #44]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #32]\n\t" + "str r4, [%[r], #36]\n\t" + "str r5, [%[r], #40]\n\t" + "str r6, [%[r], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[a], #56]\n\t" + "ldr r6, [%[a], #60]\n\t" + "ldr r7, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "ldr r9, [%[b], #56]\n\t" + "ldr r10, [%[b], #60]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #48]\n\t" + "str r4, [%[r], #52]\n\t" + "str r5, [%[r], #56]\n\t" + "str r6, [%[r], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[a], #72]\n\t" + "ldr r6, [%[a], #76]\n\t" + "ldr r7, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "ldr r9, [%[b], #72]\n\t" + "ldr r10, [%[b], #76]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #64]\n\t" + "str r4, [%[r], #68]\n\t" + "str r5, [%[r], #72]\n\t" + "str r6, [%[r], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[a], #88]\n\t" + "ldr r6, [%[a], #92]\n\t" + "ldr r7, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "ldr r9, [%[b], #88]\n\t" + "ldr r10, [%[b], #92]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #80]\n\t" + "str r4, [%[r], #84]\n\t" + "str r5, [%[r], #88]\n\t" + "str r6, [%[r], #92]\n\t" + "ldr r3, [%[a], #96]\n\t" + "ldr r4, [%[a], #100]\n\t" + "ldr r5, [%[a], #104]\n\t" + "ldr r6, [%[a], #108]\n\t" + "ldr r7, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "ldr r9, [%[b], #104]\n\t" + "ldr r10, [%[b], #108]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #96]\n\t" + "str r4, [%[r], #100]\n\t" + "str r5, [%[r], #104]\n\t" + "str r6, [%[r], #108]\n\t" + "ldr r3, [%[a], #112]\n\t" + "ldr r4, [%[a], #116]\n\t" + "ldr r5, [%[a], #120]\n\t" + "ldr r6, [%[a], #124]\n\t" + "ldr r7, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "ldr r9, [%[b], #120]\n\t" + "ldr r10, [%[b], #124]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + 
"sbcs r6, r6, r10\n\t" + "str r3, [%[r], #112]\n\t" + "str r4, [%[r], #116]\n\t" + "str r5, [%[r], #120]\n\t" + "str r6, [%[r], #124]\n\t" + "ldr r3, [%[a], #128]\n\t" + "ldr r4, [%[a], #132]\n\t" + "ldr r5, [%[a], #136]\n\t" + "ldr r6, [%[a], #140]\n\t" + "ldr r7, [%[b], #128]\n\t" + "ldr r8, [%[b], #132]\n\t" + "ldr r9, [%[b], #136]\n\t" + "ldr r10, [%[b], #140]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #128]\n\t" + "str r4, [%[r], #132]\n\t" + "str r5, [%[r], #136]\n\t" + "str r6, [%[r], #140]\n\t" + "ldr r3, [%[a], #144]\n\t" + "ldr r4, [%[a], #148]\n\t" + "ldr r5, [%[a], #152]\n\t" + "ldr r6, [%[a], #156]\n\t" + "ldr r7, [%[b], #144]\n\t" + "ldr r8, [%[b], #148]\n\t" + "ldr r9, [%[b], #152]\n\t" + "ldr r10, [%[b], #156]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #144]\n\t" + "str r4, [%[r], #148]\n\t" + "str r5, [%[r], #152]\n\t" + "str r6, [%[r], #156]\n\t" + "ldr r3, [%[a], #160]\n\t" + "ldr r4, [%[a], #164]\n\t" + "ldr r5, [%[a], #168]\n\t" + "ldr r6, [%[a], #172]\n\t" + "ldr r7, [%[b], #160]\n\t" + "ldr r8, [%[b], #164]\n\t" + "ldr r9, [%[b], #168]\n\t" + "ldr r10, [%[b], #172]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #160]\n\t" + "str r4, [%[r], #164]\n\t" + "str r5, [%[r], #168]\n\t" + "str r6, [%[r], #172]\n\t" + "ldr r3, [%[a], #176]\n\t" + "ldr r4, [%[a], #180]\n\t" + "ldr r5, [%[a], #184]\n\t" + "ldr r6, [%[a], #188]\n\t" + "ldr r7, [%[b], #176]\n\t" + "ldr r8, [%[b], #180]\n\t" + "ldr r9, [%[b], #184]\n\t" + "ldr r10, [%[b], #188]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #176]\n\t" + "str r4, [%[r], #180]\n\t" + "str r5, [%[r], #184]\n\t" + "str r6, [%[r], #188]\n\t" + "ldr r3, [%[a], #192]\n\t" + "ldr r4, [%[a], #196]\n\t" + "ldr r5, [%[a], #200]\n\t" + "ldr r6, [%[a], #204]\n\t" + "ldr r7, [%[b], #192]\n\t" + "ldr r8, [%[b], #196]\n\t" + "ldr r9, [%[b], #200]\n\t" + "ldr r10, [%[b], #204]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #192]\n\t" + "str r4, [%[r], #196]\n\t" + "str r5, [%[r], #200]\n\t" + "str r6, [%[r], #204]\n\t" + "ldr r3, [%[a], #208]\n\t" + "ldr r4, [%[a], #212]\n\t" + "ldr r5, [%[a], #216]\n\t" + "ldr r6, [%[a], #220]\n\t" + "ldr r7, [%[b], #208]\n\t" + "ldr r8, [%[b], #212]\n\t" + "ldr r9, [%[b], #216]\n\t" + "ldr r10, [%[b], #220]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #208]\n\t" + "str r4, [%[r], #212]\n\t" + "str r5, [%[r], #216]\n\t" + "str r6, [%[r], #220]\n\t" + "ldr r3, [%[a], #224]\n\t" + "ldr r4, [%[a], #228]\n\t" + "ldr r5, [%[a], #232]\n\t" + "ldr r6, [%[a], #236]\n\t" + "ldr r7, [%[b], #224]\n\t" + "ldr r8, [%[b], #228]\n\t" + "ldr r9, [%[b], #232]\n\t" + "ldr r10, [%[b], #236]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #224]\n\t" + "str r4, [%[r], #228]\n\t" + "str r5, [%[r], #232]\n\t" + "str r6, [%[r], #236]\n\t" + "ldr r3, [%[a], #240]\n\t" + "ldr r4, [%[a], #244]\n\t" + "ldr r5, [%[a], #248]\n\t" + "ldr r6, [%[a], #252]\n\t" + "ldr r7, [%[b], #240]\n\t" + "ldr r8, [%[b], #244]\n\t" + "ldr r9, [%[b], #248]\n\t" + "ldr r10, [%[b], #252]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, 
r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #240]\n\t" + "str r4, [%[r], #244]\n\t" + "str r5, [%[r], #248]\n\t" + "str r6, [%[r], #252]\n\t" + "ldr r3, [%[a], #256]\n\t" + "ldr r4, [%[a], #260]\n\t" + "ldr r5, [%[a], #264]\n\t" + "ldr r6, [%[a], #268]\n\t" + "ldr r7, [%[b], #256]\n\t" + "ldr r8, [%[b], #260]\n\t" + "ldr r9, [%[b], #264]\n\t" + "ldr r10, [%[b], #268]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #256]\n\t" + "str r4, [%[r], #260]\n\t" + "str r5, [%[r], #264]\n\t" + "str r6, [%[r], #268]\n\t" + "ldr r3, [%[a], #272]\n\t" + "ldr r4, [%[a], #276]\n\t" + "ldr r5, [%[a], #280]\n\t" + "ldr r6, [%[a], #284]\n\t" + "ldr r7, [%[b], #272]\n\t" + "ldr r8, [%[b], #276]\n\t" + "ldr r9, [%[b], #280]\n\t" + "ldr r10, [%[b], #284]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #272]\n\t" + "str r4, [%[r], #276]\n\t" + "str r5, [%[r], #280]\n\t" + "str r6, [%[r], #284]\n\t" + "ldr r3, [%[a], #288]\n\t" + "ldr r4, [%[a], #292]\n\t" + "ldr r5, [%[a], #296]\n\t" + "ldr r6, [%[a], #300]\n\t" + "ldr r7, [%[b], #288]\n\t" + "ldr r8, [%[b], #292]\n\t" + "ldr r9, [%[b], #296]\n\t" + "ldr r10, [%[b], #300]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #288]\n\t" + "str r4, [%[r], #292]\n\t" + "str r5, [%[r], #296]\n\t" + "str r6, [%[r], #300]\n\t" + "ldr r3, [%[a], #304]\n\t" + "ldr r4, [%[a], #308]\n\t" + "ldr r5, [%[a], #312]\n\t" + "ldr r6, [%[a], #316]\n\t" + "ldr r7, [%[b], #304]\n\t" + "ldr r8, [%[b], #308]\n\t" + "ldr r9, [%[b], #312]\n\t" + "ldr r10, [%[b], #316]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #304]\n\t" + "str r4, [%[r], #308]\n\t" + "str r5, [%[r], #312]\n\t" + "str r6, [%[r], #316]\n\t" + "ldr r3, [%[a], #320]\n\t" + "ldr r4, [%[a], #324]\n\t" + "ldr r5, [%[a], #328]\n\t" + "ldr r6, [%[a], #332]\n\t" + "ldr r7, [%[b], #320]\n\t" + "ldr r8, [%[b], #324]\n\t" + "ldr r9, [%[b], #328]\n\t" + "ldr r10, [%[b], #332]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #320]\n\t" + "str r4, [%[r], #324]\n\t" + "str r5, [%[r], #328]\n\t" + "str r6, [%[r], #332]\n\t" + "ldr r3, [%[a], #336]\n\t" + "ldr r4, [%[a], #340]\n\t" + "ldr r5, [%[a], #344]\n\t" + "ldr r6, [%[a], #348]\n\t" + "ldr r7, [%[b], #336]\n\t" + "ldr r8, [%[b], #340]\n\t" + "ldr r9, [%[b], #344]\n\t" + "ldr r10, [%[b], #348]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #336]\n\t" + "str r4, [%[r], #340]\n\t" + "str r5, [%[r], #344]\n\t" + "str r6, [%[r], #348]\n\t" + "ldr r3, [%[a], #352]\n\t" + "ldr r4, [%[a], #356]\n\t" + "ldr r5, [%[a], #360]\n\t" + "ldr r6, [%[a], #364]\n\t" + "ldr r7, [%[b], #352]\n\t" + "ldr r8, [%[b], #356]\n\t" + "ldr r9, [%[b], #360]\n\t" + "ldr r10, [%[b], #364]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #352]\n\t" + "str r4, [%[r], #356]\n\t" + "str r5, [%[r], #360]\n\t" + "str r6, [%[r], #364]\n\t" + "ldr r3, [%[a], #368]\n\t" + "ldr r4, [%[a], #372]\n\t" + "ldr r5, [%[a], #376]\n\t" + "ldr r6, [%[a], #380]\n\t" + "ldr r7, [%[b], #368]\n\t" + "ldr r8, [%[b], #372]\n\t" + "ldr r9, [%[b], #376]\n\t" + "ldr r10, [%[b], #380]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, 
r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #368]\n\t" + "str r4, [%[r], #372]\n\t" + "str r5, [%[r], #376]\n\t" + "str r6, [%[r], #380]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -17120,6 +17905,69 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[192], t2[97]; + sp_digit div, r1; + int i; + + (void)m; + + + div = d[95]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); + for (i=95; i>=0; i--) { + sp_digit hi = t1[96 + i] - (t1[96 + i] == div); + r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + + sp_3072_mul_d_96(t2, d, r1); + t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); + t1[96 + i] -= t2[96]; + if (t1[96 + i] != 0) { + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + if (t1[96 + i] != 0) + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + } + } + + for (i = 95; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_3072_sub_96(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 96); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_96_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -18310,59 +19158,6 @@ static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_dig } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. 
- */ -static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[192], t2[97]; - sp_digit div, r1; - int i; - - (void)m; - - - div = d[95]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); - - sp_3072_mul_d_96(t2, d, r1); - t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); - t1[96 + i] -= t2[96]; - if (t1[96 + i] != 0) { - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - if (t1[96 + i] != 0) - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - } - } - - r1 = sp_3072_cmp_96(t1, d) >= 0; - sp_3072_cond_sub_96(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_3072_div_96_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -18639,6 +19434,7 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -20349,12 +21145,14 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_4096_norm_128(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -22676,7 +23474,7 @@ static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, #endif } -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * @@ -22691,7 +23489,7 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) sp_4096_sub_in_place_128(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -24594,7 +25392,584 @@ static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, sp_4096_mont_reduce_128(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add r12, %[a], #512\n\t" + "\n1:\n\t" + "rsbs %[c], %[c], #0\n\t" + "ldr r4, [%[a]], #4\n\t" + "ldr r5, [%[a]], #4\n\t" + "ldr r6, [%[a]], #4\n\t" + "ldr r7, [%[a]], #4\n\t" + "ldr r8, [%[b]], #4\n\t" + "ldr r9, [%[b]], #4\n\t" + "ldr r10, [%[b]], #4\n\t" + "ldr r14, [%[b]], #4\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "sbcs r7, r7, r14\n\t" + "str r4, [%[r]], #4\n\t" + "str r5, [%[r]], #4\n\t" + "str r6, [%[r]], #4\n\t" + "str r7, [%[r]], #4\n\t" + "sbc %[c], r4, r4\n\t" + "cmp %[a], r12\n\t" + "bne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r3, [%[a], #0]\n\t" + "ldr r4, [%[a], #4]\n\t" + "ldr r5, [%[a], #8]\n\t" + "ldr r6, [%[a], #12]\n\t" + "ldr r7, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "ldr r9, [%[b], #8]\n\t" + "ldr r10, [%[b], #12]\n\t" + "subs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #0]\n\t" + "str r4, [%[r], #4]\n\t" + "str r5, [%[r], #8]\n\t" + "str r6, [%[r], #12]\n\t" + "ldr r3, [%[a], #16]\n\t" + "ldr r4, [%[a], #20]\n\t" + "ldr r5, [%[a], #24]\n\t" + "ldr r6, [%[a], #28]\n\t" + "ldr r7, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "ldr r9, [%[b], #24]\n\t" + "ldr r10, [%[b], #28]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #16]\n\t" + "str r4, [%[r], #20]\n\t" + "str r5, [%[r], #24]\n\t" + "str r6, [%[r], #28]\n\t" + "ldr r3, [%[a], #32]\n\t" + "ldr r4, [%[a], #36]\n\t" + "ldr r5, [%[a], #40]\n\t" + "ldr r6, [%[a], #44]\n\t" + "ldr r7, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "ldr r9, [%[b], #40]\n\t" + "ldr r10, [%[b], #44]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #32]\n\t" + "str r4, [%[r], #36]\n\t" + "str r5, [%[r], #40]\n\t" + "str r6, [%[r], #44]\n\t" + "ldr r3, [%[a], #48]\n\t" + "ldr r4, [%[a], #52]\n\t" + "ldr r5, [%[a], #56]\n\t" + "ldr r6, [%[a], #60]\n\t" + "ldr r7, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "ldr r9, [%[b], #56]\n\t" + "ldr r10, [%[b], #60]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #48]\n\t" + "str r4, [%[r], #52]\n\t" + "str r5, [%[r], #56]\n\t" + "str r6, [%[r], #60]\n\t" + "ldr r3, [%[a], #64]\n\t" + "ldr r4, [%[a], #68]\n\t" + "ldr r5, [%[a], #72]\n\t" + "ldr r6, [%[a], #76]\n\t" + "ldr r7, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "ldr r9, [%[b], #72]\n\t" + "ldr r10, [%[b], #76]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #64]\n\t" + "str r4, [%[r], #68]\n\t" + "str r5, [%[r], #72]\n\t" + "str r6, [%[r], #76]\n\t" + "ldr r3, [%[a], #80]\n\t" + "ldr r4, [%[a], #84]\n\t" + "ldr r5, [%[a], #88]\n\t" + "ldr r6, [%[a], #92]\n\t" + "ldr r7, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "ldr r9, [%[b], #88]\n\t" + "ldr r10, [%[b], #92]\n\t" + "sbcs r3, r3, r7\n\t" + 
"sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #80]\n\t" + "str r4, [%[r], #84]\n\t" + "str r5, [%[r], #88]\n\t" + "str r6, [%[r], #92]\n\t" + "ldr r3, [%[a], #96]\n\t" + "ldr r4, [%[a], #100]\n\t" + "ldr r5, [%[a], #104]\n\t" + "ldr r6, [%[a], #108]\n\t" + "ldr r7, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "ldr r9, [%[b], #104]\n\t" + "ldr r10, [%[b], #108]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #96]\n\t" + "str r4, [%[r], #100]\n\t" + "str r5, [%[r], #104]\n\t" + "str r6, [%[r], #108]\n\t" + "ldr r3, [%[a], #112]\n\t" + "ldr r4, [%[a], #116]\n\t" + "ldr r5, [%[a], #120]\n\t" + "ldr r6, [%[a], #124]\n\t" + "ldr r7, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "ldr r9, [%[b], #120]\n\t" + "ldr r10, [%[b], #124]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #112]\n\t" + "str r4, [%[r], #116]\n\t" + "str r5, [%[r], #120]\n\t" + "str r6, [%[r], #124]\n\t" + "ldr r3, [%[a], #128]\n\t" + "ldr r4, [%[a], #132]\n\t" + "ldr r5, [%[a], #136]\n\t" + "ldr r6, [%[a], #140]\n\t" + "ldr r7, [%[b], #128]\n\t" + "ldr r8, [%[b], #132]\n\t" + "ldr r9, [%[b], #136]\n\t" + "ldr r10, [%[b], #140]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #128]\n\t" + "str r4, [%[r], #132]\n\t" + "str r5, [%[r], #136]\n\t" + "str r6, [%[r], #140]\n\t" + "ldr r3, [%[a], #144]\n\t" + "ldr r4, [%[a], #148]\n\t" + "ldr r5, [%[a], #152]\n\t" + "ldr r6, [%[a], #156]\n\t" + "ldr r7, [%[b], #144]\n\t" + "ldr r8, [%[b], #148]\n\t" + "ldr r9, [%[b], #152]\n\t" + "ldr r10, [%[b], #156]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #144]\n\t" + "str r4, [%[r], #148]\n\t" + "str r5, [%[r], #152]\n\t" + "str r6, [%[r], #156]\n\t" + "ldr r3, [%[a], #160]\n\t" + "ldr r4, [%[a], #164]\n\t" + "ldr r5, [%[a], #168]\n\t" + "ldr r6, [%[a], #172]\n\t" + "ldr r7, [%[b], #160]\n\t" + "ldr r8, [%[b], #164]\n\t" + "ldr r9, [%[b], #168]\n\t" + "ldr r10, [%[b], #172]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #160]\n\t" + "str r4, [%[r], #164]\n\t" + "str r5, [%[r], #168]\n\t" + "str r6, [%[r], #172]\n\t" + "ldr r3, [%[a], #176]\n\t" + "ldr r4, [%[a], #180]\n\t" + "ldr r5, [%[a], #184]\n\t" + "ldr r6, [%[a], #188]\n\t" + "ldr r7, [%[b], #176]\n\t" + "ldr r8, [%[b], #180]\n\t" + "ldr r9, [%[b], #184]\n\t" + "ldr r10, [%[b], #188]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #176]\n\t" + "str r4, [%[r], #180]\n\t" + "str r5, [%[r], #184]\n\t" + "str r6, [%[r], #188]\n\t" + "ldr r3, [%[a], #192]\n\t" + "ldr r4, [%[a], #196]\n\t" + "ldr r5, [%[a], #200]\n\t" + "ldr r6, [%[a], #204]\n\t" + "ldr r7, [%[b], #192]\n\t" + "ldr r8, [%[b], #196]\n\t" + "ldr r9, [%[b], #200]\n\t" + "ldr r10, [%[b], #204]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #192]\n\t" + "str r4, [%[r], #196]\n\t" + "str r5, [%[r], #200]\n\t" + "str r6, [%[r], #204]\n\t" + "ldr r3, [%[a], #208]\n\t" + "ldr r4, [%[a], #212]\n\t" + "ldr r5, [%[a], #216]\n\t" + "ldr r6, [%[a], #220]\n\t" + "ldr r7, [%[b], #208]\n\t" + "ldr r8, [%[b], #212]\n\t" + "ldr r9, [%[b], #216]\n\t" + "ldr r10, [%[b], #220]\n\t" + "sbcs r3, r3, 
r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #208]\n\t" + "str r4, [%[r], #212]\n\t" + "str r5, [%[r], #216]\n\t" + "str r6, [%[r], #220]\n\t" + "ldr r3, [%[a], #224]\n\t" + "ldr r4, [%[a], #228]\n\t" + "ldr r5, [%[a], #232]\n\t" + "ldr r6, [%[a], #236]\n\t" + "ldr r7, [%[b], #224]\n\t" + "ldr r8, [%[b], #228]\n\t" + "ldr r9, [%[b], #232]\n\t" + "ldr r10, [%[b], #236]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #224]\n\t" + "str r4, [%[r], #228]\n\t" + "str r5, [%[r], #232]\n\t" + "str r6, [%[r], #236]\n\t" + "ldr r3, [%[a], #240]\n\t" + "ldr r4, [%[a], #244]\n\t" + "ldr r5, [%[a], #248]\n\t" + "ldr r6, [%[a], #252]\n\t" + "ldr r7, [%[b], #240]\n\t" + "ldr r8, [%[b], #244]\n\t" + "ldr r9, [%[b], #248]\n\t" + "ldr r10, [%[b], #252]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #240]\n\t" + "str r4, [%[r], #244]\n\t" + "str r5, [%[r], #248]\n\t" + "str r6, [%[r], #252]\n\t" + "ldr r3, [%[a], #256]\n\t" + "ldr r4, [%[a], #260]\n\t" + "ldr r5, [%[a], #264]\n\t" + "ldr r6, [%[a], #268]\n\t" + "ldr r7, [%[b], #256]\n\t" + "ldr r8, [%[b], #260]\n\t" + "ldr r9, [%[b], #264]\n\t" + "ldr r10, [%[b], #268]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #256]\n\t" + "str r4, [%[r], #260]\n\t" + "str r5, [%[r], #264]\n\t" + "str r6, [%[r], #268]\n\t" + "ldr r3, [%[a], #272]\n\t" + "ldr r4, [%[a], #276]\n\t" + "ldr r5, [%[a], #280]\n\t" + "ldr r6, [%[a], #284]\n\t" + "ldr r7, [%[b], #272]\n\t" + "ldr r8, [%[b], #276]\n\t" + "ldr r9, [%[b], #280]\n\t" + "ldr r10, [%[b], #284]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #272]\n\t" + "str r4, [%[r], #276]\n\t" + "str r5, [%[r], #280]\n\t" + "str r6, [%[r], #284]\n\t" + "ldr r3, [%[a], #288]\n\t" + "ldr r4, [%[a], #292]\n\t" + "ldr r5, [%[a], #296]\n\t" + "ldr r6, [%[a], #300]\n\t" + "ldr r7, [%[b], #288]\n\t" + "ldr r8, [%[b], #292]\n\t" + "ldr r9, [%[b], #296]\n\t" + "ldr r10, [%[b], #300]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #288]\n\t" + "str r4, [%[r], #292]\n\t" + "str r5, [%[r], #296]\n\t" + "str r6, [%[r], #300]\n\t" + "ldr r3, [%[a], #304]\n\t" + "ldr r4, [%[a], #308]\n\t" + "ldr r5, [%[a], #312]\n\t" + "ldr r6, [%[a], #316]\n\t" + "ldr r7, [%[b], #304]\n\t" + "ldr r8, [%[b], #308]\n\t" + "ldr r9, [%[b], #312]\n\t" + "ldr r10, [%[b], #316]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #304]\n\t" + "str r4, [%[r], #308]\n\t" + "str r5, [%[r], #312]\n\t" + "str r6, [%[r], #316]\n\t" + "ldr r3, [%[a], #320]\n\t" + "ldr r4, [%[a], #324]\n\t" + "ldr r5, [%[a], #328]\n\t" + "ldr r6, [%[a], #332]\n\t" + "ldr r7, [%[b], #320]\n\t" + "ldr r8, [%[b], #324]\n\t" + "ldr r9, [%[b], #328]\n\t" + "ldr r10, [%[b], #332]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #320]\n\t" + "str r4, [%[r], #324]\n\t" + "str r5, [%[r], #328]\n\t" + "str r6, [%[r], #332]\n\t" + "ldr r3, [%[a], #336]\n\t" + "ldr r4, [%[a], #340]\n\t" + "ldr r5, [%[a], #344]\n\t" + "ldr r6, [%[a], #348]\n\t" + "ldr r7, [%[b], #336]\n\t" + "ldr r8, [%[b], #340]\n\t" + "ldr r9, [%[b], #344]\n\t" + "ldr r10, [%[b], #348]\n\t" + 
"sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #336]\n\t" + "str r4, [%[r], #340]\n\t" + "str r5, [%[r], #344]\n\t" + "str r6, [%[r], #348]\n\t" + "ldr r3, [%[a], #352]\n\t" + "ldr r4, [%[a], #356]\n\t" + "ldr r5, [%[a], #360]\n\t" + "ldr r6, [%[a], #364]\n\t" + "ldr r7, [%[b], #352]\n\t" + "ldr r8, [%[b], #356]\n\t" + "ldr r9, [%[b], #360]\n\t" + "ldr r10, [%[b], #364]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #352]\n\t" + "str r4, [%[r], #356]\n\t" + "str r5, [%[r], #360]\n\t" + "str r6, [%[r], #364]\n\t" + "ldr r3, [%[a], #368]\n\t" + "ldr r4, [%[a], #372]\n\t" + "ldr r5, [%[a], #376]\n\t" + "ldr r6, [%[a], #380]\n\t" + "ldr r7, [%[b], #368]\n\t" + "ldr r8, [%[b], #372]\n\t" + "ldr r9, [%[b], #376]\n\t" + "ldr r10, [%[b], #380]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #368]\n\t" + "str r4, [%[r], #372]\n\t" + "str r5, [%[r], #376]\n\t" + "str r6, [%[r], #380]\n\t" + "ldr r3, [%[a], #384]\n\t" + "ldr r4, [%[a], #388]\n\t" + "ldr r5, [%[a], #392]\n\t" + "ldr r6, [%[a], #396]\n\t" + "ldr r7, [%[b], #384]\n\t" + "ldr r8, [%[b], #388]\n\t" + "ldr r9, [%[b], #392]\n\t" + "ldr r10, [%[b], #396]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #384]\n\t" + "str r4, [%[r], #388]\n\t" + "str r5, [%[r], #392]\n\t" + "str r6, [%[r], #396]\n\t" + "ldr r3, [%[a], #400]\n\t" + "ldr r4, [%[a], #404]\n\t" + "ldr r5, [%[a], #408]\n\t" + "ldr r6, [%[a], #412]\n\t" + "ldr r7, [%[b], #400]\n\t" + "ldr r8, [%[b], #404]\n\t" + "ldr r9, [%[b], #408]\n\t" + "ldr r10, [%[b], #412]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #400]\n\t" + "str r4, [%[r], #404]\n\t" + "str r5, [%[r], #408]\n\t" + "str r6, [%[r], #412]\n\t" + "ldr r3, [%[a], #416]\n\t" + "ldr r4, [%[a], #420]\n\t" + "ldr r5, [%[a], #424]\n\t" + "ldr r6, [%[a], #428]\n\t" + "ldr r7, [%[b], #416]\n\t" + "ldr r8, [%[b], #420]\n\t" + "ldr r9, [%[b], #424]\n\t" + "ldr r10, [%[b], #428]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #416]\n\t" + "str r4, [%[r], #420]\n\t" + "str r5, [%[r], #424]\n\t" + "str r6, [%[r], #428]\n\t" + "ldr r3, [%[a], #432]\n\t" + "ldr r4, [%[a], #436]\n\t" + "ldr r5, [%[a], #440]\n\t" + "ldr r6, [%[a], #444]\n\t" + "ldr r7, [%[b], #432]\n\t" + "ldr r8, [%[b], #436]\n\t" + "ldr r9, [%[b], #440]\n\t" + "ldr r10, [%[b], #444]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #432]\n\t" + "str r4, [%[r], #436]\n\t" + "str r5, [%[r], #440]\n\t" + "str r6, [%[r], #444]\n\t" + "ldr r3, [%[a], #448]\n\t" + "ldr r4, [%[a], #452]\n\t" + "ldr r5, [%[a], #456]\n\t" + "ldr r6, [%[a], #460]\n\t" + "ldr r7, [%[b], #448]\n\t" + "ldr r8, [%[b], #452]\n\t" + "ldr r9, [%[b], #456]\n\t" + "ldr r10, [%[b], #460]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #448]\n\t" + "str r4, [%[r], #452]\n\t" + "str r5, [%[r], #456]\n\t" + "str r6, [%[r], #460]\n\t" + "ldr r3, [%[a], #464]\n\t" + "ldr r4, [%[a], #468]\n\t" + "ldr r5, [%[a], #472]\n\t" + "ldr r6, [%[a], #476]\n\t" + "ldr r7, [%[b], #464]\n\t" + "ldr r8, [%[b], #468]\n\t" + "ldr r9, [%[b], #472]\n\t" + "ldr r10, [%[b], 
#476]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #464]\n\t" + "str r4, [%[r], #468]\n\t" + "str r5, [%[r], #472]\n\t" + "str r6, [%[r], #476]\n\t" + "ldr r3, [%[a], #480]\n\t" + "ldr r4, [%[a], #484]\n\t" + "ldr r5, [%[a], #488]\n\t" + "ldr r6, [%[a], #492]\n\t" + "ldr r7, [%[b], #480]\n\t" + "ldr r8, [%[b], #484]\n\t" + "ldr r9, [%[b], #488]\n\t" + "ldr r10, [%[b], #492]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #480]\n\t" + "str r4, [%[r], #484]\n\t" + "str r5, [%[r], #488]\n\t" + "str r6, [%[r], #492]\n\t" + "ldr r3, [%[a], #496]\n\t" + "ldr r4, [%[a], #500]\n\t" + "ldr r5, [%[a], #504]\n\t" + "ldr r6, [%[a], #508]\n\t" + "ldr r7, [%[b], #496]\n\t" + "ldr r8, [%[b], #500]\n\t" + "ldr r9, [%[b], #504]\n\t" + "ldr r10, [%[b], #508]\n\t" + "sbcs r3, r3, r7\n\t" + "sbcs r4, r4, r8\n\t" + "sbcs r5, r5, r9\n\t" + "sbcs r6, r6, r10\n\t" + "str r3, [%[r], #496]\n\t" + "str r4, [%[r], #500]\n\t" + "str r5, [%[r], #504]\n\t" + "str r6, [%[r], #508]\n\t" + "sbc %[c], %[c], #0\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -24657,6 +26032,69 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[256], t2[129]; + sp_digit div, r1; + int i; + + (void)m; + + + div = d[127]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); + for (i=127; i>=0; i--) { + sp_digit hi = t1[128 + i] - (t1[128 + i] == div); + r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + + sp_4096_mul_d_128(t2, d, r1); + t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); + t1[128 + i] -= t2[128]; + if (t1[128 + i] != 0) { + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + if (t1[128 + i] != 0) + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + } + } + + for (i = 127; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_4096_sub_128(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 128); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_128_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. 
@@ -26199,59 +27637,6 @@ static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_di } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[256], t2[129]; - sp_digit div, r1; - int i; - - (void)m; - - - div = d[127]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); - - sp_4096_mul_d_128(t2, d, r1); - t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); - t1[128 + i] -= t2[128]; - if (t1[128 + i] != 0) { - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - if (t1[128 + i] != 0) - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - } - } - - r1 = sp_4096_cmp_128(t1, d) >= 0; - sp_4096_cond_sub_128(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_4096_div_128_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -26528,6 +27913,7 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 57b9a25f7..ae93bfdd9 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -215,12 +215,14 @@ static void sp_2048_to_bin_32(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. */ #define sp_2048_norm_32(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -2402,108 +2404,6 @@ static void sp_2048_mont_norm_16(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_16(r, m); } -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. 
- */ -static sp_digit sp_2048_cond_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov x8, #0\n\t" - "1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldr x4, [%[a], x8]\n\t" - "ldr x5, [%[b], x8]\n\t" - "and x5, x5, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "csetm %[c], cc\n\t" - "str x4, [%[r], x8]\n\t" - "add x8, x8, #8\n\t" - "cmp x8, 128\n\t" - "b.lt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ -} - /* Reduce the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -2767,6 +2667,108 @@ static void sp_2048_mont_sqr_16(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_16(r, m, mp); } +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_2048_cond_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov x8, #0\n\t" + "1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldr x4, [%[a], x8]\n\t" + "ldr x5, [%[b], x8]\n\t" + "and x5, x5, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "csetm %[c], cc\n\t" + "str x4, [%[r], x8]\n\t" + "add x8, x8, #8\n\t" + "cmp x8, 128\n\t" + "b.lt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return c; +#else + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x11, x12, [%[b], 48]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x5, x7, [%[b], 64]\n\t" + "ldp x11, x12, [%[b], 80]\n\t" + "ldp x4, x6, [%[a], 64]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 64]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x5, x7, [%[b], 96]\n\t" + "ldp x11, x12, [%[b], 112]\n\t" + "ldp x4, x6, [%[a], 96]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 96]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +#endif /* WOLFSSL_SP_SMALL */ +} + /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. @@ -3514,7 +3516,7 @@ static int sp_2048_mod_exp_16(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -3529,165 +3531,7 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_32(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. 
- */ -static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov x8, #0\n\t" - "1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldr x4, [%[a], x8]\n\t" - "ldr x5, [%[b], x8]\n\t" - "and x5, x5, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "csetm %[c], cc\n\t" - "str x4, [%[r], x8]\n\t" - "add x8, x8, #8\n\t" - "cmp x8, 256\n\t" - "b.lt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 128]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "ldp x5, x7, [%[b], 192]\n\t" - "ldp x11, x12, [%[b], 208]\n\t" - "ldp x4, x6, [%[a], 192]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 208]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 192]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 208]\n\t" - "ldp x5, x7, [%[b], 224]\n\t" - "ldp x11, x12, [%[b], 240]\n\t" - "ldp x4, x6, [%[a], 224]\n\t" - "and x5, x5, %[m]\n\t" - "ldp 
x9, x10, [%[a], 240]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 224]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 240]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ -} - +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Reduce the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -4167,6 +4011,144 @@ static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_32(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add x11, %[a], 256\n\t" + "\n1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldp x3, x4, [%[a]], #16\n\t" + "ldp x5, x6, [%[a]], #16\n\t" + "ldp x7, x8, [%[b]], #16\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x9, x10, [%[b]], #16\n\t" + "sbcs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r]], #16\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r]], #16\n\t" + "csetm %[c], cc\n\t" + "cmp %[a], x11\n\t" + "b.ne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 48]\n\t" + "ldp x3, x4, [%[a], 64]\n\t" + "ldp x7, x8, [%[b], 64]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 80]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 64]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 80]\n\t" + "ldp x3, x4, [%[a], 96]\n\t" + "ldp x7, x8, [%[b], 96]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 112]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 96]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 112]\n\t" + "ldp x3, x4, [%[a], 128]\n\t" + "ldp x7, x8, [%[b], 128]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 144]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 144]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 128]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 144]\n\t" + "ldp x3, x4, [%[a], 160]\n\t" + "ldp x7, x8, [%[b], 160]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 176]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 176]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 160]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 176]\n\t" + "ldp x3, x4, [%[a], 192]\n\t" + "ldp x7, x8, [%[b], 192]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 208]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 208]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 192]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 208]\n\t" + "ldp x3, x4, [%[a], 224]\n\t" + "ldp x7, x8, [%[b], 224]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 240]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 240]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 224]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 240]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return (sp_digit)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -4226,6 +4208,225 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. 
+ */ +static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[64], t2[33]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[31]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); + for (i=31; i>=0; i--) { + sp_digit hi = t1[32 + i] - (t1[32 + i] == div); + r1 = div_2048_word_32(hi, t1[32 + i - 1], div); + + sp_2048_mul_d_32(t2, d, r1); + t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2); + t1[32 + i] -= t2[32]; + if (t1[32 + i] != 0) { + t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); + if (t1[32 + i] != 0) + t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); + } + } + + for (i = 31; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_2048_sub_32(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 32); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_mod_32_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_32_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov x8, #0\n\t" + "1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldr x4, [%[a], x8]\n\t" + "ldr x5, [%[b], x8]\n\t" + "and x5, x5, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "csetm %[c], cc\n\t" + "str x4, [%[r], x8]\n\t" + "add x8, x8, #8\n\t" + "cmp x8, 256\n\t" + "b.lt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return c; +#else + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x11, x12, [%[b], 48]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x5, x7, [%[b], 64]\n\t" + "ldp x11, x12, [%[b], 80]\n\t" + "ldp x4, x6, [%[a], 64]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 64]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x5, x7, [%[b], 96]\n\t" + "ldp x11, x12, [%[b], 112]\n\t" + "ldp 
x4, x6, [%[a], 96]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 96]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "ldp x5, x7, [%[b], 128]\n\t" + "ldp x11, x12, [%[b], 144]\n\t" + "ldp x4, x6, [%[a], 128]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 128]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 144]\n\t" + "ldp x5, x7, [%[b], 160]\n\t" + "ldp x11, x12, [%[b], 176]\n\t" + "ldp x4, x6, [%[a], 160]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 160]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 176]\n\t" + "ldp x5, x7, [%[b], 192]\n\t" + "ldp x11, x12, [%[b], 208]\n\t" + "ldp x4, x6, [%[a], 192]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 192]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 208]\n\t" + "ldp x5, x7, [%[b], 224]\n\t" + "ldp x11, x12, [%[b], 240]\n\t" + "ldp x4, x6, [%[a], 224]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 240]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 224]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 240]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +#endif /* WOLFSSL_SP_SMALL */ +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -4577,204 +4778,6 @@ static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, const sp_dig return sp_2048_div_32(a, m, NULL, r); } -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 256\n\t" - "\n1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "ldp x7, x8, [%[b]], #16\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x9, x10, [%[b]], #16\n\t" - "sbcs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "csetm %[c], cc\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" - ); - - return c; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "subs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 16]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 48]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 80]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 112]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 144]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 176]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x3, x4, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 208]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 192]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 208]\n\t" - "ldp x3, x4, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 240]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 224]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 240]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" - ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. 
- */ -static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[64], t2[33]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[31]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i=31; i>=0; i--) { - sp_digit hi = t1[32 + i] - (t1[32 + i] == div); - r1 = div_2048_word_32(hi, t1[32 + i - 1], div); - - sp_2048_mul_d_32(t2, d, r1); - t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2); - t1[32 + i] -= t2[32]; - if (t1[32 + i] != 0) { - t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); - if (t1[32 + i] != 0) - t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); - } - } - - for (i = 31; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_2048_sub_32(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 32); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_mod_32_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_2048_div_32_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -5051,6 +5054,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -6189,12 +6193,14 @@ static void sp_3072_to_bin_48(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. */ #define sp_3072_norm_48(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -9709,136 +9715,6 @@ static void sp_3072_mont_norm_24(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_24(r, m); } -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. 
- */ -static sp_digit sp_3072_cond_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov x8, #0\n\t" - "1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldr x4, [%[a], x8]\n\t" - "ldr x5, [%[b], x8]\n\t" - "and x5, x5, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "csetm %[c], cc\n\t" - "str x4, [%[r], x8]\n\t" - "add x8, x8, #8\n\t" - "cmp x8, 192\n\t" - "b.lt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 128]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ -} - /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -10210,6 +10086,136 @@ static void sp_3072_mont_sqr_24(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_24(r, m, mp); } +/* Conditionally subtract b from a using the mask m. 
+ * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static sp_digit sp_3072_cond_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov x8, #0\n\t" + "1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldr x4, [%[a], x8]\n\t" + "ldr x5, [%[b], x8]\n\t" + "and x5, x5, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "csetm %[c], cc\n\t" + "str x4, [%[r], x8]\n\t" + "add x8, x8, #8\n\t" + "cmp x8, 192\n\t" + "b.lt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return c; +#else + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x11, x12, [%[b], 48]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x5, x7, [%[b], 64]\n\t" + "ldp x11, x12, [%[b], 80]\n\t" + "ldp x4, x6, [%[a], 64]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 64]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x5, x7, [%[b], 96]\n\t" + "ldp x11, x12, [%[b], 112]\n\t" + "ldp x4, x6, [%[a], 96]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 96]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "ldp x5, x7, [%[b], 128]\n\t" + "ldp x11, x12, [%[b], 144]\n\t" + "ldp x4, x6, [%[a], 128]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 128]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 144]\n\t" + "ldp x5, x7, [%[b], 160]\n\t" + "ldp x11, x12, [%[b], 176]\n\t" + "ldp x4, x6, [%[a], 160]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 160]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 176]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +#endif /* WOLFSSL_SP_SMALL */ +} + /* Mul a by digit b into r. 
(r = a * b) * * r A single precision integer. @@ -11081,7 +11087,7 @@ static int sp_3072_mod_exp_24(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -11096,221 +11102,7 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_48(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. - */ -static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov x8, #0\n\t" - "1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldr x4, [%[a], x8]\n\t" - "ldr x5, [%[b], x8]\n\t" - "and x5, x5, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "csetm %[c], cc\n\t" - "str x4, [%[r], x8]\n\t" - "add x8, x8, #8\n\t" - "cmp x8, 384\n\t" - "b.lt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 
128]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "ldp x5, x7, [%[b], 192]\n\t" - "ldp x11, x12, [%[b], 208]\n\t" - "ldp x4, x6, [%[a], 192]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 208]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 192]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 208]\n\t" - "ldp x5, x7, [%[b], 224]\n\t" - "ldp x11, x12, [%[b], 240]\n\t" - "ldp x4, x6, [%[a], 224]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 240]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 224]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 240]\n\t" - "ldp x5, x7, [%[b], 256]\n\t" - "ldp x11, x12, [%[b], 272]\n\t" - "ldp x4, x6, [%[a], 256]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 272]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 256]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 272]\n\t" - "ldp x5, x7, [%[b], 288]\n\t" - "ldp x11, x12, [%[b], 304]\n\t" - "ldp x4, x6, [%[a], 288]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 304]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 288]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 304]\n\t" - "ldp x5, x7, [%[b], 320]\n\t" - "ldp x11, x12, [%[b], 336]\n\t" - "ldp x4, x6, [%[a], 320]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 336]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 320]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 336]\n\t" - "ldp x5, x7, [%[b], 352]\n\t" - "ldp x11, x12, [%[b], 368]\n\t" - "ldp x4, x6, [%[a], 352]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 368]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 352]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 368]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ -} - +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -12006,6 +11798,184 @@ static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_48(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. 
+ * b A single precision integer. + */ +static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add x11, %[a], 384\n\t" + "\n1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldp x3, x4, [%[a]], #16\n\t" + "ldp x5, x6, [%[a]], #16\n\t" + "ldp x7, x8, [%[b]], #16\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x9, x10, [%[b]], #16\n\t" + "sbcs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r]], #16\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r]], #16\n\t" + "csetm %[c], cc\n\t" + "cmp %[a], x11\n\t" + "b.ne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 48]\n\t" + "ldp x3, x4, [%[a], 64]\n\t" + "ldp x7, x8, [%[b], 64]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 80]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 64]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 80]\n\t" + "ldp x3, x4, [%[a], 96]\n\t" + "ldp x7, x8, [%[b], 96]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 112]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 96]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 112]\n\t" + "ldp x3, x4, [%[a], 128]\n\t" + "ldp x7, x8, [%[b], 128]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 144]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 144]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 128]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 144]\n\t" + "ldp x3, x4, [%[a], 160]\n\t" + "ldp x7, x8, [%[b], 160]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 176]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 176]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 160]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 176]\n\t" + "ldp x3, x4, [%[a], 192]\n\t" + "ldp x7, x8, [%[b], 192]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 208]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 208]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 192]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 208]\n\t" + "ldp x3, x4, [%[a], 224]\n\t" + "ldp x7, x8, [%[b], 224]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 240]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 240]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 224]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 240]\n\t" + "ldp x3, x4, [%[a], 256]\n\t" + "ldp x7, x8, [%[b], 256]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 272]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 272]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 256]\n\t" + "sbcs 
x6, x6, x10\n\t" + "stp x5, x6, [%[r], 272]\n\t" + "ldp x3, x4, [%[a], 288]\n\t" + "ldp x7, x8, [%[b], 288]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 304]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 304]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 288]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 304]\n\t" + "ldp x3, x4, [%[a], 320]\n\t" + "ldp x7, x8, [%[b], 320]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 336]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 336]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 320]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 336]\n\t" + "ldp x3, x4, [%[a], 352]\n\t" + "ldp x7, x8, [%[b], 352]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 368]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 368]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 352]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 368]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return (sp_digit)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -12065,6 +12035,281 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[96], t2[49]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[47]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); + for (i=47; i>=0; i--) { + sp_digit hi = t1[48 + i] - (t1[48 + i] == div); + r1 = div_3072_word_48(hi, t1[48 + i - 1], div); + + sp_3072_mul_d_48(t2, d, r1); + t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2); + t1[48 + i] -= t2[48]; + if (t1[48 + i] != 0) { + t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); + if (t1[48 + i] != 0) + t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); + } + } + + for (i = 47; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_3072_sub_48(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 48); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_mod_48_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_48_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov x8, #0\n\t" + "1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldr x4, [%[a], x8]\n\t" + "ldr x5, [%[b], x8]\n\t" + "and x5, x5, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "csetm %[c], cc\n\t" + "str x4, [%[r], x8]\n\t" + "add x8, x8, #8\n\t" + "cmp x8, 384\n\t" + "b.lt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return c; +#else + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x11, x12, [%[b], 48]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x5, x7, [%[b], 64]\n\t" + "ldp x11, x12, [%[b], 80]\n\t" + "ldp x4, x6, [%[a], 64]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 64]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x5, x7, [%[b], 96]\n\t" + "ldp x11, x12, [%[b], 112]\n\t" + "ldp x4, x6, [%[a], 96]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 96]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "ldp x5, x7, [%[b], 128]\n\t" + "ldp x11, x12, [%[b], 144]\n\t" + "ldp x4, x6, [%[a], 128]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 128]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 144]\n\t" + "ldp x5, x7, [%[b], 160]\n\t" + "ldp x11, x12, [%[b], 176]\n\t" + "ldp x4, x6, [%[a], 160]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 160]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 176]\n\t" + "ldp x5, x7, [%[b], 192]\n\t" + "ldp x11, x12, [%[b], 208]\n\t" + "ldp x4, x6, [%[a], 192]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 192]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 208]\n\t" + "ldp x5, x7, [%[b], 224]\n\t" + "ldp x11, x12, [%[b], 240]\n\t" + "ldp x4, x6, [%[a], 224]\n\t" + "and x5, x5, %[m]\n\t" + "ldp 
x9, x10, [%[a], 240]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 224]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 240]\n\t" + "ldp x5, x7, [%[b], 256]\n\t" + "ldp x11, x12, [%[b], 272]\n\t" + "ldp x4, x6, [%[a], 256]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 272]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 256]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 272]\n\t" + "ldp x5, x7, [%[b], 288]\n\t" + "ldp x11, x12, [%[b], 304]\n\t" + "ldp x4, x6, [%[a], 288]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 304]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 288]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 304]\n\t" + "ldp x5, x7, [%[b], 320]\n\t" + "ldp x11, x12, [%[b], 336]\n\t" + "ldp x4, x6, [%[a], 320]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 336]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 320]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 336]\n\t" + "ldp x5, x7, [%[b], 352]\n\t" + "ldp x11, x12, [%[b], 368]\n\t" + "ldp x4, x6, [%[a], 352]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 368]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 352]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 368]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +#endif /* WOLFSSL_SP_SMALL */ +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -12528,244 +12773,6 @@ static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, const sp_dig return sp_3072_div_48(a, m, NULL, r); } -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 384\n\t" - "\n1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "ldp x7, x8, [%[b]], #16\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x9, x10, [%[b]], #16\n\t" - "sbcs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "csetm %[c], cc\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" - ); - - return c; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "subs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 16]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 48]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 80]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 112]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 144]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 176]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x3, x4, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 208]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 192]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 208]\n\t" - "ldp x3, x4, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 240]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 224]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 240]\n\t" - "ldp x3, x4, [%[a], 256]\n\t" - "ldp x7, x8, [%[b], 256]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 272]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 272]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 256]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 272]\n\t" - "ldp x3, x4, [%[a], 288]\n\t" - "ldp x7, x8, [%[b], 288]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 304]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 304]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 288]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 304]\n\t" - "ldp x3, x4, [%[a], 320]\n\t" - "ldp x7, x8, [%[b], 320]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 336]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 336]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 320]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 336]\n\t" - "ldp x3, x4, [%[a], 352]\n\t" - "ldp x7, x8, [%[b], 352]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 368]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 368]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 352]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, 
x6, [%[r], 368]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" - ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[96], t2[49]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[47]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i=47; i>=0; i--) { - sp_digit hi = t1[48 + i] - (t1[48 + i] == div); - r1 = div_3072_word_48(hi, t1[48 + i - 1], div); - - sp_3072_mul_d_48(t2, d, r1); - t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2); - t1[48 + i] -= t2[48]; - if (t1[48 + i] != 0) { - t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); - if (t1[48 + i] != 0) - t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); - } - } - - for (i = 47; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_3072_sub_48(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 48); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_mod_48_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_3072_div_48_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -13042,6 +13049,7 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -14304,12 +14312,14 @@ static void sp_4096_to_bin_64(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. */ #define sp_4096_norm_64(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -15833,7 +15843,7 @@ static void sp_4096_mul_d_64(sp_digit* r, const sp_digit* a, #endif } -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * @@ -15848,277 +15858,7 @@ static void sp_4096_mont_norm_64(sp_digit* r, const sp_digit* m) sp_4096_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. 
- * b A single precision number to subtract. - * m Mask value to apply. - */ -static sp_digit sp_4096_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ -#ifdef WOLFSSL_SP_SMALL - sp_digit c = 0; - - __asm__ __volatile__ ( - "mov x8, #0\n\t" - "1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldr x4, [%[a], x8]\n\t" - "ldr x5, [%[b], x8]\n\t" - "and x5, x5, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "csetm %[c], cc\n\t" - "str x4, [%[r], x8]\n\t" - "add x8, x8, #8\n\t" - "cmp x8, 512\n\t" - "b.lt 1b\n\t" - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return c; -#else - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x11, x12, [%[b], 48]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 48]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 48]\n\t" - "ldp x5, x7, [%[b], 64]\n\t" - "ldp x11, x12, [%[b], 80]\n\t" - "ldp x4, x6, [%[a], 64]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 80]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 64]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 80]\n\t" - "ldp x5, x7, [%[b], 96]\n\t" - "ldp x11, x12, [%[b], 112]\n\t" - "ldp x4, x6, [%[a], 96]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 112]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 96]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 112]\n\t" - "ldp x5, x7, [%[b], 128]\n\t" - "ldp x11, x12, [%[b], 144]\n\t" - "ldp x4, x6, [%[a], 128]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 144]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 128]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 144]\n\t" - "ldp x5, x7, [%[b], 160]\n\t" - "ldp x11, x12, [%[b], 176]\n\t" - "ldp x4, x6, [%[a], 160]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 176]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 160]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 176]\n\t" - "ldp x5, x7, [%[b], 192]\n\t" - "ldp x11, x12, [%[b], 208]\n\t" - "ldp x4, x6, [%[a], 192]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 208]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 192]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 208]\n\t" - "ldp x5, x7, [%[b], 224]\n\t" - "ldp x11, x12, [%[b], 
240]\n\t" - "ldp x4, x6, [%[a], 224]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 240]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 224]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 240]\n\t" - "ldp x5, x7, [%[b], 256]\n\t" - "ldp x11, x12, [%[b], 272]\n\t" - "ldp x4, x6, [%[a], 256]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 272]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 256]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 272]\n\t" - "ldp x5, x7, [%[b], 288]\n\t" - "ldp x11, x12, [%[b], 304]\n\t" - "ldp x4, x6, [%[a], 288]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 304]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 288]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 304]\n\t" - "ldp x5, x7, [%[b], 320]\n\t" - "ldp x11, x12, [%[b], 336]\n\t" - "ldp x4, x6, [%[a], 320]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 336]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 320]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 336]\n\t" - "ldp x5, x7, [%[b], 352]\n\t" - "ldp x11, x12, [%[b], 368]\n\t" - "ldp x4, x6, [%[a], 352]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 368]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 352]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 368]\n\t" - "ldp x5, x7, [%[b], 384]\n\t" - "ldp x11, x12, [%[b], 400]\n\t" - "ldp x4, x6, [%[a], 384]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 400]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 384]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 400]\n\t" - "ldp x5, x7, [%[b], 416]\n\t" - "ldp x11, x12, [%[b], 432]\n\t" - "ldp x4, x6, [%[a], 416]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 432]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 416]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 432]\n\t" - "ldp x5, x7, [%[b], 448]\n\t" - "ldp x11, x12, [%[b], 464]\n\t" - "ldp x4, x6, [%[a], 448]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 464]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 448]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 464]\n\t" - "ldp x5, x7, [%[b], 480]\n\t" - "ldp x11, x12, [%[b], 496]\n\t" - "ldp x4, x6, [%[a], 480]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 496]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 480]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 496]\n\t" - "csetm 
%[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -#endif /* WOLFSSL_SP_SMALL */ -} - +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Reduce the number back to 4096 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -17030,6 +16770,224 @@ static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, sp_4096_mont_reduce_64(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "add x11, %[a], 512\n\t" + "\n1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldp x3, x4, [%[a]], #16\n\t" + "ldp x5, x6, [%[a]], #16\n\t" + "ldp x7, x8, [%[b]], #16\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x9, x10, [%[b]], #16\n\t" + "sbcs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r]], #16\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r]], #16\n\t" + "csetm %[c], cc\n\t" + "cmp %[a], x11\n\t" + "b.ne 1b\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "ldp x3, x4, [%[a], 0]\n\t" + "ldp x7, x8, [%[b], 0]\n\t" + "subs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 16]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 16]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 0]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 16]\n\t" + "ldp x3, x4, [%[a], 32]\n\t" + "ldp x7, x8, [%[b], 32]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 48]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 48]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 32]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 48]\n\t" + "ldp x3, x4, [%[a], 64]\n\t" + "ldp x7, x8, [%[b], 64]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 80]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 80]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 64]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 80]\n\t" + "ldp x3, x4, [%[a], 96]\n\t" + "ldp x7, x8, [%[b], 96]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 112]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 112]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 96]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 112]\n\t" + "ldp x3, x4, [%[a], 128]\n\t" + "ldp x7, x8, [%[b], 128]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 144]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 144]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 128]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 144]\n\t" + "ldp x3, x4, [%[a], 160]\n\t" + "ldp x7, x8, [%[b], 160]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 176]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 176]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 160]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 176]\n\t" + "ldp x3, x4, [%[a], 192]\n\t" + "ldp x7, x8, [%[b], 192]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 208]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp 
x9, x10, [%[b], 208]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 192]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 208]\n\t" + "ldp x3, x4, [%[a], 224]\n\t" + "ldp x7, x8, [%[b], 224]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 240]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 240]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 224]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 240]\n\t" + "ldp x3, x4, [%[a], 256]\n\t" + "ldp x7, x8, [%[b], 256]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 272]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 272]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 256]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 272]\n\t" + "ldp x3, x4, [%[a], 288]\n\t" + "ldp x7, x8, [%[b], 288]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 304]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 304]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 288]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 304]\n\t" + "ldp x3, x4, [%[a], 320]\n\t" + "ldp x7, x8, [%[b], 320]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 336]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 336]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 320]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 336]\n\t" + "ldp x3, x4, [%[a], 352]\n\t" + "ldp x7, x8, [%[b], 352]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 368]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 368]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 352]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 368]\n\t" + "ldp x3, x4, [%[a], 384]\n\t" + "ldp x7, x8, [%[b], 384]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 400]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 400]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 384]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 400]\n\t" + "ldp x3, x4, [%[a], 416]\n\t" + "ldp x7, x8, [%[b], 416]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 432]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 432]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 416]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 432]\n\t" + "ldp x3, x4, [%[a], 448]\n\t" + "ldp x7, x8, [%[b], 448]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 464]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 464]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 448]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 464]\n\t" + "ldp x3, x4, [%[a], 480]\n\t" + "ldp x7, x8, [%[b], 480]\n\t" + "sbcs x3, x3, x7\n\t" + "ldp x5, x6, [%[a], 496]\n\t" + "sbcs x4, x4, x8\n\t" + "ldp x9, x10, [%[b], 496]\n\t" + "sbcs x5, x5, x9\n\t" + "stp x3, x4, [%[r], 480]\n\t" + "sbcs x6, x6, x10\n\t" + "stp x5, x6, [%[r], 496]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" + ); + + return (sp_digit)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -17089,6 +17047,337 @@ static sp_digit div_4096_word_64(sp_digit d1, sp_digit d0, sp_digit div) return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. 
+ */ +static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[128], t2[65]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[63]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); + for (i=63; i>=0; i--) { + sp_digit hi = t1[64 + i] - (t1[64 + i] == div); + r1 = div_4096_word_64(hi, t1[64 + i - 1], div); + + sp_4096_mul_d_64(t2, d, r1); + t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2); + t1[64 + i] -= t2[64]; + if (t1[64 + i] != 0) { + t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); + if (t1[64 + i] != 0) + t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); + } + } + + for (i = 63; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_4096_sub_64(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 64); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_64_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static sp_digit sp_4096_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ +#ifdef WOLFSSL_SP_SMALL + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov x8, #0\n\t" + "1:\n\t" + "subs %[c], xzr, %[c]\n\t" + "ldr x4, [%[a], x8]\n\t" + "ldr x5, [%[b], x8]\n\t" + "and x5, x5, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "csetm %[c], cc\n\t" + "str x4, [%[r], x8]\n\t" + "add x8, x8, #8\n\t" + "cmp x8, 512\n\t" + "b.lt 1b\n\t" + : [c] "+r" (c) + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return c; +#else + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x11, x12, [%[b], 48]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 48]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 48]\n\t" + "ldp x5, x7, [%[b], 64]\n\t" + "ldp x11, x12, [%[b], 80]\n\t" + "ldp x4, x6, [%[a], 64]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 80]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 64]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 80]\n\t" + "ldp x5, x7, [%[b], 96]\n\t" + "ldp x11, x12, [%[b], 112]\n\t" + "ldp 
x4, x6, [%[a], 96]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 112]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 96]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 112]\n\t" + "ldp x5, x7, [%[b], 128]\n\t" + "ldp x11, x12, [%[b], 144]\n\t" + "ldp x4, x6, [%[a], 128]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 144]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 128]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 144]\n\t" + "ldp x5, x7, [%[b], 160]\n\t" + "ldp x11, x12, [%[b], 176]\n\t" + "ldp x4, x6, [%[a], 160]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 176]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 160]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 176]\n\t" + "ldp x5, x7, [%[b], 192]\n\t" + "ldp x11, x12, [%[b], 208]\n\t" + "ldp x4, x6, [%[a], 192]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 208]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 192]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 208]\n\t" + "ldp x5, x7, [%[b], 224]\n\t" + "ldp x11, x12, [%[b], 240]\n\t" + "ldp x4, x6, [%[a], 224]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 240]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 224]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 240]\n\t" + "ldp x5, x7, [%[b], 256]\n\t" + "ldp x11, x12, [%[b], 272]\n\t" + "ldp x4, x6, [%[a], 256]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 272]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 256]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 272]\n\t" + "ldp x5, x7, [%[b], 288]\n\t" + "ldp x11, x12, [%[b], 304]\n\t" + "ldp x4, x6, [%[a], 288]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 304]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 288]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 304]\n\t" + "ldp x5, x7, [%[b], 320]\n\t" + "ldp x11, x12, [%[b], 336]\n\t" + "ldp x4, x6, [%[a], 320]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 336]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 320]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 336]\n\t" + "ldp x5, x7, [%[b], 352]\n\t" + "ldp x11, x12, [%[b], 368]\n\t" + "ldp x4, x6, [%[a], 352]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 368]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 352]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 368]\n\t" + "ldp x5, x7, [%[b], 
384]\n\t" + "ldp x11, x12, [%[b], 400]\n\t" + "ldp x4, x6, [%[a], 384]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 400]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 384]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 400]\n\t" + "ldp x5, x7, [%[b], 416]\n\t" + "ldp x11, x12, [%[b], 432]\n\t" + "ldp x4, x6, [%[a], 416]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 432]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 416]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 432]\n\t" + "ldp x5, x7, [%[b], 448]\n\t" + "ldp x11, x12, [%[b], 464]\n\t" + "ldp x4, x6, [%[a], 448]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 464]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 448]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 464]\n\t" + "ldp x5, x7, [%[b], 480]\n\t" + "ldp x11, x12, [%[b], 496]\n\t" + "ldp x4, x6, [%[a], 480]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 496]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 480]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 496]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +#endif /* WOLFSSL_SP_SMALL */ +} + /* AND m into each word of a and store in r. * * r A single precision integer. @@ -17664,284 +17953,6 @@ static WC_INLINE int sp_4096_mod_64(sp_digit* r, const sp_digit* a, const sp_dig return sp_4096_div_64(a, m, NULL, r); } -#ifdef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "add x11, %[a], 512\n\t" - "\n1:\n\t" - "subs %[c], xzr, %[c]\n\t" - "ldp x3, x4, [%[a]], #16\n\t" - "ldp x5, x6, [%[a]], #16\n\t" - "ldp x7, x8, [%[b]], #16\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x9, x10, [%[b]], #16\n\t" - "sbcs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r]], #16\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r]], #16\n\t" - "csetm %[c], cc\n\t" - "cmp %[a], x11\n\t" - "b.ne 1b\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11" - ); - - return c; -} - -#else -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - __asm__ __volatile__ ( - "ldp x3, x4, [%[a], 0]\n\t" - "ldp x7, x8, [%[b], 0]\n\t" - "subs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 16]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 16]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 0]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 16]\n\t" - "ldp x3, x4, [%[a], 32]\n\t" - "ldp x7, x8, [%[b], 32]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 48]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 48]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 32]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 48]\n\t" - "ldp x3, x4, [%[a], 64]\n\t" - "ldp x7, x8, [%[b], 64]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 80]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 80]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 64]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 80]\n\t" - "ldp x3, x4, [%[a], 96]\n\t" - "ldp x7, x8, [%[b], 96]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 112]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 112]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 96]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 112]\n\t" - "ldp x3, x4, [%[a], 128]\n\t" - "ldp x7, x8, [%[b], 128]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 144]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 144]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 128]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 144]\n\t" - "ldp x3, x4, [%[a], 160]\n\t" - "ldp x7, x8, [%[b], 160]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 176]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 176]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 160]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 176]\n\t" - "ldp x3, x4, [%[a], 192]\n\t" - "ldp x7, x8, [%[b], 192]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 208]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 208]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 192]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 208]\n\t" - "ldp x3, x4, [%[a], 224]\n\t" - "ldp x7, x8, [%[b], 224]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 240]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 240]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 224]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 240]\n\t" - "ldp x3, x4, [%[a], 256]\n\t" - "ldp x7, x8, [%[b], 256]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 272]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 272]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 256]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 272]\n\t" - "ldp x3, x4, [%[a], 288]\n\t" - "ldp x7, x8, [%[b], 288]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 304]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 304]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 288]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 304]\n\t" - "ldp x3, x4, [%[a], 320]\n\t" - "ldp x7, x8, [%[b], 320]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 336]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 336]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 320]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 336]\n\t" - "ldp x3, x4, [%[a], 352]\n\t" - "ldp x7, x8, [%[b], 352]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 368]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 368]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 352]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, 
x6, [%[r], 368]\n\t" - "ldp x3, x4, [%[a], 384]\n\t" - "ldp x7, x8, [%[b], 384]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 400]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 400]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 384]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 400]\n\t" - "ldp x3, x4, [%[a], 416]\n\t" - "ldp x7, x8, [%[b], 416]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 432]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 432]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 416]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 432]\n\t" - "ldp x3, x4, [%[a], 448]\n\t" - "ldp x7, x8, [%[b], 448]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 464]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 464]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 448]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 464]\n\t" - "ldp x3, x4, [%[a], 480]\n\t" - "ldp x7, x8, [%[b], 480]\n\t" - "sbcs x3, x3, x7\n\t" - "ldp x5, x6, [%[a], 496]\n\t" - "sbcs x4, x4, x8\n\t" - "ldp x9, x10, [%[b], 496]\n\t" - "sbcs x5, x5, x9\n\t" - "stp x3, x4, [%[r], 480]\n\t" - "sbcs x6, x6, x10\n\t" - "stp x5, x6, [%[r], 496]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" - ); - - return (sp_digit)r; -} - -#endif /* WOLFSSL_SP_SMALL */ -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[128], t2[65]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[63]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_4096_word_64(hi, t1[64 + i - 1], div); - - sp_4096_mul_d_64(t2, d, r1); - t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2); - t1[64 + i] -= t2[64]; - if (t1[64 + i] != 0) { - t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); - if (t1[64 + i] != 0) - t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); - } - } - - for (i = 63; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_4096_sub_64(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 64); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_4096_div_64_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -18218,6 +18229,7 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. 
* @@ -40380,49 +40392,6 @@ static void sp_384_cond_copy_6(sp_digit* r, const sp_digit* a, sp_digit m) ); } -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. - * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. - */ -static sp_digit sp_384_cond_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) -{ - __asm__ __volatile__ ( - - "ldp x5, x7, [%[b], 0]\n\t" - "ldp x11, x12, [%[b], 16]\n\t" - "ldp x4, x6, [%[a], 0]\n\t" - "and x5, x5, %[m]\n\t" - "ldp x9, x10, [%[a], 16]\n\t" - "and x7, x7, %[m]\n\t" - "subs x4, x4, x5\n\t" - "and x11, x11, %[m]\n\t" - "sbcs x6, x6, x7\n\t" - "and x12, x12, %[m]\n\t" - "sbcs x9, x9, x11\n\t" - "stp x4, x6, [%[r], 0]\n\t" - "sbcs x10, x10, x12\n\t" - "stp x9, x10, [%[r], 16]\n\t" - "ldp x5, x7, [%[b], 32]\n\t" - "ldp x4, x6, [%[a], 32]\n\t" - "and x5, x5, %[m]\n\t" - "and x7, x7, %[m]\n\t" - "sbcs x4, x4, x5\n\t" - "sbcs x6, x6, x7\n\t" - "stp x4, x6, [%[r], 32]\n\t" - "csetm %[r], cc\n\t" - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" - ); - - return (sp_digit)r; -} - #define sp_384_mont_reduce_order_6 sp_384_mont_reduce_6 /* Reduce the number back to 384 bits using Montgomery reduction. @@ -40767,6 +40736,49 @@ static sp_int64 sp_384_cmp_6(const sp_digit* a, const sp_digit* b) */ #define sp_384_norm_6(a) +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static sp_digit sp_384_cond_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, + sp_digit m) +{ + __asm__ __volatile__ ( + + "ldp x5, x7, [%[b], 0]\n\t" + "ldp x11, x12, [%[b], 16]\n\t" + "ldp x4, x6, [%[a], 0]\n\t" + "and x5, x5, %[m]\n\t" + "ldp x9, x10, [%[a], 16]\n\t" + "and x7, x7, %[m]\n\t" + "subs x4, x4, x5\n\t" + "and x11, x11, %[m]\n\t" + "sbcs x6, x6, x7\n\t" + "and x12, x12, %[m]\n\t" + "sbcs x9, x9, x11\n\t" + "stp x4, x6, [%[r], 0]\n\t" + "sbcs x10, x10, x12\n\t" + "stp x9, x10, [%[r], 16]\n\t" + "ldp x5, x7, [%[b], 32]\n\t" + "ldp x4, x6, [%[a], 32]\n\t" + "and x5, x5, %[m]\n\t" + "and x7, x7, %[m]\n\t" + "sbcs x4, x4, x5\n\t" + "sbcs x6, x6, x7\n\t" + "stp x4, x6, [%[r], 32]\n\t" + "csetm %[r], cc\n\t" + : [r] "+r" (r) + : [a] "r" (a), [b] "r" (b), [m] "r" (m) + : "memory", "x4", "x6", "x5", "x7", "x8", "x9", "x10", "x11", "x12" + ); + + return (sp_digit)r; +} + /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index cde9ded9f..9be28694e 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -221,12 +221,14 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_2048_norm_64(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. 
* * a Array of sp_digit to normalize. @@ -6820,7 +6822,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -6835,7 +6837,7 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -7312,6 +7314,640 @@ static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_64(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r6, %[a]\n\t" + "movs r3, #0\n\t" + "movs r5, #0xff\n\t" +#ifdef __clang__ + "adds r5, r5, #1\n\t" +#else + "add r5, r5, #1\n\t" +#endif +#ifdef __clang__ + "adds r6, r6, r5\n\t" +#else + "add r6, r6, r5\n\t" +#endif + "\n" + "L_sp_2048_sub_64_word_%=: \n\t" + "movs r5, #0\n\t" +#ifdef __clang__ + "subs r5, r5, r3\n\t" +#else + "sub r5, r5, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" +#ifdef __clang__ + "sbcs r4, r5\n\t" +#else + "sbc r4, r5\n\t" +#endif + "str r4, [%[r]]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #4\n\t" +#else + "add %[a], %[a], #4\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #4\n\t" +#else + "add %[b], %[b], #4\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #4\n\t" +#else + "add %[r], %[r], #4\n\t" +#endif + "cmp %[a], r6\n\t" + "bne L_sp_2048_sub_64_word_%=\n\t" + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r3, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "subs r4, r4, r6\n\t" +#else + "sub r4, r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, 
[%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], 
#32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" 
+#endif + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -7832,6 +8468,67 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, return (uint32_t)(size_t)d1; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[128], t2[65]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[63]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); + for (i=63; i>=0; i--) { + sp_digit hi = t1[64 + i] - (t1[64 + i] == div); + r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + + sp_2048_mul_d_64(t2, d, r1); + t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); + t1[64 + i] -= t2[64]; + if (t1[64 + i] != 0) { + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + if (t1[64 + i] != 0) + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + } + } + + for (i = 63; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_2048_sub_64(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 64); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_64_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -8011,58 +8708,6 @@ static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_dig return sp_2048_div_64(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[128], t2[65]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[63]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); - - sp_2048_mul_d_64(t2, d, r1); - t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); - t1[64 + i] -= t2[64]; - if (t1[64 + i] != 0) { - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - if (t1[64 + i] != 0) - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - } - } - - r1 = sp_2048_cmp_64(t1, d) >= 0; - sp_2048_cond_sub_64(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. 
- * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_2048_div_64_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -8339,6 +8984,7 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -10704,12 +11350,14 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_3072_norm_96(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -18375,7 +19023,7 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -18390,7 +19038,7 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_96(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -18872,6 +19520,922 @@ static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_96(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r6, %[a]\n\t" + "movs r3, #0\n\t" + "movs r5, #0xff\n\t" +#ifdef __clang__ + "adds r5, r5, #0x81\n\t" +#else + "add r5, r5, #0x81\n\t" +#endif +#ifdef __clang__ + "adds r6, r6, r5\n\t" +#else + "add r6, r6, r5\n\t" +#endif + "\n" + "L_sp_3072_sub_96_word_%=: \n\t" + "movs r5, #0\n\t" +#ifdef __clang__ + "subs r5, r5, r3\n\t" +#else + "sub r5, r5, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" +#ifdef __clang__ + "sbcs r4, r5\n\t" +#else + "sbc r4, r5\n\t" +#endif + "str r4, [%[r]]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #4\n\t" +#else + "add %[a], %[a], #4\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #4\n\t" +#else + "add %[b], %[b], #4\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #4\n\t" +#else + "add %[r], %[r], #4\n\t" +#endif + "cmp %[a], r6\n\t" + "bne L_sp_3072_sub_96_word_%=\n\t" + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r3, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "subs r4, r4, r6\n\t" +#else + "sub r4, r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs 
r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" 
+#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], 
#92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], 
#40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. 
(d1|d0 / div) * * d1 The high order half of the number to divide. @@ -19392,6 +20956,67 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, return (uint32_t)(size_t)d1; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[192], t2[97]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[95]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); + for (i=95; i>=0; i--) { + sp_digit hi = t1[96 + i] - (t1[96 + i] == div); + r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + + sp_3072_mul_d_96(t2, d, r1); + t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); + t1[96 + i] -= t2[96]; + if (t1[96 + i] != 0) { + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + if (t1[96 + i] != 0) + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + } + } + + for (i = 95; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_3072_sub_96(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 96); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_96_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -19576,58 +21201,6 @@ static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_dig return sp_3072_div_96(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[192], t2[97]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[95]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); - - sp_3072_mul_d_96(t2, d, r1); - t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); - t1[96 + i] -= t2[96]; - if (t1[96 + i] != 0) { - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - if (t1[96 + i] != 0) - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - } - } - - r1 = sp_3072_cmp_96(t1, d) >= 0; - sp_3072_cond_sub_96(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. 
- */ -static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_3072_div_96_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -19904,6 +21477,7 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -23003,12 +24577,14 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_4096_norm_128(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -26424,7 +28000,7 @@ SP_NOINLINE static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, ); } -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * @@ -26439,7 +28015,7 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) sp_4096_sub_in_place_128(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -26921,6 +28497,1204 @@ static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, sp_4096_mont_reduce_128(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r6, %[a]\n\t" + "movs r3, #0\n\t" + "movs r5, #2\n\t" +#ifdef __clang__ + "lsls r5, r5, #8\n\t" +#else + "lsl r5, r5, #8\n\t" +#endif +#ifdef __clang__ + "adds r6, r6, r5\n\t" +#else + "add r6, r6, r5\n\t" +#endif + "\n" + "L_sp_4096_sub_128_word_%=: \n\t" + "movs r5, #0\n\t" +#ifdef __clang__ + "subs r5, r5, r3\n\t" +#else + "sub r5, r5, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" +#ifdef __clang__ + "sbcs r4, r5\n\t" +#else + "sbc r4, r5\n\t" +#endif + "str r4, [%[r]]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #4\n\t" +#else + "add %[a], %[a], #4\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #4\n\t" +#else + "add %[b], %[b], #4\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #4\n\t" +#else + "add %[r], %[r], #4\n\t" +#endif + "cmp %[a], r6\n\t" + "bne L_sp_4096_sub_128_word_%=\n\t" + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6" + ); + return (uint32_t)(size_t)r; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + __asm__ __volatile__ ( + "movs r3, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "subs r4, r4, r6\n\t" +#else + "sub r4, r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, 
[%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], 
#32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" 
+#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr 
r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif +#ifdef __clang__ + "adds %[a], %[a], #0x80\n\t" +#else + "add %[a], %[a], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[b], %[b], #0x80\n\t" +#else + "add %[b], %[b], #0x80\n\t" +#endif +#ifdef __clang__ + "adds %[r], %[r], #0x80\n\t" +#else + "add %[r], %[r], #0x80\n\t" +#endif + "movs r6, #0\n\t" +#ifdef __clang__ + "subs r6, r6, r3\n\t" +#else + "sub r6, r6, r3\n\t" +#endif + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b]]\n\t" + "ldr r7, [%[b], #4]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r]]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r7, [%[b], #12]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r7, [%[b], #20]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r7, [%[b], #28]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, 
[%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r7, [%[b], #36]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r7, [%[b], #44]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r7, [%[b], #52]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r7, [%[b], #60]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r7, [%[b], #68]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r7, [%[b], #76]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r7, [%[b], #84]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r7, [%[b], #92]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r7, [%[b], #100]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r7, [%[b], #108]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r7, [%[b], #116]\n\t" +#ifdef __clang__ + "sbcs r4, r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r7, [%[b], #124]\n\t" +#ifdef __clang__ + "sbcs r4, 
r6\n\t" +#else + "sbc r4, r6\n\t" +#endif +#ifdef __clang__ + "sbcs r5, r7\n\t" +#else + "sbc r5, r7\n\t" +#endif + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" +#ifdef __clang__ + "sbcs r3, r3\n\t" +#else + "sbc r3, r3\n\t" +#endif + "movs %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -27441,6 +30215,67 @@ SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, return (uint32_t)(size_t)d1; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[256], t2[129]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[127]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); + for (i=127; i>=0; i--) { + sp_digit hi = t1[128 + i] - (t1[128 + i] == div); + r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + + sp_4096_mul_d_128(t2, d, r1); + t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); + t1[128 + i] -= t2[128]; + if (t1[128 + i] != 0) { + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + if (t1[128 + i] != 0) + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + } + } + + for (i = 127; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_4096_sub_128(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 128); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_128_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -27626,58 +30461,6 @@ static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_di return sp_4096_div_128(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. 
- */ -static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[256], t2[129]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[127]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); - - sp_4096_mul_d_128(t2, d, r1); - t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); - t1[128 + i] -= t2[128]; - if (t1[128 + i] != 0) { - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - if (t1[128 + i] != 0) - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - } - } - - r1 = sp_4096_cmp_128(t1, d) >= 0; - sp_4096_cond_sub_128(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_4096_div_128_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -27954,6 +30737,7 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index af9ccc0e1..5fb2f0ff0 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -249,6 +249,7 @@ static void sp_2048_to_bin_72(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 29 bits. * * a Array of sp_digit to normalize. @@ -279,6 +280,7 @@ static void sp_2048_norm_36(sp_digit* a) #endif /* WOLFSSL_SP_SMALL */ } +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 29 bits. * * a Array of sp_digit to normalize. @@ -3221,6 +3223,7 @@ static int sp_2048_mod_72(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_2048_div_72(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -3535,6 +3538,7 @@ static int sp_2048_mod_exp_72(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -5030,6 +5034,7 @@ static void sp_3072_to_bin_106(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 29 bits. * * a Array of sp_digit to normalize. 
@@ -5043,6 +5048,7 @@ static void sp_3072_norm_53(sp_digit* a) } } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 29 bits. * * a Array of sp_digit to normalize. @@ -6728,6 +6734,7 @@ static int sp_3072_mod_106(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_3072_div_106(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -7038,6 +7045,7 @@ static int sp_3072_mod_exp_106(sp_digit* r, const sp_digit* a, const sp_digit* e #endif } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -8380,6 +8388,7 @@ static void sp_3072_to_bin_112(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 28 bits. * * a Array of sp_digit to normalize. @@ -8406,6 +8415,7 @@ static void sp_3072_norm_56(sp_digit* a) a[55] += a[54] >> 28; a[54] &= 0xfffffff; } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 28 bits. * * a Array of sp_digit to normalize. @@ -10812,6 +10822,7 @@ static int sp_3072_mod_112(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_3072_div_112(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -11126,6 +11137,7 @@ static int sp_3072_mod_exp_112(sp_digit* r, const sp_digit* a, const sp_digit* e #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -12693,6 +12705,7 @@ static void sp_4096_to_bin_142(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) /* Normalize the values in each word to 29 bits. * @@ -12708,6 +12721,7 @@ static void sp_4096_norm_71(sp_digit* a) } #endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 29 bits. * * a Array of sp_digit to normalize. @@ -14401,6 +14415,7 @@ static int sp_4096_mod_142(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_4096_div_142(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -14711,6 +14726,7 @@ static int sp_4096_mod_exp_142(sp_digit* r, const sp_digit* a, const sp_digit* e #endif } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. 
* @@ -15911,6 +15927,7 @@ static void sp_4096_to_bin_162(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) /* Normalize the values in each word to 26 bits. * @@ -15932,6 +15949,7 @@ static void sp_4096_norm_81(sp_digit* a) } #endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 26 bits. * * a Array of sp_digit to normalize. @@ -18293,6 +18311,7 @@ static int sp_4096_mod_162(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_4096_div_162(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -18607,6 +18626,7 @@ static int sp_4096_mod_exp_162(sp_digit* r, const sp_digit* a, const sp_digit* e #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 63f481b17..bdb09537b 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -250,6 +250,7 @@ static void sp_2048_to_bin_34(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 61 bits. * * a Array of sp_digit to normalize. @@ -263,6 +264,7 @@ static void sp_2048_norm_17(sp_digit* a) } } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 61 bits. * * a Array of sp_digit to normalize. @@ -2177,6 +2179,7 @@ static int sp_2048_mod_34(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_2048_div_34(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -2487,6 +2490,7 @@ static int sp_2048_mod_exp_34(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -3830,6 +3834,7 @@ static void sp_2048_to_bin_36(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 57 bits. * * a Array of sp_digit to normalize. @@ -3850,6 +3855,7 @@ static void sp_2048_norm_18(sp_digit* a) a[17] += a[16] >> 57; a[16] &= 0x1ffffffffffffffL; } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 57 bits. * * a Array of sp_digit to normalize. 
@@ -5730,6 +5736,7 @@ static int sp_2048_mod_36(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_2048_div_36(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -6044,6 +6051,7 @@ static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -7460,6 +7468,7 @@ static void sp_3072_to_bin_52(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 60 bits. * * a Array of sp_digit to normalize. @@ -7473,6 +7482,7 @@ static void sp_3072_norm_26(sp_digit* a) } } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 60 bits. * * a Array of sp_digit to normalize. @@ -9142,6 +9152,7 @@ static int sp_3072_mod_52(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_3072_div_52(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -9452,6 +9463,7 @@ static int sp_3072_mod_exp_52(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -10795,6 +10807,7 @@ static void sp_3072_to_bin_54(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 57 bits. * * a Array of sp_digit to normalize. @@ -10816,6 +10829,7 @@ static void sp_3072_norm_27(sp_digit* a) a[26] += a[25] >> 57; a[25] &= 0x1ffffffffffffffL; } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 57 bits. * * a Array of sp_digit to normalize. @@ -12844,6 +12858,7 @@ static int sp_3072_mod_54(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_3072_div_54(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -13158,6 +13173,7 @@ static int sp_3072_mod_exp_54(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -14610,6 +14626,7 @@ static void sp_4096_to_bin_70(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) /* Normalize the values in each word to 59 bits. 
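The sp_*_norm_* helpers newly guarded in these hunks all follow the carry-propagation pattern visible in their unrolled bodies: push the bits above the digit width into the next word and mask the current one. A generic plain-C sketch of that operation (the width, mask and type below are placeholders standing in for the 57- and 61-bit layouts used in this file):

/* Propagate carries so every digit fits the chosen width again.
 * Illustrative only; the generated functions are fully unrolled. */
#define SP_EX_DIGIT_BITS 57
#define SP_EX_DIGIT_MASK 0x1ffffffffffffffLL   /* 2^57 - 1 */

static void sp_example_norm(long long* a, int digits)
{
    int i;

    for (i = 0; i < digits - 1; i++) {
        a[i + 1] += a[i] >> SP_EX_DIGIT_BITS;
        a[i] &= SP_EX_DIGIT_MASK;
    }
}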
* @@ -14625,6 +14642,7 @@ static void sp_4096_norm_35(sp_digit* a) } #endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 59 bits. * * a Array of sp_digit to normalize. @@ -16213,6 +16231,7 @@ static int sp_4096_mod_70(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_4096_div_70(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -16523,6 +16542,7 @@ static int sp_4096_mod_exp_70(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif } +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -17724,6 +17744,7 @@ static void sp_4096_to_bin_78(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D) /* Normalize the values in each word to 53 bits. * @@ -17751,6 +17772,7 @@ static void sp_4096_norm_39(sp_digit* a) } #endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 53 bits. * * a Array of sp_digit to normalize. @@ -19907,6 +19929,7 @@ static int sp_4096_mod_78(sp_digit* r, const sp_digit* a, const sp_digit* m) return sp_4096_div_78(a, m, NULL, r); } +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) @@ -20221,6 +20244,7 @@ static int sp_4096_mod_exp_78(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */ /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index f147dcf13..f689ae21b 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -230,12 +230,14 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_2048_norm_64(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -3474,7 +3476,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. 
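The sp_2048_mont_norm_64 comment above relies on the modulus having its top bit set: when m is a full 2048-bit value, 2^2048 mod m is simply 2^2048 - m, which over a fixed 64-word array is 0 - m with borrow (the file computes it by zeroing r and calling sp_2048_sub_in_place_64). A word-wise plain-C sketch of the same idea, using 32-bit words:

/* r = 2^(32*n) - m, i.e. 0 - m over n words; already reduced when m has its
 * top bit set.  Illustrative only. */
static void sp_example_mont_norm(unsigned int* r, const unsigned int* m, int n)
{
    unsigned int borrow = 0;
    int i;

    for (i = 0; i < n; i++) {
        unsigned long long d = 0ULL - m[i] - borrow;
        r[i] = (unsigned int)d;
        borrow = (unsigned int)((d >> 32) & 1);
    }
    (void)borrow;   /* the final borrow out of the top word is expected */
}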
* @@ -3489,7 +3491,7 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -3664,6 +3666,333 @@ static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_64(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r6, %[a]\n\t" + "mov r5, #1\n\t" + "lsl r5, r5, #8\n\t" + "add r6, r6, r5\n\t" + "\n1:\n\t" + "mov r5, #0\n\t" + "subs r5, r5, %[c]\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" + "sbcs r4, r4, r5\n\t" + "str r4, [%[r]]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #4\n\t" + "add %[b], %[b], #4\n\t" + "add %[r], %[r], #4\n\t" + "cmp %[a], r6\n\t" +#ifdef __GNUC__ + "bne 1b\n\t" +#else + "bne.n 1b\n\t" +#endif /* __GNUC__ */ + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str 
r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr 
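The sp_2048_sub_64 routines being added here (the looped WOLFSSL_SP_SMALL form and the unrolled form) compute the same thing: a 64-word subtraction with borrow, returning 0 when nothing borrows out of the top word and all-ones when it does. A plain-C equivalent of the operation (a sketch, not the shipped assembly):

/* r = a - b over n 32-bit words; returns 0 or 0xffffffff like the asm's
 * final "sbc c, c, c". */
static unsigned int sp_example_sub_words(unsigned int* r, const unsigned int* a,
                                         const unsigned int* b, int n)
{
    unsigned int borrow = 0;   /* 0 or 1 between words */
    int i;

    for (i = 0; i < n; i++) {
        unsigned long long d = (unsigned long long)a[i] - b[i] - borrow;
        r[i] = (unsigned int)d;
        borrow = (unsigned int)((d >> 32) & 1);   /* 1 if this word borrowed */
    }
    return (unsigned int)0 - borrow;
}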
r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -3716,6 +4045,67 @@ SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[128], t2[65]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[63]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); + for (i=63; i>=0; i--) { + sp_digit hi = t1[64 + i] - (t1[64 + i] == div); + r1 = div_2048_word_64(hi, t1[64 + i - 1], div); + + sp_2048_mul_d_64(t2, d, r1); + t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); + t1[64 + i] -= t2[64]; + if (t1[64 + i] != 0) { + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + if (t1[64 + i] != 0) + t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); + } + } + + for (i = 63; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_2048_sub_64(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 64); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. 
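The new sp_2048_div_64_cond above differs from the version removed later in this file's hunks only in its final reduction: instead of the constant-time sp_2048_cmp_64 / sp_2048_cond_sub_64 pair it scans for the first differing word and then either subtracts once or copies. The _cond path runs on public inputs (the RSA public-key operation), so the early exit is acceptable, and it lets a public-only build drop the constant-time helpers entirely. A self-contained sketch of the new tail (32-bit words, hypothetical names):

/* Final step of the conditional division: if t1 >= d subtract d once,
 * otherwise t1 is already the remainder.  Early exit is fine for public data. */
static void sp_example_div_tail(unsigned int* r, const unsigned int* t1,
                                const unsigned int* d, int n)
{
    int i;

    for (i = n - 1; i > 0; i--) {       /* most significant differing word */
        if (t1[i] != d[i])
            break;
    }
    if (t1[i] >= d[i]) {                /* r = t1 - d with word-wise borrow */
        unsigned int borrow = 0;
        for (i = 0; i < n; i++) {
            unsigned long long w = (unsigned long long)t1[i] - d[i] - borrow;
            r[i] = (unsigned int)w;
            borrow = (unsigned int)((w >> 32) & 1);
        }
    }
    else {
        for (i = 0; i < n; i++)         /* already reduced: just copy */
            r[i] = t1[i];
    }
}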
+ */ +static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_2048_div_64_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -3844,58 +4234,6 @@ static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_dig return sp_2048_div_64(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[128], t2[65]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[63]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_2048_word_64(hi, t1[64 + i - 1], div); - - sp_2048_mul_d_64(t2, d, r1); - t1[64 + i] += sp_2048_sub_in_place_64(&t1[i], t2); - t1[64 + i] -= t2[64]; - if (t1[64 + i] != 0) { - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - if (t1[64 + i] != 0) - t1[64 + i] += sp_2048_add_64(&t1[i], &t1[i], d); - } - } - - r1 = sp_2048_cmp_64(t1, d) >= 0; - sp_2048_cond_sub_64(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_2048_div_64_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -4172,6 +4510,7 @@ static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -5453,12 +5792,14 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_3072_norm_96(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -8354,7 +8695,7 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. 
* @@ -8369,7 +8710,7 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_96(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -8545,6 +8886,468 @@ static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_96(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r6, %[a]\n\t" + "mov r5, #1\n\t" + "lsl r5, r5, #8\n\t" + "add r5, r5, #128\n\t" + "add r6, r6, r5\n\t" + "\n1:\n\t" + "mov r5, #0\n\t" + "subs r5, r5, %[c]\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" + "sbcs r4, r4, r5\n\t" + "str r4, [%[r]]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #4\n\t" + "add %[b], %[b], #4\n\t" + "add %[r], %[r], #4\n\t" + "cmp %[a], r6\n\t" +#ifdef __GNUC__ + "bne 1b\n\t" +#else + "bne.n 1b\n\t" +#endif /* __GNUC__ */ + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + 
"sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + 
"ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str 
r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -8597,6 +9400,67 @@ SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[192], t2[97]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[95]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); + for (i=95; i>=0; i--) { + sp_digit hi = t1[96 + i] - (t1[96 + i] == div); + r1 = div_3072_word_96(hi, t1[96 + i - 1], div); + + sp_3072_mul_d_96(t2, d, r1); + t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); + t1[96 + i] -= t2[96]; + if (t1[96 + i] != 0) { + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + if (t1[96 + i] != 0) + t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); + } + } + + for (i = 95; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_3072_sub_96(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 96); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. 
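The division loops in these _cond routines estimate one quotient word at a time from the top two remainder words, then correct with up to two add-backs of the divisor if the estimate was high. The small clamp at the top of each iteration (hi = t1[96 + i] - (t1[96 + i] == div)) keeps the two-words-by-one-word divide from overflowing. A sketch of that estimate for 32-bit words (the division itself is what div_3072_word_96 performs in assembly; the function name below is illustrative):

/* Estimate a quotient word for (hi:lo) / div; clamping hi when it equals div
 * guarantees the result fits in one word.  Illustrative only. */
static unsigned int sp_example_estimate_q(unsigned int hi, unsigned int lo,
                                          unsigned int div)
{
    unsigned long long n;

    hi -= (unsigned int)(hi == div);   /* quotient would otherwise be >= 2^32 */
    n = ((unsigned long long)hi << 32) | lo;
    return (unsigned int)(n / div);
}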
+ * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_3072_div_96_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -8727,58 +9591,6 @@ static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_dig return sp_3072_div_96(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_div_96_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[192], t2[97]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[95]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 96); - for (i=95; i>=0; i--) { - sp_digit hi = t1[96 + i] - (t1[96 + i] == div); - r1 = div_3072_word_96(hi, t1[96 + i - 1], div); - - sp_3072_mul_d_96(t2, d, r1); - t1[96 + i] += sp_3072_sub_in_place_96(&t1[i], t2); - t1[96 + i] -= t2[96]; - if (t1[96 + i] != 0) { - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - if (t1[96 + i] != 0) - t1[96 + i] += sp_3072_add_96(&t1[i], &t1[i], d); - } - } - - r1 = sp_3072_cmp_96(t1, d) >= 0; - sp_3072_cond_sub_96(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_3072_div_96_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -9055,6 +9867,7 @@ static int sp_3072_mod_exp_96(sp_digit* r, const sp_digit* a, const sp_digit* e, #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -10532,12 +11345,14 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. */ #define sp_4096_norm_128(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 32. * * a Array of sp_digit to normalize. @@ -11663,7 +12478,7 @@ SP_NOINLINE static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, ); } -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. 
* @@ -11678,7 +12493,7 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) sp_4096_sub_in_place_128(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -11853,6 +12668,601 @@ static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, sp_4096_mont_reduce_128(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "mov r6, %[a]\n\t" + "mov r5, #2\n\t" + "lsl r5, r5, #8\n\t" + "add r6, r6, r5\n\t" + "\n1:\n\t" + "mov r5, #0\n\t" + "subs r5, r5, %[c]\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[b]]\n\t" + "sbcs r4, r4, r5\n\t" + "str r4, [%[r]]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #4\n\t" + "add %[b], %[b], #4\n\t" + "add %[r], %[r], #4\n\t" + "cmp %[a], r6\n\t" +#ifdef __GNUC__ + "bne 1b\n\t" +#else + "bne.n 1b\n\t" +#endif /* __GNUC__ */ + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6" + ); + + return c; +} + +#else +/* Sub b from a into r. (r = a - b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, + const sp_digit* b) +{ + sp_digit c = 0; + + __asm__ __volatile__ ( + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "subs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, 
r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], 
#64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], 
#60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + "add %[a], %[a], #0x80\n\t" + "add %[b], %[b], #0x80\n\t" + "add %[r], %[r], #0x80\n\t" + "mov r6, #0\n\t" + "sub r6, r6, %[c]\n\t" + "ldr r4, [%[a], #0]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[b], #0]\n\t" + "ldr r8, [%[b], #4]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #0]\n\t" + "str r5, [%[r], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "ldr r6, [%[b], #8]\n\t" + "ldr r8, [%[b], #12]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #8]\n\t" + "str r5, [%[r], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "ldr r6, [%[b], #16]\n\t" + "ldr r8, [%[b], #20]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #16]\n\t" + "str r5, [%[r], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "ldr r6, [%[b], #24]\n\t" + "ldr r8, [%[b], #28]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #24]\n\t" + "str r5, [%[r], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "ldr r6, [%[b], #32]\n\t" + "ldr r8, [%[b], #36]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #32]\n\t" + "str r5, [%[r], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "ldr r6, [%[b], #40]\n\t" + "ldr r8, [%[b], #44]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #40]\n\t" + "str r5, [%[r], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "ldr r6, [%[b], #48]\n\t" + "ldr r8, [%[b], #52]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #48]\n\t" + "str r5, [%[r], #52]\n\t" + "ldr r4, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "ldr r6, [%[b], #56]\n\t" + "ldr r8, [%[b], #60]\n\t" + "sbcs r4, r4, 
r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #56]\n\t" + "str r5, [%[r], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "ldr r5, [%[a], #68]\n\t" + "ldr r6, [%[b], #64]\n\t" + "ldr r8, [%[b], #68]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #64]\n\t" + "str r5, [%[r], #68]\n\t" + "ldr r4, [%[a], #72]\n\t" + "ldr r5, [%[a], #76]\n\t" + "ldr r6, [%[b], #72]\n\t" + "ldr r8, [%[b], #76]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #72]\n\t" + "str r5, [%[r], #76]\n\t" + "ldr r4, [%[a], #80]\n\t" + "ldr r5, [%[a], #84]\n\t" + "ldr r6, [%[b], #80]\n\t" + "ldr r8, [%[b], #84]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #80]\n\t" + "str r5, [%[r], #84]\n\t" + "ldr r4, [%[a], #88]\n\t" + "ldr r5, [%[a], #92]\n\t" + "ldr r6, [%[b], #88]\n\t" + "ldr r8, [%[b], #92]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #88]\n\t" + "str r5, [%[r], #92]\n\t" + "ldr r4, [%[a], #96]\n\t" + "ldr r5, [%[a], #100]\n\t" + "ldr r6, [%[b], #96]\n\t" + "ldr r8, [%[b], #100]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #96]\n\t" + "str r5, [%[r], #100]\n\t" + "ldr r4, [%[a], #104]\n\t" + "ldr r5, [%[a], #108]\n\t" + "ldr r6, [%[b], #104]\n\t" + "ldr r8, [%[b], #108]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #104]\n\t" + "str r5, [%[r], #108]\n\t" + "ldr r4, [%[a], #112]\n\t" + "ldr r5, [%[a], #116]\n\t" + "ldr r6, [%[b], #112]\n\t" + "ldr r8, [%[b], #116]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #112]\n\t" + "str r5, [%[r], #116]\n\t" + "ldr r4, [%[a], #120]\n\t" + "ldr r5, [%[a], #124]\n\t" + "ldr r6, [%[b], #120]\n\t" + "ldr r8, [%[b], #124]\n\t" + "sbcs r4, r4, r6\n\t" + "sbcs r5, r5, r8\n\t" + "str r4, [%[r], #120]\n\t" + "str r5, [%[r], #124]\n\t" + "sbc %[c], %[c], %[c]\n\t" + : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r4", "r5", "r6", "r8" + ); + + return c; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the dividend. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -11905,6 +13315,67 @@ SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, return r; } +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[256], t2[129]; + sp_digit div, r1; + int i; + + (void)m; + + div = d[127]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); + for (i=127; i>=0; i--) { + sp_digit hi = t1[128 + i] - (t1[128 + i] == div); + r1 = div_4096_word_128(hi, t1[128 + i - 1], div); + + sp_4096_mul_d_128(t2, d, r1); + t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); + t1[128 + i] -= t2[128]; + if (t1[128 + i] != 0) { + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + if (t1[128 + i] != 0) + t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); + } + } + + for (i = 127; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_4096_sub_128(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 128); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. 
+ * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) +{ + return sp_4096_div_128_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -12035,58 +13506,6 @@ static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_di return sp_4096_div_128(a, m, NULL, r); } -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_div_128_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[256], t2[129]; - sp_digit div, r1; - int i; - - (void)m; - - div = d[127]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 128); - for (i=127; i>=0; i--) { - sp_digit hi = t1[128 + i] - (t1[128 + i] == div); - r1 = div_4096_word_128(hi, t1[128 + i - 1], div); - - sp_4096_mul_d_128(t2, d, r1); - t1[128 + i] += sp_4096_sub_in_place_128(&t1[i], t2); - t1[128 + i] -= t2[128]; - if (t1[128 + i] != 0) { - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - if (t1[128 + i] != 0) - t1[128 + i] += sp_4096_add_128(&t1[i], &t1[i], d); - } - } - - r1 = sp_4096_cmp_128(t1, d) >= 0; - sp_4096_cond_sub_128(r, t1, d, (sp_digit)0 - r1); - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - return sp_4096_div_128_cond(a, m, NULL, r); -} - #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -12363,6 +13782,7 @@ static int sp_4096_mod_exp_128(sp_digit* r, const sp_digit* a, const sp_digit* e #endif /* WOLFSSL_SP_SMALL */ #endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c index 6070faaa9..fbb6e11c9 100644 --- a/wolfcrypt/src/sp_int.c +++ b/wolfcrypt/src/sp_int.c @@ -2317,7 +2317,7 @@ int sp_grow(sp_int* a, int l) } #endif /* !WOLFSSL_RSA_VERIFY_ONLY || !NO_DH || HAVE_ECC */ -#if !defined(WOLFSSL_RSA_VERIFY_ONLY) +#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(HAVE_ECC) /* Set the multi-precision number to zero. * * @param [out] a SP integer to set to zero. @@ -2629,7 +2629,7 @@ static int _sp_cmp(sp_int* a, sp_int* b) } #endif -#ifndef WOLFSSL_RSA_VERIFY_ONLY +#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(HAVE_ECC) /* Compare two multi-precision numbers. * * Pointers are compared such that NULL is less than not NULL. 
@@ -3318,8 +3318,23 @@ int sp_mul_d(sp_int* a, sp_int_digit d, sp_int* r) #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || * (WOLFSSL_KEY_GEN && !NO_RSA) */ -#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \ - (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) +/* Predefine complicated rules of when to compile in sp_div_d and sp_mod_d. */ +#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ + defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \ + defined(WC_MP_TO_RADIX) +#define WOLFSSL_SP_DIV_D +#endif +#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) || \ + (defined(HAVE_ECC) && (defined(FP_ECC) || defined(HAVE_COMP_KEY))) || \ + (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) +#define WOLFSSL_SP_MOD_D +#endif + +#if (defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \ + (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \ + !defined(WOLFSSL_RSA_PUBLIC_ONLY))) || \ + defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D) #ifndef SP_ASM_DIV_WORD /* Divide a two digit number by a digit number and return. (hi | lo) / d * @@ -3382,18 +3397,6 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC || * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */ -/* Predefine complicated rules of when to compile in sp_div_d and sp_mod_d. */ -#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ - defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) -#define WOLFSSL_SP_DIV_D -#endif -#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ - defined(WOLFSSL_HAVE_SP_DH) || \ - (defined(HAVE_ECC) && (defined(FP_ECC) || defined(HAVE_COMP_KEY))) || \ - (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) -#define WOLFSSL_SP_MOD_D -#endif - #if (defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)) && \ !defined(WOLFSSL_SP_SMALL) /* Divide by 3: r = a / 3 and rem = a % 3 @@ -3809,7 +3812,7 @@ int sp_div_2(sp_int* a, sp_int* r) * Add/Subtract Functions ************************/ -#if !defined(WOLFSSL_RSA_VERIFY_ONLY) +#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD) /* Add offset b to a into r: r = a + (b << (o * SP_WORD_SIZEOF)) * * @param [in] a SP integer to add to. @@ -3915,7 +3918,7 @@ static int _sp_sub_off(sp_int* a, sp_int* b, sp_int* r, int o) #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_SP_INT_NEGATIVE || !NO_DH || * HAVE_ECC || (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */ -#if !defined(WOLFSSL_RSA_VERIFY_ONLY) +#if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD) /* Add b to a into r: r = a + b * * @param [in] a SP integer to add to. @@ -12329,7 +12332,8 @@ int sp_read_unsigned_bin(sp_int* a, const byte* in, word32 inSz) } #if (!defined(NO_DH) || defined(HAVE_ECC) || defined(WC_RSA_BLINDING) || \ - defined(WOLFSSL_RSA_PUBLIC_ONLY)) && !defined(WOLFSSL_RSA_VERIFY_ONLY) + defined(WOLFSSL_RSA_PUBLIC_ONLY)) && (!defined(WOLFSSL_RSA_VERIFY_ONLY) || \ + defined(HAVE_ECC_KEY_EXPORT)) /* Convert the multi-precision number to an array of bytes in big-endian format. 
* * The array must be large enough for encoded number - use mp_unsigned_bin_size @@ -12671,7 +12675,8 @@ int sp_tohex(sp_int* a, char* str) #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */ #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ - defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) + defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \ + defined(WC_MP_TO_RADIX) /* Put the big-endian, decimal string encoding of a into str. * * Assumes str is large enough for result. diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 82f1c9ef7..4140d4f8a 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -193,12 +193,14 @@ static void sp_2048_to_bin_32(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. */ #define sp_2048_norm_32(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -830,7 +832,7 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, const sp_digit* a, const sp_digi #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -845,7 +847,7 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_32(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ extern sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); extern void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp); /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -878,8 +880,7 @@ static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_32(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) -extern sp_digit sp_2048_cond_sub_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); +extern sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_2048_mul_d_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit b); #ifdef _WIN64 #if _MSC_VER < 1920 @@ -922,6 +923,86 @@ static WC_INLINE sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, return r; } #endif /* _WIN64 */ +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. 
+ */ +static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[64]; + sp_digit t2[33]; + sp_digit div; + sp_digit r1; + int i; +#ifdef HAVE_INTEL_AVX2 + word32 cpuid_flags = cpuid_get_flags(); +#endif + + (void)m; + + div = d[31]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); + for (i = 31; i > 0; i--) { + if (t1[i + 32] != d[i]) + break; + } + if (t1[i + 32] >= d[i]) { + sp_2048_sub_in_place_32(&t1[32], d); + } + for (i=31; i>=0; i--) { + sp_digit hi = t1[32 + i] - (t1[32 + i] == div); + r1 = div_2048_word_32(hi, t1[32 + i - 1], div); + +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) + sp_2048_mul_d_avx2_32(t2, d, r1); + else +#endif + sp_2048_mul_d_32(t2, d, r1); + t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2); + t1[32 + i] -= t2[32]; + if (t1[32 + i] != 0) { + t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); + if (t1[32 + i] != 0) + t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); + } + } + + for (i = 31; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_2048_sub_32(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 32); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_2048_mod_32_cond(sp_digit* r, const sp_digit* a, + const sp_digit* m) +{ + return sp_2048_div_32_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern sp_digit sp_2048_cond_sub_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); /* AND m into each word of a and store in r. * * r A single precision integer. @@ -1014,6 +1095,7 @@ static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_dig return MP_OKAY; } +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* Reduce a modulo m into r. (r = a mod m) * * r A single precision number that is the reduced result. @@ -1028,86 +1110,6 @@ static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -extern sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b); -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. 
- */ -static WC_INLINE int sp_2048_div_32_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[64]; - sp_digit t2[33]; - sp_digit div; - sp_digit r1; - int i; -#ifdef HAVE_INTEL_AVX2 - word32 cpuid_flags = cpuid_get_flags(); -#endif - - (void)m; - - div = d[31]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 32); - for (i = 31; i > 0; i--) { - if (t1[i + 32] != d[i]) - break; - } - if (t1[i + 32] >= d[i]) { - sp_2048_sub_in_place_32(&t1[32], d); - } - for (i=31; i>=0; i--) { - sp_digit hi = t1[32 + i] - (t1[32 + i] == div); - r1 = div_2048_word_32(hi, t1[32 + i - 1], div); - -#ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) - sp_2048_mul_d_avx2_32(t2, d, r1); - else -#endif - sp_2048_mul_d_32(t2, d, r1); - t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2); - t1[32 + i] -= t2[32]; - if (t1[32 + i] != 0) { - t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); - if (t1[32 + i] != 0) - t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d); - } - } - - for (i = 31; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_2048_sub_32(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 32); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_2048_mod_32_cond(sp_digit* r, const sp_digit* a, - const sp_digit* m) -{ - return sp_2048_div_32_cond(a, m, NULL, r); -} - -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -1263,8 +1265,8 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, return err; } -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ extern void sp_2048_mont_reduce_avx2_32(sp_digit* a, const sp_digit* m, sp_digit mp); #ifdef HAVE_INTEL_AVX2 /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -1458,8 +1460,8 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi return err; } #endif /* HAVE_INTEL_AVX2 */ -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -1619,6 +1621,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, return err; } +#ifndef WOLFSSL_RSA_PUBLIC_ONLY #if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) /* RSA private key operation. * @@ -1868,6 +1871,7 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, return err; } #endif /* SP_RSA_PRIVATE_EXP_D | RSA_LOW_MEM */ +#endif /* WOLFSSL_RSA_PUBLIC_ONLY */ #endif /* WOLFSSL_HAVE_SP_RSA */ #if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ !defined(WOLFSSL_RSA_PUBLIC_ONLY)) @@ -2510,12 +2514,14 @@ static void sp_3072_to_bin_48(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. 
*/ #define sp_3072_norm_48(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -3157,7 +3163,7 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, const sp_digit* a, const sp_digi #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -3172,7 +3178,7 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_48(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ extern sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); extern void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp); /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -3205,8 +3211,7 @@ static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_48(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) -extern sp_digit sp_3072_cond_sub_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); +extern sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_3072_mul_d_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit b); #ifdef _WIN64 #if _MSC_VER < 1920 @@ -3249,6 +3254,86 @@ static WC_INLINE sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, return r; } #endif /* _WIN64 */ +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[96]; + sp_digit t2[49]; + sp_digit div; + sp_digit r1; + int i; +#ifdef HAVE_INTEL_AVX2 + word32 cpuid_flags = cpuid_get_flags(); +#endif + + (void)m; + + div = d[47]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); + for (i = 47; i > 0; i--) { + if (t1[i + 48] != d[i]) + break; + } + if (t1[i + 48] >= d[i]) { + sp_3072_sub_in_place_48(&t1[48], d); + } + for (i=47; i>=0; i--) { + sp_digit hi = t1[48 + i] - (t1[48 + i] == div); + r1 = div_3072_word_48(hi, t1[48 + i - 1], div); + +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) + sp_3072_mul_d_avx2_48(t2, d, r1); + else +#endif + sp_3072_mul_d_48(t2, d, r1); + t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2); + t1[48 + i] -= t2[48]; + if (t1[48 + i] != 0) { + t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); + if (t1[48 + i] != 0) + t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); + } + } + + for (i = 47; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_3072_sub_48(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 48); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. 
+ * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_3072_mod_48_cond(sp_digit* r, const sp_digit* a, + const sp_digit* m) +{ + return sp_3072_div_48_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern sp_digit sp_3072_cond_sub_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); /* AND m into each word of a and store in r. * * r A single precision integer. @@ -3341,6 +3426,7 @@ static WC_INLINE int sp_3072_div_48(const sp_digit* a, const sp_digit* d, sp_dig return MP_OKAY; } +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* Reduce a modulo m into r. (r = a mod m) * * r A single precision number that is the reduced result. @@ -3355,86 +3441,6 @@ static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -extern sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b); -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. - * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_div_48_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[96]; - sp_digit t2[49]; - sp_digit div; - sp_digit r1; - int i; -#ifdef HAVE_INTEL_AVX2 - word32 cpuid_flags = cpuid_get_flags(); -#endif - - (void)m; - - div = d[47]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 48); - for (i = 47; i > 0; i--) { - if (t1[i + 48] != d[i]) - break; - } - if (t1[i + 48] >= d[i]) { - sp_3072_sub_in_place_48(&t1[48], d); - } - for (i=47; i>=0; i--) { - sp_digit hi = t1[48 + i] - (t1[48 + i] == div); - r1 = div_3072_word_48(hi, t1[48 + i - 1], div); - -#ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) - sp_3072_mul_d_avx2_48(t2, d, r1); - else -#endif - sp_3072_mul_d_48(t2, d, r1); - t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2); - t1[48 + i] -= t2[48]; - if (t1[48 + i] != 0) { - t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); - if (t1[48 + i] != 0) - t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d); - } - } - - for (i = 47; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_3072_sub_48(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 48); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_3072_mod_48_cond(sp_digit* r, const sp_digit* a, - const sp_digit* m) -{ - return sp_3072_div_48_cond(a, m, NULL, r); -} - -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
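[Editor's note] In these sp_x86_64.c hunks the sp_*_cond_sub_avx2_* declarations move behind the (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH guard, and the relocated *_div_*_cond() tails now end with a compare loop plus sp_*_sub_*() rather than a masked conditional subtract. For contrast, the constant-time conditional subtract that the pextq assembly implements behaves like the C model below; the function name, toy word count and digit type are assumptions for illustration, not wolfSSL API. (With an all-ones mask pext returns the source word unchanged and with a zero mask it returns zero, so a plain AND gives the same effect here.)

    #include <stdint.h>

    #define N 4                  /* toy word count; the real routines use 32..64 */
    typedef uint64_t digit;

    /* r = a - (b masked by m) over N words, where m is 0 or all-ones.
     * The same instructions execute for either mask value, so the decision
     * to subtract does not change the timing. */
    static digit cond_sub_sketch(digit* r, const digit* a, const digit* b,
                                 digit m)
    {
        digit borrow = 0;
        for (int i = 0; i < N; i++) {
            digit bi = b[i] & m;     /* the asm gets this effect with pextq  */
            digit d1 = a[i] - bi;    /* subtract the (possibly zeroed) word  */
            digit d2 = d1 - borrow;  /* then the borrow from the prior word  */
            borrow = (digit)((a[i] < bi) | (d1 < borrow));
            r[i] = d2;
        }
        return (digit)0 - borrow;    /* 0 or all-ones, like the final sbb    */
    }

Every word of b goes through the mask, so the executed instruction stream is identical whether or not the subtraction takes effect; the private-key and DH paths keep this form, while the public verify path can use the simpler variable-time subtract because its operands are not secret.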
@@ -3590,8 +3596,8 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, return err; } -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ extern void sp_3072_mont_reduce_avx2_48(sp_digit* a, const sp_digit* m, sp_digit mp); #ifdef HAVE_INTEL_AVX2 /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -3785,8 +3791,8 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi return err; } #endif /* HAVE_INTEL_AVX2 */ -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -3946,6 +3952,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, return err; } +#ifndef WOLFSSL_RSA_PUBLIC_ONLY #if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) /* RSA private key operation. * @@ -4195,6 +4202,7 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, return err; } #endif /* SP_RSA_PRIVATE_EXP_D | RSA_LOW_MEM */ +#endif /* WOLFSSL_RSA_PUBLIC_ONLY */ #endif /* WOLFSSL_HAVE_SP_RSA */ #if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ !defined(WOLFSSL_RSA_PUBLIC_ONLY)) @@ -4837,12 +4845,14 @@ static void sp_4096_to_bin_64(sp_digit* r, byte* a) } } +#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH) /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. */ #define sp_4096_norm_64(a) +#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */ /* Normalize the values in each word to 64. * * a Array of sp_digit to normalize. @@ -4882,7 +4892,7 @@ static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) } extern void sp_4096_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b); -#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. * @@ -4897,7 +4907,7 @@ static void sp_4096_mont_norm_64(sp_digit* r, const sp_digit* m) sp_4096_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ extern sp_digit sp_4096_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); extern void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp); /* Multiply two Montogmery form numbers mod the modulus (prime). 
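[Editor's note] The sp_4096_mont_norm_64() hunk above relies on the observation stated in its comment: when the modulus occupies the full 4096 bits (top bit set), 2^n mod m is simply 2^n - m, i.e. the n-bit two's complement of m, hence "just need to subtract". A minimal C model of that single step follows; the digit width, toy size N and function name are assumptions for the example, not the patch's code.

    #include <stdint.h>

    #define N 4                    /* toy digit count standing in for 64 words */
    typedef uint32_t digit;

    /* r = 2^(32*N) mod m, assuming the top bit of m[N-1] is set (true for an
     * RSA modulus of the matching size). Then 2^(32*N) - m < m, so a single
     * subtraction already gives the fully reduced result. */
    static void mont_norm_sketch(digit r[N], const digit m[N])
    {
        uint64_t borrow = 0;
        for (int i = 0; i < N; i++) {
            uint64_t s = (uint64_t)0 - m[i] - borrow;   /* 0 - m, chained */
            r[i] = (digit)s;
            borrow = (s >> 32) & 1;
        }
        /* The borrow out of the top word is the implicit 2^(32*N) being spent. */
    }

The visible tail of the real routine, sp_4096_sub_in_place_64(r, m), performs exactly this subtraction in place over a zeroed r.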
@@ -4930,8 +4940,7 @@ static void sp_4096_mont_sqr_64(sp_digit* r, const sp_digit* a, sp_4096_mont_reduce_64(r, m, mp); } -#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) -extern sp_digit sp_4096_cond_sub_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); +extern sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_4096_mul_d_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit b); #ifdef _WIN64 #if _MSC_VER < 1920 @@ -4974,6 +4983,86 @@ static WC_INLINE sp_digit div_4096_word_64(sp_digit d1, sp_digit d0, return r; } #endif /* _WIN64 */ +/* Divide d in a and put remainder into r (m*d + r = a) + * m is not calculated as it is not needed at this time. + * + * a Number to be divided. + * d Number to divide with. + * m Multiplier result. + * r Remainder from the division. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, + sp_digit* r) +{ + sp_digit t1[128]; + sp_digit t2[65]; + sp_digit div; + sp_digit r1; + int i; +#ifdef HAVE_INTEL_AVX2 + word32 cpuid_flags = cpuid_get_flags(); +#endif + + (void)m; + + div = d[63]; + XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); + for (i = 63; i > 0; i--) { + if (t1[i + 64] != d[i]) + break; + } + if (t1[i + 64] >= d[i]) { + sp_4096_sub_in_place_64(&t1[64], d); + } + for (i=63; i>=0; i--) { + sp_digit hi = t1[64 + i] - (t1[64 + i] == div); + r1 = div_4096_word_64(hi, t1[64 + i - 1], div); + +#ifdef HAVE_INTEL_AVX2 + if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) + sp_4096_mul_d_avx2_64(t2, d, r1); + else +#endif + sp_4096_mul_d_64(t2, d, r1); + t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2); + t1[64 + i] -= t2[64]; + if (t1[64 + i] != 0) { + t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); + if (t1[64 + i] != 0) + t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); + } + } + + for (i = 63; i > 0; i--) { + if (t1[i] != d[i]) + break; + } + if (t1[i] >= d[i]) { + sp_4096_sub_64(r, t1, d); + } + else { + XMEMCPY(r, t1, sizeof(*t1) * 64); + } + + return MP_OKAY; +} + +/* Reduce a modulo m into r. (r = a mod m) + * + * r A single precision number that is the reduced result. + * a A single precision number that is to be reduced. + * m A single precision number that is the modulus to reduce with. + * returns MP_OKAY indicating success. + */ +static WC_INLINE int sp_4096_mod_64_cond(sp_digit* r, const sp_digit* a, + const sp_digit* m) +{ + return sp_4096_div_64_cond(a, m, NULL, r); +} + +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern sp_digit sp_4096_cond_sub_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); /* AND m into each word of a and store in r. * * r A single precision integer. @@ -5066,6 +5155,7 @@ static WC_INLINE int sp_4096_div_64(const sp_digit* a, const sp_digit* d, sp_dig return MP_OKAY; } +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* Reduce a modulo m into r. (r = a mod m) * * r A single precision number that is the reduced result. @@ -5080,86 +5170,6 @@ static WC_INLINE int sp_4096_mod_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ -extern sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b); -/* Divide d in a and put remainder into r (m*d + r = a) - * m is not calculated as it is not needed at this time. - * - * a Number to be divided. - * d Number to divide with. - * m Multiplier result. 
- * r Remainder from the division. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m, - sp_digit* r) -{ - sp_digit t1[128]; - sp_digit t2[65]; - sp_digit div; - sp_digit r1; - int i; -#ifdef HAVE_INTEL_AVX2 - word32 cpuid_flags = cpuid_get_flags(); -#endif - - (void)m; - - div = d[63]; - XMEMCPY(t1, a, sizeof(*t1) * 2 * 64); - for (i = 63; i > 0; i--) { - if (t1[i + 64] != d[i]) - break; - } - if (t1[i + 64] >= d[i]) { - sp_4096_sub_in_place_64(&t1[64], d); - } - for (i=63; i>=0; i--) { - sp_digit hi = t1[64 + i] - (t1[64 + i] == div); - r1 = div_4096_word_64(hi, t1[64 + i - 1], div); - -#ifdef HAVE_INTEL_AVX2 - if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) - sp_4096_mul_d_avx2_64(t2, d, r1); - else -#endif - sp_4096_mul_d_64(t2, d, r1); - t1[64 + i] += sp_4096_sub_in_place_64(&t1[i], t2); - t1[64 + i] -= t2[64]; - if (t1[64 + i] != 0) { - t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); - if (t1[64 + i] != 0) - t1[64 + i] += sp_4096_add_64(&t1[i], &t1[i], d); - } - } - - for (i = 63; i > 0; i--) { - if (t1[i] != d[i]) - break; - } - if (t1[i] >= d[i]) { - sp_4096_sub_64(r, t1, d); - } - else { - XMEMCPY(r, t1, sizeof(*t1) * 64); - } - - return MP_OKAY; -} - -/* Reduce a modulo m into r. (r = a mod m) - * - * r A single precision number that is the reduced result. - * a A single precision number that is to be reduced. - * m A single precision number that is the modulus to reduce with. - * returns MP_OKAY indicating success. - */ -static WC_INLINE int sp_4096_mod_64_cond(sp_digit* r, const sp_digit* a, - const sp_digit* m) -{ - return sp_4096_div_64_cond(a, m, NULL, r); -} - -#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. @@ -5315,8 +5325,8 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, return err; } -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ extern void sp_4096_mont_reduce_avx2_64(sp_digit* a, const sp_digit* m, sp_digit mp); #ifdef HAVE_INTEL_AVX2 /* Multiply two Montogmery form numbers mod the modulus (prime). @@ -5510,8 +5520,8 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi return err; } #endif /* HAVE_INTEL_AVX2 */ -#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. * @@ -5671,6 +5681,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, return err; } +#ifndef WOLFSSL_RSA_PUBLIC_ONLY #if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM) /* RSA private key operation. 
* @@ -5920,6 +5931,7 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, return err; } #endif /* SP_RSA_PRIVATE_EXP_D | RSA_LOW_MEM */ +#endif /* WOLFSSL_RSA_PUBLIC_ONLY */ #endif /* WOLFSSL_HAVE_SP_RSA */ #if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \ !defined(WOLFSSL_RSA_PUBLIC_ONLY)) diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index 3fc9b3365..6b3dfa5c0 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -11067,194 +11067,126 @@ L_2048_mont_loop_32: #ifndef __APPLE__ .size sp_2048_mont_reduce_32,.-sp_2048_mont_reduce_32 #endif /* __APPLE__ */ -#ifdef HAVE_INTEL_AVX2 -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. +/* Sub b from a into r. (r = a - b) * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. */ #ifndef __APPLE__ .text -.globl sp_2048_cond_sub_avx2_32 -.type sp_2048_cond_sub_avx2_32,@function +.globl sp_2048_sub_32 +.type sp_2048_sub_32,@function .align 16 -sp_2048_cond_sub_avx2_32: +sp_2048_sub_32: #else .section __TEXT,__text -.globl _sp_2048_cond_sub_avx2_32 +.globl _sp_2048_sub_32 .p2align 4 -_sp_2048_cond_sub_avx2_32: +_sp_2048_sub_32: #endif /* __APPLE__ */ - movq $0x00, %rax - movq (%rdx), %r10 - movq (%rsi), %r8 - pextq %rcx, %r10, %r10 - subq %r10, %r8 - movq 8(%rdx), %r10 - movq 8(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, (%rdi) - sbbq %r10, %r9 - movq 16(%rdx), %r8 - movq 16(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 8(%rdi) - sbbq %r8, %r10 - movq 24(%rdx), %r9 + movq (%rsi), %rcx + xorq %rax, %rax + subq (%rdx), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + sbbq 8(%rdx), %r8 + movq 16(%rsi), %rcx + movq %r8, 8(%rdi) + sbbq 16(%rdx), %rcx movq 24(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 16(%rdi) - sbbq %r9, %r8 - movq 32(%rdx), %r10 - movq 32(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 16(%rdi) + sbbq 24(%rdx), %r8 + movq 32(%rsi), %rcx movq %r8, 24(%rdi) - sbbq %r10, %r9 - movq 40(%rdx), %r8 - movq 40(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 32(%rdi) - sbbq %r8, %r10 - movq 48(%rdx), %r9 - movq 48(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 40(%rdi) - sbbq %r9, %r8 - movq 56(%rdx), %r10 - movq 56(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 48(%rdi) - sbbq %r10, %r9 - movq 64(%rdx), %r8 - movq 64(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 56(%rdi) - sbbq %r8, %r10 - movq 72(%rdx), %r9 + sbbq 32(%rdx), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%rdi) + sbbq 40(%rdx), %r8 + movq 48(%rsi), %rcx + movq %r8, 40(%rdi) + sbbq 48(%rdx), %rcx + movq 56(%rsi), %r8 + movq %rcx, 48(%rdi) + sbbq 56(%rdx), %r8 + movq 64(%rsi), %rcx + movq %r8, 56(%rdi) + sbbq 64(%rdx), %rcx movq 72(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 64(%rdi) - sbbq %r9, %r8 - movq 80(%rdx), %r10 - movq 80(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 64(%rdi) + sbbq 72(%rdx), %r8 + movq 80(%rsi), %rcx movq %r8, 72(%rdi) - sbbq %r10, %r9 - movq 88(%rdx), %r8 - movq 88(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 80(%rdi) - sbbq %r8, %r10 - movq 96(%rdx), %r9 - movq 96(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 88(%rdi) - sbbq %r9, %r8 - movq 104(%rdx), %r10 - movq 104(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 96(%rdi) - sbbq %r10, %r9 - movq 112(%rdx), 
%r8 - movq 112(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 104(%rdi) - sbbq %r8, %r10 - movq 120(%rdx), %r9 + sbbq 80(%rdx), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%rdi) + sbbq 88(%rdx), %r8 + movq 96(%rsi), %rcx + movq %r8, 88(%rdi) + sbbq 96(%rdx), %rcx + movq 104(%rsi), %r8 + movq %rcx, 96(%rdi) + sbbq 104(%rdx), %r8 + movq 112(%rsi), %rcx + movq %r8, 104(%rdi) + sbbq 112(%rdx), %rcx movq 120(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 112(%rdi) - sbbq %r9, %r8 - movq 128(%rdx), %r10 - movq 128(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 112(%rdi) + sbbq 120(%rdx), %r8 + movq 128(%rsi), %rcx movq %r8, 120(%rdi) - sbbq %r10, %r9 - movq 136(%rdx), %r8 - movq 136(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 128(%rdi) - sbbq %r8, %r10 - movq 144(%rdx), %r9 - movq 144(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 136(%rdi) - sbbq %r9, %r8 - movq 152(%rdx), %r10 - movq 152(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 144(%rdi) - sbbq %r10, %r9 - movq 160(%rdx), %r8 - movq 160(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 152(%rdi) - sbbq %r8, %r10 - movq 168(%rdx), %r9 + sbbq 128(%rdx), %rcx + movq 136(%rsi), %r8 + movq %rcx, 128(%rdi) + sbbq 136(%rdx), %r8 + movq 144(%rsi), %rcx + movq %r8, 136(%rdi) + sbbq 144(%rdx), %rcx + movq 152(%rsi), %r8 + movq %rcx, 144(%rdi) + sbbq 152(%rdx), %r8 + movq 160(%rsi), %rcx + movq %r8, 152(%rdi) + sbbq 160(%rdx), %rcx movq 168(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 160(%rdi) - sbbq %r9, %r8 - movq 176(%rdx), %r10 - movq 176(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 160(%rdi) + sbbq 168(%rdx), %r8 + movq 176(%rsi), %rcx movq %r8, 168(%rdi) - sbbq %r10, %r9 - movq 184(%rdx), %r8 - movq 184(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 176(%rdi) - sbbq %r8, %r10 - movq 192(%rdx), %r9 - movq 192(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 184(%rdi) - sbbq %r9, %r8 - movq 200(%rdx), %r10 - movq 200(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 192(%rdi) - sbbq %r10, %r9 - movq 208(%rdx), %r8 - movq 208(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 200(%rdi) - sbbq %r8, %r10 - movq 216(%rdx), %r9 + sbbq 176(%rdx), %rcx + movq 184(%rsi), %r8 + movq %rcx, 176(%rdi) + sbbq 184(%rdx), %r8 + movq 192(%rsi), %rcx + movq %r8, 184(%rdi) + sbbq 192(%rdx), %rcx + movq 200(%rsi), %r8 + movq %rcx, 192(%rdi) + sbbq 200(%rdx), %r8 + movq 208(%rsi), %rcx + movq %r8, 200(%rdi) + sbbq 208(%rdx), %rcx movq 216(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 208(%rdi) - sbbq %r9, %r8 - movq 224(%rdx), %r10 - movq 224(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 208(%rdi) + sbbq 216(%rdx), %r8 + movq 224(%rsi), %rcx movq %r8, 216(%rdi) - sbbq %r10, %r9 - movq 232(%rdx), %r8 - movq 232(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 224(%rdi) - sbbq %r8, %r10 - movq 240(%rdx), %r9 - movq 240(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 232(%rdi) - sbbq %r9, %r8 - movq 248(%rdx), %r10 - movq 248(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 240(%rdi) - sbbq %r10, %r9 - movq %r9, 248(%rdi) + sbbq 224(%rdx), %rcx + movq 232(%rsi), %r8 + movq %rcx, 224(%rdi) + sbbq 232(%rdx), %r8 + movq 240(%rsi), %rcx + movq %r8, 232(%rdi) + sbbq 240(%rdx), %rcx + movq 248(%rsi), %r8 + movq %rcx, 240(%rdi) + sbbq 248(%rdx), %r8 + movq %r8, 248(%rdi) sbbq $0x00, %rax repz retq #ifndef __APPLE__ -.size sp_2048_cond_sub_avx2_32,.-sp_2048_cond_sub_avx2_32 +.size sp_2048_sub_32,.-sp_2048_sub_32 #endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 /* Mul a by digit b into r. 
(r = a * b) * @@ -11502,6 +11434,194 @@ _div_2048_word_asm_32: .size div_2048_word_asm_32,.-div_2048_word_asm_32 #endif /* __APPLE__ */ #endif /* _WIN64 */ +#ifdef HAVE_INTEL_AVX2 +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +#ifndef __APPLE__ +.text +.globl sp_2048_cond_sub_avx2_32 +.type sp_2048_cond_sub_avx2_32,@function +.align 16 +sp_2048_cond_sub_avx2_32: +#else +.section __TEXT,__text +.globl _sp_2048_cond_sub_avx2_32 +.p2align 4 +_sp_2048_cond_sub_avx2_32: +#endif /* __APPLE__ */ + movq $0x00, %rax + movq (%rdx), %r10 + movq (%rsi), %r8 + pextq %rcx, %r10, %r10 + subq %r10, %r8 + movq 8(%rdx), %r10 + movq 8(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, (%rdi) + sbbq %r10, %r9 + movq 16(%rdx), %r8 + movq 16(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 8(%rdi) + sbbq %r8, %r10 + movq 24(%rdx), %r9 + movq 24(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 16(%rdi) + sbbq %r9, %r8 + movq 32(%rdx), %r10 + movq 32(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 24(%rdi) + sbbq %r10, %r9 + movq 40(%rdx), %r8 + movq 40(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 32(%rdi) + sbbq %r8, %r10 + movq 48(%rdx), %r9 + movq 48(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 40(%rdi) + sbbq %r9, %r8 + movq 56(%rdx), %r10 + movq 56(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 48(%rdi) + sbbq %r10, %r9 + movq 64(%rdx), %r8 + movq 64(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 56(%rdi) + sbbq %r8, %r10 + movq 72(%rdx), %r9 + movq 72(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 64(%rdi) + sbbq %r9, %r8 + movq 80(%rdx), %r10 + movq 80(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 72(%rdi) + sbbq %r10, %r9 + movq 88(%rdx), %r8 + movq 88(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 80(%rdi) + sbbq %r8, %r10 + movq 96(%rdx), %r9 + movq 96(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 88(%rdi) + sbbq %r9, %r8 + movq 104(%rdx), %r10 + movq 104(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 96(%rdi) + sbbq %r10, %r9 + movq 112(%rdx), %r8 + movq 112(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 104(%rdi) + sbbq %r8, %r10 + movq 120(%rdx), %r9 + movq 120(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 112(%rdi) + sbbq %r9, %r8 + movq 128(%rdx), %r10 + movq 128(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 120(%rdi) + sbbq %r10, %r9 + movq 136(%rdx), %r8 + movq 136(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 128(%rdi) + sbbq %r8, %r10 + movq 144(%rdx), %r9 + movq 144(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 136(%rdi) + sbbq %r9, %r8 + movq 152(%rdx), %r10 + movq 152(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 144(%rdi) + sbbq %r10, %r9 + movq 160(%rdx), %r8 + movq 160(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 152(%rdi) + sbbq %r8, %r10 + movq 168(%rdx), %r9 + movq 168(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 160(%rdi) + sbbq %r9, %r8 + movq 176(%rdx), %r10 + movq 176(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 168(%rdi) + sbbq %r10, %r9 + movq 184(%rdx), %r8 + movq 184(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 176(%rdi) + sbbq %r8, %r10 + movq 192(%rdx), %r9 + movq 192(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 184(%rdi) + sbbq %r9, %r8 + movq 200(%rdx), %r10 + movq 200(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 192(%rdi) + sbbq %r10, %r9 + movq 208(%rdx), %r8 + movq 208(%rsi), %r10 + pextq %rcx, 
%r8, %r8 + movq %r9, 200(%rdi) + sbbq %r8, %r10 + movq 216(%rdx), %r9 + movq 216(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 208(%rdi) + sbbq %r9, %r8 + movq 224(%rdx), %r10 + movq 224(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 216(%rdi) + sbbq %r10, %r9 + movq 232(%rdx), %r8 + movq 232(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 224(%rdi) + sbbq %r8, %r10 + movq 240(%rdx), %r9 + movq 240(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 232(%rdi) + sbbq %r9, %r8 + movq 248(%rdx), %r10 + movq 248(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 240(%rdi) + sbbq %r10, %r9 + movq %r9, 248(%rdi) + sbbq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_2048_cond_sub_avx2_32,.-sp_2048_cond_sub_avx2_32 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Compare a with b in constant time. * * a A single precision integer. @@ -11786,126 +11906,6 @@ _sp_2048_cmp_32: #ifndef __APPLE__ .size sp_2048_cmp_32,.-sp_2048_cmp_32 #endif /* __APPLE__ */ -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_sub_32 -.type sp_2048_sub_32,@function -.align 16 -sp_2048_sub_32: -#else -.section __TEXT,__text -.globl _sp_2048_sub_32 -.p2align 4 -_sp_2048_sub_32: -#endif /* __APPLE__ */ - movq (%rsi), %rcx - xorq %rax, %rax - subq (%rdx), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - sbbq 8(%rdx), %r8 - movq 16(%rsi), %rcx - movq %r8, 8(%rdi) - sbbq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - sbbq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - sbbq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - sbbq 40(%rdx), %r8 - movq 48(%rsi), %rcx - movq %r8, 40(%rdi) - sbbq 48(%rdx), %rcx - movq 56(%rsi), %r8 - movq %rcx, 48(%rdi) - sbbq 56(%rdx), %r8 - movq 64(%rsi), %rcx - movq %r8, 56(%rdi) - sbbq 64(%rdx), %rcx - movq 72(%rsi), %r8 - movq %rcx, 64(%rdi) - sbbq 72(%rdx), %r8 - movq 80(%rsi), %rcx - movq %r8, 72(%rdi) - sbbq 80(%rdx), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%rdi) - sbbq 88(%rdx), %r8 - movq 96(%rsi), %rcx - movq %r8, 88(%rdi) - sbbq 96(%rdx), %rcx - movq 104(%rsi), %r8 - movq %rcx, 96(%rdi) - sbbq 104(%rdx), %r8 - movq 112(%rsi), %rcx - movq %r8, 104(%rdi) - sbbq 112(%rdx), %rcx - movq 120(%rsi), %r8 - movq %rcx, 112(%rdi) - sbbq 120(%rdx), %r8 - movq 128(%rsi), %rcx - movq %r8, 120(%rdi) - sbbq 128(%rdx), %rcx - movq 136(%rsi), %r8 - movq %rcx, 128(%rdi) - sbbq 136(%rdx), %r8 - movq 144(%rsi), %rcx - movq %r8, 136(%rdi) - sbbq 144(%rdx), %rcx - movq 152(%rsi), %r8 - movq %rcx, 144(%rdi) - sbbq 152(%rdx), %r8 - movq 160(%rsi), %rcx - movq %r8, 152(%rdi) - sbbq 160(%rdx), %rcx - movq 168(%rsi), %r8 - movq %rcx, 160(%rdi) - sbbq 168(%rdx), %r8 - movq 176(%rsi), %rcx - movq %r8, 168(%rdi) - sbbq 176(%rdx), %rcx - movq 184(%rsi), %r8 - movq %rcx, 176(%rdi) - sbbq 184(%rdx), %r8 - movq 192(%rsi), %rcx - movq %r8, 184(%rdi) - sbbq 192(%rdx), %rcx - movq 200(%rsi), %r8 - movq %rcx, 192(%rdi) - sbbq 200(%rdx), %r8 - movq 208(%rsi), %rcx - movq %r8, 200(%rdi) - sbbq 208(%rdx), %rcx - movq 216(%rsi), %r8 - movq %rcx, 208(%rdi) - sbbq 216(%rdx), %r8 - movq 224(%rsi), %rcx - movq %r8, 216(%rdi) - sbbq 224(%rdx), %rcx - movq 232(%rsi), %r8 - movq %rcx, 224(%rdi) - sbbq 232(%rdx), %r8 - movq 240(%rsi), %rcx - movq %r8, 232(%rdi) - sbbq 240(%rdx), %rcx - movq 248(%rsi), %r8 - movq %rcx, 240(%rdi) - sbbq 248(%rdx), %r8 - movq %r8, 248(%rdi) - sbbq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_sub_32,.-sp_2048_sub_32 
-#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 2048 bits using Montgomery reduction. * @@ -25764,274 +25764,174 @@ L_3072_mont_loop_48: #ifndef __APPLE__ .size sp_3072_mont_reduce_48,.-sp_3072_mont_reduce_48 #endif /* __APPLE__ */ -#ifdef HAVE_INTEL_AVX2 -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. +/* Sub b from a into r. (r = a - b) * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. */ #ifndef __APPLE__ .text -.globl sp_3072_cond_sub_avx2_48 -.type sp_3072_cond_sub_avx2_48,@function +.globl sp_3072_sub_48 +.type sp_3072_sub_48,@function .align 16 -sp_3072_cond_sub_avx2_48: +sp_3072_sub_48: #else .section __TEXT,__text -.globl _sp_3072_cond_sub_avx2_48 +.globl _sp_3072_sub_48 .p2align 4 -_sp_3072_cond_sub_avx2_48: +_sp_3072_sub_48: #endif /* __APPLE__ */ - movq $0x00, %rax - movq (%rdx), %r10 - movq (%rsi), %r8 - pextq %rcx, %r10, %r10 - subq %r10, %r8 - movq 8(%rdx), %r10 - movq 8(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, (%rdi) - sbbq %r10, %r9 - movq 16(%rdx), %r8 - movq 16(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 8(%rdi) - sbbq %r8, %r10 - movq 24(%rdx), %r9 + movq (%rsi), %rcx + xorq %rax, %rax + subq (%rdx), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + sbbq 8(%rdx), %r8 + movq 16(%rsi), %rcx + movq %r8, 8(%rdi) + sbbq 16(%rdx), %rcx movq 24(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 16(%rdi) - sbbq %r9, %r8 - movq 32(%rdx), %r10 - movq 32(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 16(%rdi) + sbbq 24(%rdx), %r8 + movq 32(%rsi), %rcx movq %r8, 24(%rdi) - sbbq %r10, %r9 - movq 40(%rdx), %r8 - movq 40(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 32(%rdi) - sbbq %r8, %r10 - movq 48(%rdx), %r9 - movq 48(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 40(%rdi) - sbbq %r9, %r8 - movq 56(%rdx), %r10 - movq 56(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 48(%rdi) - sbbq %r10, %r9 - movq 64(%rdx), %r8 - movq 64(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 56(%rdi) - sbbq %r8, %r10 - movq 72(%rdx), %r9 + sbbq 32(%rdx), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%rdi) + sbbq 40(%rdx), %r8 + movq 48(%rsi), %rcx + movq %r8, 40(%rdi) + sbbq 48(%rdx), %rcx + movq 56(%rsi), %r8 + movq %rcx, 48(%rdi) + sbbq 56(%rdx), %r8 + movq 64(%rsi), %rcx + movq %r8, 56(%rdi) + sbbq 64(%rdx), %rcx movq 72(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 64(%rdi) - sbbq %r9, %r8 - movq 80(%rdx), %r10 - movq 80(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 64(%rdi) + sbbq 72(%rdx), %r8 + movq 80(%rsi), %rcx movq %r8, 72(%rdi) - sbbq %r10, %r9 - movq 88(%rdx), %r8 - movq 88(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 80(%rdi) - sbbq %r8, %r10 - movq 96(%rdx), %r9 - movq 96(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 88(%rdi) - sbbq %r9, %r8 - movq 104(%rdx), %r10 - movq 104(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 96(%rdi) - sbbq %r10, %r9 - movq 112(%rdx), %r8 - movq 112(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 104(%rdi) - sbbq %r8, %r10 - movq 120(%rdx), %r9 + sbbq 80(%rdx), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%rdi) + sbbq 88(%rdx), %r8 + movq 96(%rsi), %rcx + movq %r8, 88(%rdi) + sbbq 96(%rdx), %rcx + movq 104(%rsi), %r8 + movq %rcx, 96(%rdi) + sbbq 104(%rdx), %r8 + movq 112(%rsi), %rcx + movq %r8, 104(%rdi) + sbbq 112(%rdx), %rcx 
movq 120(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 112(%rdi) - sbbq %r9, %r8 - movq 128(%rdx), %r10 - movq 128(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 112(%rdi) + sbbq 120(%rdx), %r8 + movq 128(%rsi), %rcx movq %r8, 120(%rdi) - sbbq %r10, %r9 - movq 136(%rdx), %r8 - movq 136(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 128(%rdi) - sbbq %r8, %r10 - movq 144(%rdx), %r9 - movq 144(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 136(%rdi) - sbbq %r9, %r8 - movq 152(%rdx), %r10 - movq 152(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 144(%rdi) - sbbq %r10, %r9 - movq 160(%rdx), %r8 - movq 160(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 152(%rdi) - sbbq %r8, %r10 - movq 168(%rdx), %r9 + sbbq 128(%rdx), %rcx + movq 136(%rsi), %r8 + movq %rcx, 128(%rdi) + sbbq 136(%rdx), %r8 + movq 144(%rsi), %rcx + movq %r8, 136(%rdi) + sbbq 144(%rdx), %rcx + movq 152(%rsi), %r8 + movq %rcx, 144(%rdi) + sbbq 152(%rdx), %r8 + movq 160(%rsi), %rcx + movq %r8, 152(%rdi) + sbbq 160(%rdx), %rcx movq 168(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 160(%rdi) - sbbq %r9, %r8 - movq 176(%rdx), %r10 - movq 176(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 160(%rdi) + sbbq 168(%rdx), %r8 + movq 176(%rsi), %rcx movq %r8, 168(%rdi) - sbbq %r10, %r9 - movq 184(%rdx), %r8 - movq 184(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 176(%rdi) - sbbq %r8, %r10 - movq 192(%rdx), %r9 - movq 192(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 184(%rdi) - sbbq %r9, %r8 - movq 200(%rdx), %r10 - movq 200(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 192(%rdi) - sbbq %r10, %r9 - movq 208(%rdx), %r8 - movq 208(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 200(%rdi) - sbbq %r8, %r10 - movq 216(%rdx), %r9 + sbbq 176(%rdx), %rcx + movq 184(%rsi), %r8 + movq %rcx, 176(%rdi) + sbbq 184(%rdx), %r8 + movq 192(%rsi), %rcx + movq %r8, 184(%rdi) + sbbq 192(%rdx), %rcx + movq 200(%rsi), %r8 + movq %rcx, 192(%rdi) + sbbq 200(%rdx), %r8 + movq 208(%rsi), %rcx + movq %r8, 200(%rdi) + sbbq 208(%rdx), %rcx movq 216(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 208(%rdi) - sbbq %r9, %r8 - movq 224(%rdx), %r10 - movq 224(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 208(%rdi) + sbbq 216(%rdx), %r8 + movq 224(%rsi), %rcx movq %r8, 216(%rdi) - sbbq %r10, %r9 - movq 232(%rdx), %r8 - movq 232(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 224(%rdi) - sbbq %r8, %r10 - movq 240(%rdx), %r9 - movq 240(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 232(%rdi) - sbbq %r9, %r8 - movq 248(%rdx), %r10 - movq 248(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 240(%rdi) - sbbq %r10, %r9 - movq 256(%rdx), %r8 - movq 256(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 248(%rdi) - sbbq %r8, %r10 - movq 264(%rdx), %r9 + sbbq 224(%rdx), %rcx + movq 232(%rsi), %r8 + movq %rcx, 224(%rdi) + sbbq 232(%rdx), %r8 + movq 240(%rsi), %rcx + movq %r8, 232(%rdi) + sbbq 240(%rdx), %rcx + movq 248(%rsi), %r8 + movq %rcx, 240(%rdi) + sbbq 248(%rdx), %r8 + movq 256(%rsi), %rcx + movq %r8, 248(%rdi) + sbbq 256(%rdx), %rcx movq 264(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 256(%rdi) - sbbq %r9, %r8 - movq 272(%rdx), %r10 - movq 272(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 256(%rdi) + sbbq 264(%rdx), %r8 + movq 272(%rsi), %rcx movq %r8, 264(%rdi) - sbbq %r10, %r9 - movq 280(%rdx), %r8 - movq 280(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 272(%rdi) - sbbq %r8, %r10 - movq 288(%rdx), %r9 - movq 288(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 280(%rdi) - sbbq %r9, %r8 - movq 296(%rdx), %r10 - movq 296(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 
288(%rdi) - sbbq %r10, %r9 - movq 304(%rdx), %r8 - movq 304(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 296(%rdi) - sbbq %r8, %r10 - movq 312(%rdx), %r9 + sbbq 272(%rdx), %rcx + movq 280(%rsi), %r8 + movq %rcx, 272(%rdi) + sbbq 280(%rdx), %r8 + movq 288(%rsi), %rcx + movq %r8, 280(%rdi) + sbbq 288(%rdx), %rcx + movq 296(%rsi), %r8 + movq %rcx, 288(%rdi) + sbbq 296(%rdx), %r8 + movq 304(%rsi), %rcx + movq %r8, 296(%rdi) + sbbq 304(%rdx), %rcx movq 312(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 304(%rdi) - sbbq %r9, %r8 - movq 320(%rdx), %r10 - movq 320(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 304(%rdi) + sbbq 312(%rdx), %r8 + movq 320(%rsi), %rcx movq %r8, 312(%rdi) - sbbq %r10, %r9 - movq 328(%rdx), %r8 - movq 328(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 320(%rdi) - sbbq %r8, %r10 - movq 336(%rdx), %r9 - movq 336(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 328(%rdi) - sbbq %r9, %r8 - movq 344(%rdx), %r10 - movq 344(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 336(%rdi) - sbbq %r10, %r9 - movq 352(%rdx), %r8 - movq 352(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 344(%rdi) - sbbq %r8, %r10 - movq 360(%rdx), %r9 + sbbq 320(%rdx), %rcx + movq 328(%rsi), %r8 + movq %rcx, 320(%rdi) + sbbq 328(%rdx), %r8 + movq 336(%rsi), %rcx + movq %r8, 328(%rdi) + sbbq 336(%rdx), %rcx + movq 344(%rsi), %r8 + movq %rcx, 336(%rdi) + sbbq 344(%rdx), %r8 + movq 352(%rsi), %rcx + movq %r8, 344(%rdi) + sbbq 352(%rdx), %rcx movq 360(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 352(%rdi) - sbbq %r9, %r8 - movq 368(%rdx), %r10 - movq 368(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 352(%rdi) + sbbq 360(%rdx), %r8 + movq 368(%rsi), %rcx movq %r8, 360(%rdi) - sbbq %r10, %r9 - movq 376(%rdx), %r8 - movq 376(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 368(%rdi) - sbbq %r8, %r10 - movq %r10, 376(%rdi) + sbbq 368(%rdx), %rcx + movq 376(%rsi), %r8 + movq %rcx, 368(%rdi) + sbbq 376(%rdx), %r8 + movq %r8, 376(%rdi) sbbq $0x00, %rax repz retq #ifndef __APPLE__ -.size sp_3072_cond_sub_avx2_48,.-sp_3072_cond_sub_avx2_48 +.size sp_3072_sub_48,.-sp_3072_sub_48 #endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 /* Mul a by digit b into r. (r = a * b) * @@ -26375,6 +26275,274 @@ _div_3072_word_asm_48: .size div_3072_word_asm_48,.-div_3072_word_asm_48 #endif /* __APPLE__ */ #endif /* _WIN64 */ +#ifdef HAVE_INTEL_AVX2 +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +#ifndef __APPLE__ +.text +.globl sp_3072_cond_sub_avx2_48 +.type sp_3072_cond_sub_avx2_48,@function +.align 16 +sp_3072_cond_sub_avx2_48: +#else +.section __TEXT,__text +.globl _sp_3072_cond_sub_avx2_48 +.p2align 4 +_sp_3072_cond_sub_avx2_48: +#endif /* __APPLE__ */ + movq $0x00, %rax + movq (%rdx), %r10 + movq (%rsi), %r8 + pextq %rcx, %r10, %r10 + subq %r10, %r8 + movq 8(%rdx), %r10 + movq 8(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, (%rdi) + sbbq %r10, %r9 + movq 16(%rdx), %r8 + movq 16(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 8(%rdi) + sbbq %r8, %r10 + movq 24(%rdx), %r9 + movq 24(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 16(%rdi) + sbbq %r9, %r8 + movq 32(%rdx), %r10 + movq 32(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 24(%rdi) + sbbq %r10, %r9 + movq 40(%rdx), %r8 + movq 40(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 32(%rdi) + sbbq %r8, %r10 + movq 48(%rdx), %r9 + movq 48(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 40(%rdi) + sbbq %r9, %r8 + movq 56(%rdx), %r10 + movq 56(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 48(%rdi) + sbbq %r10, %r9 + movq 64(%rdx), %r8 + movq 64(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 56(%rdi) + sbbq %r8, %r10 + movq 72(%rdx), %r9 + movq 72(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 64(%rdi) + sbbq %r9, %r8 + movq 80(%rdx), %r10 + movq 80(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 72(%rdi) + sbbq %r10, %r9 + movq 88(%rdx), %r8 + movq 88(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 80(%rdi) + sbbq %r8, %r10 + movq 96(%rdx), %r9 + movq 96(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 88(%rdi) + sbbq %r9, %r8 + movq 104(%rdx), %r10 + movq 104(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 96(%rdi) + sbbq %r10, %r9 + movq 112(%rdx), %r8 + movq 112(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 104(%rdi) + sbbq %r8, %r10 + movq 120(%rdx), %r9 + movq 120(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 112(%rdi) + sbbq %r9, %r8 + movq 128(%rdx), %r10 + movq 128(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 120(%rdi) + sbbq %r10, %r9 + movq 136(%rdx), %r8 + movq 136(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 128(%rdi) + sbbq %r8, %r10 + movq 144(%rdx), %r9 + movq 144(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 136(%rdi) + sbbq %r9, %r8 + movq 152(%rdx), %r10 + movq 152(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 144(%rdi) + sbbq %r10, %r9 + movq 160(%rdx), %r8 + movq 160(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 152(%rdi) + sbbq %r8, %r10 + movq 168(%rdx), %r9 + movq 168(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 160(%rdi) + sbbq %r9, %r8 + movq 176(%rdx), %r10 + movq 176(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 168(%rdi) + sbbq %r10, %r9 + movq 184(%rdx), %r8 + movq 184(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 176(%rdi) + sbbq %r8, %r10 + movq 192(%rdx), %r9 + movq 192(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 184(%rdi) + sbbq %r9, %r8 + movq 200(%rdx), %r10 + movq 200(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 192(%rdi) + sbbq %r10, %r9 + movq 208(%rdx), %r8 + movq 208(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 200(%rdi) + sbbq %r8, %r10 + movq 216(%rdx), %r9 + movq 216(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 208(%rdi) + sbbq %r9, %r8 + movq 224(%rdx), %r10 + movq 224(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 216(%rdi) + sbbq %r10, %r9 + movq 232(%rdx), %r8 + movq 232(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 224(%rdi) + sbbq %r8, %r10 + movq 240(%rdx), %r9 + movq 240(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 232(%rdi) + sbbq %r9, %r8 + 
movq 248(%rdx), %r10 + movq 248(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 240(%rdi) + sbbq %r10, %r9 + movq 256(%rdx), %r8 + movq 256(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 248(%rdi) + sbbq %r8, %r10 + movq 264(%rdx), %r9 + movq 264(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 256(%rdi) + sbbq %r9, %r8 + movq 272(%rdx), %r10 + movq 272(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 264(%rdi) + sbbq %r10, %r9 + movq 280(%rdx), %r8 + movq 280(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 272(%rdi) + sbbq %r8, %r10 + movq 288(%rdx), %r9 + movq 288(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 280(%rdi) + sbbq %r9, %r8 + movq 296(%rdx), %r10 + movq 296(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 288(%rdi) + sbbq %r10, %r9 + movq 304(%rdx), %r8 + movq 304(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 296(%rdi) + sbbq %r8, %r10 + movq 312(%rdx), %r9 + movq 312(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 304(%rdi) + sbbq %r9, %r8 + movq 320(%rdx), %r10 + movq 320(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 312(%rdi) + sbbq %r10, %r9 + movq 328(%rdx), %r8 + movq 328(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 320(%rdi) + sbbq %r8, %r10 + movq 336(%rdx), %r9 + movq 336(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 328(%rdi) + sbbq %r9, %r8 + movq 344(%rdx), %r10 + movq 344(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 336(%rdi) + sbbq %r10, %r9 + movq 352(%rdx), %r8 + movq 352(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 344(%rdi) + sbbq %r8, %r10 + movq 360(%rdx), %r9 + movq 360(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 352(%rdi) + sbbq %r9, %r8 + movq 368(%rdx), %r10 + movq 368(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 360(%rdi) + sbbq %r10, %r9 + movq 376(%rdx), %r8 + movq 376(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 368(%rdi) + sbbq %r8, %r10 + movq %r10, 376(%rdi) + sbbq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_3072_cond_sub_avx2_48,.-sp_3072_cond_sub_avx2_48 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Compare a with b in constant time. * * a A single precision integer. @@ -26787,174 +26955,6 @@ _sp_3072_cmp_48: #ifndef __APPLE__ .size sp_3072_cmp_48,.-sp_3072_cmp_48 #endif /* __APPLE__ */ -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_sub_48 -.type sp_3072_sub_48,@function -.align 16 -sp_3072_sub_48: -#else -.section __TEXT,__text -.globl _sp_3072_sub_48 -.p2align 4 -_sp_3072_sub_48: -#endif /* __APPLE__ */ - movq (%rsi), %rcx - xorq %rax, %rax - subq (%rdx), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - sbbq 8(%rdx), %r8 - movq 16(%rsi), %rcx - movq %r8, 8(%rdi) - sbbq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - sbbq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - sbbq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - sbbq 40(%rdx), %r8 - movq 48(%rsi), %rcx - movq %r8, 40(%rdi) - sbbq 48(%rdx), %rcx - movq 56(%rsi), %r8 - movq %rcx, 48(%rdi) - sbbq 56(%rdx), %r8 - movq 64(%rsi), %rcx - movq %r8, 56(%rdi) - sbbq 64(%rdx), %rcx - movq 72(%rsi), %r8 - movq %rcx, 64(%rdi) - sbbq 72(%rdx), %r8 - movq 80(%rsi), %rcx - movq %r8, 72(%rdi) - sbbq 80(%rdx), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%rdi) - sbbq 88(%rdx), %r8 - movq 96(%rsi), %rcx - movq %r8, 88(%rdi) - sbbq 96(%rdx), %rcx - movq 104(%rsi), %r8 - movq %rcx, 96(%rdi) - sbbq 104(%rdx), %r8 - movq 112(%rsi), %rcx - movq %r8, 104(%rdi) - sbbq 112(%rdx), %rcx - movq 120(%rsi), %r8 - movq %rcx, 112(%rdi) - sbbq 120(%rdx), %r8 - movq 128(%rsi), %rcx - movq %r8, 120(%rdi) - sbbq 128(%rdx), %rcx - movq 136(%rsi), %r8 - movq %rcx, 128(%rdi) - sbbq 136(%rdx), %r8 - movq 144(%rsi), %rcx - movq %r8, 136(%rdi) - sbbq 144(%rdx), %rcx - movq 152(%rsi), %r8 - movq %rcx, 144(%rdi) - sbbq 152(%rdx), %r8 - movq 160(%rsi), %rcx - movq %r8, 152(%rdi) - sbbq 160(%rdx), %rcx - movq 168(%rsi), %r8 - movq %rcx, 160(%rdi) - sbbq 168(%rdx), %r8 - movq 176(%rsi), %rcx - movq %r8, 168(%rdi) - sbbq 176(%rdx), %rcx - movq 184(%rsi), %r8 - movq %rcx, 176(%rdi) - sbbq 184(%rdx), %r8 - movq 192(%rsi), %rcx - movq %r8, 184(%rdi) - sbbq 192(%rdx), %rcx - movq 200(%rsi), %r8 - movq %rcx, 192(%rdi) - sbbq 200(%rdx), %r8 - movq 208(%rsi), %rcx - movq %r8, 200(%rdi) - sbbq 208(%rdx), %rcx - movq 216(%rsi), %r8 - movq %rcx, 208(%rdi) - sbbq 216(%rdx), %r8 - movq 224(%rsi), %rcx - movq %r8, 216(%rdi) - sbbq 224(%rdx), %rcx - movq 232(%rsi), %r8 - movq %rcx, 224(%rdi) - sbbq 232(%rdx), %r8 - movq 240(%rsi), %rcx - movq %r8, 232(%rdi) - sbbq 240(%rdx), %rcx - movq 248(%rsi), %r8 - movq %rcx, 240(%rdi) - sbbq 248(%rdx), %r8 - movq 256(%rsi), %rcx - movq %r8, 248(%rdi) - sbbq 256(%rdx), %rcx - movq 264(%rsi), %r8 - movq %rcx, 256(%rdi) - sbbq 264(%rdx), %r8 - movq 272(%rsi), %rcx - movq %r8, 264(%rdi) - sbbq 272(%rdx), %rcx - movq 280(%rsi), %r8 - movq %rcx, 272(%rdi) - sbbq 280(%rdx), %r8 - movq 288(%rsi), %rcx - movq %r8, 280(%rdi) - sbbq 288(%rdx), %rcx - movq 296(%rsi), %r8 - movq %rcx, 288(%rdi) - sbbq 296(%rdx), %r8 - movq 304(%rsi), %rcx - movq %r8, 296(%rdi) - sbbq 304(%rdx), %rcx - movq 312(%rsi), %r8 - movq %rcx, 304(%rdi) - sbbq 312(%rdx), %r8 - movq 320(%rsi), %rcx - movq %r8, 312(%rdi) - sbbq 320(%rdx), %rcx - movq 328(%rsi), %r8 - movq %rcx, 320(%rdi) - sbbq 328(%rdx), %r8 - movq 336(%rsi), %rcx - movq %r8, 328(%rdi) - sbbq 336(%rdx), %rcx - movq 344(%rsi), %r8 - movq %rcx, 336(%rdi) - sbbq 344(%rdx), %r8 - movq 352(%rsi), %rcx - movq %r8, 344(%rdi) - sbbq 352(%rdx), %rcx - movq 360(%rsi), %r8 - movq %rcx, 352(%rdi) - sbbq 360(%rdx), %r8 - movq 368(%rsi), %rcx - movq %r8, 360(%rdi) - sbbq 368(%rdx), %rcx - movq 376(%rsi), %r8 - movq %rcx, 368(%rdi) - sbbq 376(%rdx), %r8 - movq %r8, 376(%rdi) - sbbq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_3072_sub_48,.-sp_3072_sub_48 -#endif /* __APPLE__ */ #ifdef 
HAVE_INTEL_AVX2 /* Reduce the number back to 3072 bits using Montgomery reduction. * @@ -35677,354 +35677,222 @@ L_4096_mont_loop_64: #ifndef __APPLE__ .size sp_4096_mont_reduce_64,.-sp_4096_mont_reduce_64 #endif /* __APPLE__ */ -#ifdef HAVE_INTEL_AVX2 -/* Conditionally subtract b from a using the mask m. - * m is -1 to subtract and 0 when not copying. +/* Sub b from a into r. (r = a - b) * - * r A single precision number representing condition subtract result. - * a A single precision number to subtract from. - * b A single precision number to subtract. - * m Mask value to apply. + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. */ #ifndef __APPLE__ .text -.globl sp_4096_cond_sub_avx2_64 -.type sp_4096_cond_sub_avx2_64,@function +.globl sp_4096_sub_64 +.type sp_4096_sub_64,@function .align 16 -sp_4096_cond_sub_avx2_64: +sp_4096_sub_64: #else .section __TEXT,__text -.globl _sp_4096_cond_sub_avx2_64 +.globl _sp_4096_sub_64 .p2align 4 -_sp_4096_cond_sub_avx2_64: +_sp_4096_sub_64: #endif /* __APPLE__ */ - movq $0x00, %rax - movq (%rdx), %r10 - movq (%rsi), %r8 - pextq %rcx, %r10, %r10 - subq %r10, %r8 - movq 8(%rdx), %r10 - movq 8(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, (%rdi) - sbbq %r10, %r9 - movq 16(%rdx), %r8 - movq 16(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 8(%rdi) - sbbq %r8, %r10 - movq 24(%rdx), %r9 + movq (%rsi), %rcx + xorq %rax, %rax + subq (%rdx), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + sbbq 8(%rdx), %r8 + movq 16(%rsi), %rcx + movq %r8, 8(%rdi) + sbbq 16(%rdx), %rcx movq 24(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 16(%rdi) - sbbq %r9, %r8 - movq 32(%rdx), %r10 - movq 32(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 16(%rdi) + sbbq 24(%rdx), %r8 + movq 32(%rsi), %rcx movq %r8, 24(%rdi) - sbbq %r10, %r9 - movq 40(%rdx), %r8 - movq 40(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 32(%rdi) - sbbq %r8, %r10 - movq 48(%rdx), %r9 - movq 48(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 40(%rdi) - sbbq %r9, %r8 - movq 56(%rdx), %r10 - movq 56(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 48(%rdi) - sbbq %r10, %r9 - movq 64(%rdx), %r8 - movq 64(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 56(%rdi) - sbbq %r8, %r10 - movq 72(%rdx), %r9 + sbbq 32(%rdx), %rcx + movq 40(%rsi), %r8 + movq %rcx, 32(%rdi) + sbbq 40(%rdx), %r8 + movq 48(%rsi), %rcx + movq %r8, 40(%rdi) + sbbq 48(%rdx), %rcx + movq 56(%rsi), %r8 + movq %rcx, 48(%rdi) + sbbq 56(%rdx), %r8 + movq 64(%rsi), %rcx + movq %r8, 56(%rdi) + sbbq 64(%rdx), %rcx movq 72(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 64(%rdi) - sbbq %r9, %r8 - movq 80(%rdx), %r10 - movq 80(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 64(%rdi) + sbbq 72(%rdx), %r8 + movq 80(%rsi), %rcx movq %r8, 72(%rdi) - sbbq %r10, %r9 - movq 88(%rdx), %r8 - movq 88(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 80(%rdi) - sbbq %r8, %r10 - movq 96(%rdx), %r9 - movq 96(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 88(%rdi) - sbbq %r9, %r8 - movq 104(%rdx), %r10 - movq 104(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 96(%rdi) - sbbq %r10, %r9 - movq 112(%rdx), %r8 - movq 112(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 104(%rdi) - sbbq %r8, %r10 - movq 120(%rdx), %r9 + sbbq 80(%rdx), %rcx + movq 88(%rsi), %r8 + movq %rcx, 80(%rdi) + sbbq 88(%rdx), %r8 + movq 96(%rsi), %rcx + movq %r8, 88(%rdi) + sbbq 96(%rdx), %rcx + movq 104(%rsi), %r8 + movq %rcx, 96(%rdi) + sbbq 104(%rdx), %r8 + movq 112(%rsi), %rcx + movq %r8, 104(%rdi) + sbbq 112(%rdx), %rcx movq 120(%rsi), %r8 - pextq %rcx, 
%r9, %r9 - movq %r10, 112(%rdi) - sbbq %r9, %r8 - movq 128(%rdx), %r10 - movq 128(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 112(%rdi) + sbbq 120(%rdx), %r8 + movq 128(%rsi), %rcx movq %r8, 120(%rdi) - sbbq %r10, %r9 - movq 136(%rdx), %r8 - movq 136(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 128(%rdi) - sbbq %r8, %r10 - movq 144(%rdx), %r9 - movq 144(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 136(%rdi) - sbbq %r9, %r8 - movq 152(%rdx), %r10 - movq 152(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 144(%rdi) - sbbq %r10, %r9 - movq 160(%rdx), %r8 - movq 160(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 152(%rdi) - sbbq %r8, %r10 - movq 168(%rdx), %r9 + sbbq 128(%rdx), %rcx + movq 136(%rsi), %r8 + movq %rcx, 128(%rdi) + sbbq 136(%rdx), %r8 + movq 144(%rsi), %rcx + movq %r8, 136(%rdi) + sbbq 144(%rdx), %rcx + movq 152(%rsi), %r8 + movq %rcx, 144(%rdi) + sbbq 152(%rdx), %r8 + movq 160(%rsi), %rcx + movq %r8, 152(%rdi) + sbbq 160(%rdx), %rcx movq 168(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 160(%rdi) - sbbq %r9, %r8 - movq 176(%rdx), %r10 - movq 176(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 160(%rdi) + sbbq 168(%rdx), %r8 + movq 176(%rsi), %rcx movq %r8, 168(%rdi) - sbbq %r10, %r9 - movq 184(%rdx), %r8 - movq 184(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 176(%rdi) - sbbq %r8, %r10 - movq 192(%rdx), %r9 - movq 192(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 184(%rdi) - sbbq %r9, %r8 - movq 200(%rdx), %r10 - movq 200(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 192(%rdi) - sbbq %r10, %r9 - movq 208(%rdx), %r8 - movq 208(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 200(%rdi) - sbbq %r8, %r10 - movq 216(%rdx), %r9 + sbbq 176(%rdx), %rcx + movq 184(%rsi), %r8 + movq %rcx, 176(%rdi) + sbbq 184(%rdx), %r8 + movq 192(%rsi), %rcx + movq %r8, 184(%rdi) + sbbq 192(%rdx), %rcx + movq 200(%rsi), %r8 + movq %rcx, 192(%rdi) + sbbq 200(%rdx), %r8 + movq 208(%rsi), %rcx + movq %r8, 200(%rdi) + sbbq 208(%rdx), %rcx movq 216(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 208(%rdi) - sbbq %r9, %r8 - movq 224(%rdx), %r10 - movq 224(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 208(%rdi) + sbbq 216(%rdx), %r8 + movq 224(%rsi), %rcx movq %r8, 216(%rdi) - sbbq %r10, %r9 - movq 232(%rdx), %r8 - movq 232(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 224(%rdi) - sbbq %r8, %r10 - movq 240(%rdx), %r9 - movq 240(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 232(%rdi) - sbbq %r9, %r8 - movq 248(%rdx), %r10 - movq 248(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 240(%rdi) - sbbq %r10, %r9 - movq 256(%rdx), %r8 - movq 256(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 248(%rdi) - sbbq %r8, %r10 - movq 264(%rdx), %r9 + sbbq 224(%rdx), %rcx + movq 232(%rsi), %r8 + movq %rcx, 224(%rdi) + sbbq 232(%rdx), %r8 + movq 240(%rsi), %rcx + movq %r8, 232(%rdi) + sbbq 240(%rdx), %rcx + movq 248(%rsi), %r8 + movq %rcx, 240(%rdi) + sbbq 248(%rdx), %r8 + movq 256(%rsi), %rcx + movq %r8, 248(%rdi) + sbbq 256(%rdx), %rcx movq 264(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 256(%rdi) - sbbq %r9, %r8 - movq 272(%rdx), %r10 - movq 272(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 256(%rdi) + sbbq 264(%rdx), %r8 + movq 272(%rsi), %rcx movq %r8, 264(%rdi) - sbbq %r10, %r9 - movq 280(%rdx), %r8 - movq 280(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 272(%rdi) - sbbq %r8, %r10 - movq 288(%rdx), %r9 - movq 288(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 280(%rdi) - sbbq %r9, %r8 - movq 296(%rdx), %r10 - movq 296(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 288(%rdi) - sbbq %r10, %r9 - movq 
304(%rdx), %r8 - movq 304(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 296(%rdi) - sbbq %r8, %r10 - movq 312(%rdx), %r9 + sbbq 272(%rdx), %rcx + movq 280(%rsi), %r8 + movq %rcx, 272(%rdi) + sbbq 280(%rdx), %r8 + movq 288(%rsi), %rcx + movq %r8, 280(%rdi) + sbbq 288(%rdx), %rcx + movq 296(%rsi), %r8 + movq %rcx, 288(%rdi) + sbbq 296(%rdx), %r8 + movq 304(%rsi), %rcx + movq %r8, 296(%rdi) + sbbq 304(%rdx), %rcx movq 312(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 304(%rdi) - sbbq %r9, %r8 - movq 320(%rdx), %r10 - movq 320(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 304(%rdi) + sbbq 312(%rdx), %r8 + movq 320(%rsi), %rcx movq %r8, 312(%rdi) - sbbq %r10, %r9 - movq 328(%rdx), %r8 - movq 328(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 320(%rdi) - sbbq %r8, %r10 - movq 336(%rdx), %r9 - movq 336(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 328(%rdi) - sbbq %r9, %r8 - movq 344(%rdx), %r10 - movq 344(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 336(%rdi) - sbbq %r10, %r9 - movq 352(%rdx), %r8 - movq 352(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 344(%rdi) - sbbq %r8, %r10 - movq 360(%rdx), %r9 + sbbq 320(%rdx), %rcx + movq 328(%rsi), %r8 + movq %rcx, 320(%rdi) + sbbq 328(%rdx), %r8 + movq 336(%rsi), %rcx + movq %r8, 328(%rdi) + sbbq 336(%rdx), %rcx + movq 344(%rsi), %r8 + movq %rcx, 336(%rdi) + sbbq 344(%rdx), %r8 + movq 352(%rsi), %rcx + movq %r8, 344(%rdi) + sbbq 352(%rdx), %rcx movq 360(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 352(%rdi) - sbbq %r9, %r8 - movq 368(%rdx), %r10 - movq 368(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 352(%rdi) + sbbq 360(%rdx), %r8 + movq 368(%rsi), %rcx movq %r8, 360(%rdi) - sbbq %r10, %r9 - movq 376(%rdx), %r8 - movq 376(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 368(%rdi) - sbbq %r8, %r10 - movq 384(%rdx), %r9 - movq 384(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 376(%rdi) - sbbq %r9, %r8 - movq 392(%rdx), %r10 - movq 392(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 384(%rdi) - sbbq %r10, %r9 - movq 400(%rdx), %r8 - movq 400(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 392(%rdi) - sbbq %r8, %r10 - movq 408(%rdx), %r9 + sbbq 368(%rdx), %rcx + movq 376(%rsi), %r8 + movq %rcx, 368(%rdi) + sbbq 376(%rdx), %r8 + movq 384(%rsi), %rcx + movq %r8, 376(%rdi) + sbbq 384(%rdx), %rcx + movq 392(%rsi), %r8 + movq %rcx, 384(%rdi) + sbbq 392(%rdx), %r8 + movq 400(%rsi), %rcx + movq %r8, 392(%rdi) + sbbq 400(%rdx), %rcx movq 408(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 400(%rdi) - sbbq %r9, %r8 - movq 416(%rdx), %r10 - movq 416(%rsi), %r9 - pextq %rcx, %r10, %r10 + movq %rcx, 400(%rdi) + sbbq 408(%rdx), %r8 + movq 416(%rsi), %rcx movq %r8, 408(%rdi) - sbbq %r10, %r9 - movq 424(%rdx), %r8 - movq 424(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 416(%rdi) - sbbq %r8, %r10 - movq 432(%rdx), %r9 - movq 432(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 424(%rdi) - sbbq %r9, %r8 - movq 440(%rdx), %r10 - movq 440(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 432(%rdi) - sbbq %r10, %r9 - movq 448(%rdx), %r8 - movq 448(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 440(%rdi) - sbbq %r8, %r10 - movq 456(%rdx), %r9 + sbbq 416(%rdx), %rcx + movq 424(%rsi), %r8 + movq %rcx, 416(%rdi) + sbbq 424(%rdx), %r8 + movq 432(%rsi), %rcx + movq %r8, 424(%rdi) + sbbq 432(%rdx), %rcx + movq 440(%rsi), %r8 + movq %rcx, 432(%rdi) + sbbq 440(%rdx), %r8 + movq 448(%rsi), %rcx + movq %r8, 440(%rdi) + sbbq 448(%rdx), %rcx movq 456(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 448(%rdi) - sbbq %r9, %r8 - movq 464(%rdx), %r10 - movq 464(%rsi), %r9 - pextq 
%rcx, %r10, %r10 + movq %rcx, 448(%rdi) + sbbq 456(%rdx), %r8 + movq 464(%rsi), %rcx movq %r8, 456(%rdi) - sbbq %r10, %r9 - movq 472(%rdx), %r8 - movq 472(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 464(%rdi) - sbbq %r8, %r10 - movq 480(%rdx), %r9 - movq 480(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 472(%rdi) - sbbq %r9, %r8 - movq 488(%rdx), %r10 - movq 488(%rsi), %r9 - pextq %rcx, %r10, %r10 - movq %r8, 480(%rdi) - sbbq %r10, %r9 - movq 496(%rdx), %r8 - movq 496(%rsi), %r10 - pextq %rcx, %r8, %r8 - movq %r9, 488(%rdi) - sbbq %r8, %r10 - movq 504(%rdx), %r9 + sbbq 464(%rdx), %rcx + movq 472(%rsi), %r8 + movq %rcx, 464(%rdi) + sbbq 472(%rdx), %r8 + movq 480(%rsi), %rcx + movq %r8, 472(%rdi) + sbbq 480(%rdx), %rcx + movq 488(%rsi), %r8 + movq %rcx, 480(%rdi) + sbbq 488(%rdx), %r8 + movq 496(%rsi), %rcx + movq %r8, 488(%rdi) + sbbq 496(%rdx), %rcx movq 504(%rsi), %r8 - pextq %rcx, %r9, %r9 - movq %r10, 496(%rdi) - sbbq %r9, %r8 + movq %rcx, 496(%rdi) + sbbq 504(%rdx), %r8 movq %r8, 504(%rdi) sbbq $0x00, %rax repz retq #ifndef __APPLE__ -.size sp_4096_cond_sub_avx2_64,.-sp_4096_cond_sub_avx2_64 +.size sp_4096_sub_64,.-sp_4096_sub_64 #endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 /* Mul a by digit b into r. (r = a * b) * @@ -36464,6 +36332,354 @@ _div_4096_word_asm_64: .size div_4096_word_asm_64,.-div_4096_word_asm_64 #endif /* __APPLE__ */ #endif /* _WIN64 */ +#ifdef HAVE_INTEL_AVX2 +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +#ifndef __APPLE__ +.text +.globl sp_4096_cond_sub_avx2_64 +.type sp_4096_cond_sub_avx2_64,@function +.align 16 +sp_4096_cond_sub_avx2_64: +#else +.section __TEXT,__text +.globl _sp_4096_cond_sub_avx2_64 +.p2align 4 +_sp_4096_cond_sub_avx2_64: +#endif /* __APPLE__ */ + movq $0x00, %rax + movq (%rdx), %r10 + movq (%rsi), %r8 + pextq %rcx, %r10, %r10 + subq %r10, %r8 + movq 8(%rdx), %r10 + movq 8(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, (%rdi) + sbbq %r10, %r9 + movq 16(%rdx), %r8 + movq 16(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 8(%rdi) + sbbq %r8, %r10 + movq 24(%rdx), %r9 + movq 24(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 16(%rdi) + sbbq %r9, %r8 + movq 32(%rdx), %r10 + movq 32(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 24(%rdi) + sbbq %r10, %r9 + movq 40(%rdx), %r8 + movq 40(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 32(%rdi) + sbbq %r8, %r10 + movq 48(%rdx), %r9 + movq 48(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 40(%rdi) + sbbq %r9, %r8 + movq 56(%rdx), %r10 + movq 56(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 48(%rdi) + sbbq %r10, %r9 + movq 64(%rdx), %r8 + movq 64(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 56(%rdi) + sbbq %r8, %r10 + movq 72(%rdx), %r9 + movq 72(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 64(%rdi) + sbbq %r9, %r8 + movq 80(%rdx), %r10 + movq 80(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 72(%rdi) + sbbq %r10, %r9 + movq 88(%rdx), %r8 + movq 88(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 80(%rdi) + sbbq %r8, %r10 + movq 96(%rdx), %r9 + movq 96(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 88(%rdi) + sbbq %r9, %r8 + movq 104(%rdx), %r10 + movq 104(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 96(%rdi) + sbbq %r10, %r9 + movq 112(%rdx), %r8 + movq 112(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 
104(%rdi) + sbbq %r8, %r10 + movq 120(%rdx), %r9 + movq 120(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 112(%rdi) + sbbq %r9, %r8 + movq 128(%rdx), %r10 + movq 128(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 120(%rdi) + sbbq %r10, %r9 + movq 136(%rdx), %r8 + movq 136(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 128(%rdi) + sbbq %r8, %r10 + movq 144(%rdx), %r9 + movq 144(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 136(%rdi) + sbbq %r9, %r8 + movq 152(%rdx), %r10 + movq 152(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 144(%rdi) + sbbq %r10, %r9 + movq 160(%rdx), %r8 + movq 160(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 152(%rdi) + sbbq %r8, %r10 + movq 168(%rdx), %r9 + movq 168(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 160(%rdi) + sbbq %r9, %r8 + movq 176(%rdx), %r10 + movq 176(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 168(%rdi) + sbbq %r10, %r9 + movq 184(%rdx), %r8 + movq 184(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 176(%rdi) + sbbq %r8, %r10 + movq 192(%rdx), %r9 + movq 192(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 184(%rdi) + sbbq %r9, %r8 + movq 200(%rdx), %r10 + movq 200(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 192(%rdi) + sbbq %r10, %r9 + movq 208(%rdx), %r8 + movq 208(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 200(%rdi) + sbbq %r8, %r10 + movq 216(%rdx), %r9 + movq 216(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 208(%rdi) + sbbq %r9, %r8 + movq 224(%rdx), %r10 + movq 224(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 216(%rdi) + sbbq %r10, %r9 + movq 232(%rdx), %r8 + movq 232(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 224(%rdi) + sbbq %r8, %r10 + movq 240(%rdx), %r9 + movq 240(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 232(%rdi) + sbbq %r9, %r8 + movq 248(%rdx), %r10 + movq 248(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 240(%rdi) + sbbq %r10, %r9 + movq 256(%rdx), %r8 + movq 256(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 248(%rdi) + sbbq %r8, %r10 + movq 264(%rdx), %r9 + movq 264(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 256(%rdi) + sbbq %r9, %r8 + movq 272(%rdx), %r10 + movq 272(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 264(%rdi) + sbbq %r10, %r9 + movq 280(%rdx), %r8 + movq 280(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 272(%rdi) + sbbq %r8, %r10 + movq 288(%rdx), %r9 + movq 288(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 280(%rdi) + sbbq %r9, %r8 + movq 296(%rdx), %r10 + movq 296(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 288(%rdi) + sbbq %r10, %r9 + movq 304(%rdx), %r8 + movq 304(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 296(%rdi) + sbbq %r8, %r10 + movq 312(%rdx), %r9 + movq 312(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 304(%rdi) + sbbq %r9, %r8 + movq 320(%rdx), %r10 + movq 320(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 312(%rdi) + sbbq %r10, %r9 + movq 328(%rdx), %r8 + movq 328(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 320(%rdi) + sbbq %r8, %r10 + movq 336(%rdx), %r9 + movq 336(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 328(%rdi) + sbbq %r9, %r8 + movq 344(%rdx), %r10 + movq 344(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 336(%rdi) + sbbq %r10, %r9 + movq 352(%rdx), %r8 + movq 352(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 344(%rdi) + sbbq %r8, %r10 + movq 360(%rdx), %r9 + movq 360(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 352(%rdi) + sbbq %r9, %r8 + movq 368(%rdx), %r10 + movq 368(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 360(%rdi) + sbbq %r10, %r9 + movq 376(%rdx), %r8 + movq 376(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 368(%rdi) + 
sbbq %r8, %r10 + movq 384(%rdx), %r9 + movq 384(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 376(%rdi) + sbbq %r9, %r8 + movq 392(%rdx), %r10 + movq 392(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 384(%rdi) + sbbq %r10, %r9 + movq 400(%rdx), %r8 + movq 400(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 392(%rdi) + sbbq %r8, %r10 + movq 408(%rdx), %r9 + movq 408(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 400(%rdi) + sbbq %r9, %r8 + movq 416(%rdx), %r10 + movq 416(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 408(%rdi) + sbbq %r10, %r9 + movq 424(%rdx), %r8 + movq 424(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 416(%rdi) + sbbq %r8, %r10 + movq 432(%rdx), %r9 + movq 432(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 424(%rdi) + sbbq %r9, %r8 + movq 440(%rdx), %r10 + movq 440(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 432(%rdi) + sbbq %r10, %r9 + movq 448(%rdx), %r8 + movq 448(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 440(%rdi) + sbbq %r8, %r10 + movq 456(%rdx), %r9 + movq 456(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 448(%rdi) + sbbq %r9, %r8 + movq 464(%rdx), %r10 + movq 464(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 456(%rdi) + sbbq %r10, %r9 + movq 472(%rdx), %r8 + movq 472(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 464(%rdi) + sbbq %r8, %r10 + movq 480(%rdx), %r9 + movq 480(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 472(%rdi) + sbbq %r9, %r8 + movq 488(%rdx), %r10 + movq 488(%rsi), %r9 + pextq %rcx, %r10, %r10 + movq %r8, 480(%rdi) + sbbq %r10, %r9 + movq 496(%rdx), %r8 + movq 496(%rsi), %r10 + pextq %rcx, %r8, %r8 + movq %r9, 488(%rdi) + sbbq %r8, %r10 + movq 504(%rdx), %r9 + movq 504(%rsi), %r8 + pextq %rcx, %r9, %r9 + movq %r10, 496(%rdi) + sbbq %r9, %r8 + movq %r8, 504(%rdi) + sbbq $0x00, %rax + repz retq +#ifndef __APPLE__ +.size sp_4096_cond_sub_avx2_64,.-sp_4096_cond_sub_avx2_64 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Compare a with b in constant time. * * a A single precision integer. @@ -37004,222 +37220,6 @@ _sp_4096_cmp_64: #ifndef __APPLE__ .size sp_4096_cmp_64,.-sp_4096_cmp_64 #endif /* __APPLE__ */ -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_4096_sub_64 -.type sp_4096_sub_64,@function -.align 16 -sp_4096_sub_64: -#else -.section __TEXT,__text -.globl _sp_4096_sub_64 -.p2align 4 -_sp_4096_sub_64: -#endif /* __APPLE__ */ - movq (%rsi), %rcx - xorq %rax, %rax - subq (%rdx), %rcx - movq 8(%rsi), %r8 - movq %rcx, (%rdi) - sbbq 8(%rdx), %r8 - movq 16(%rsi), %rcx - movq %r8, 8(%rdi) - sbbq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - sbbq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - sbbq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - sbbq 40(%rdx), %r8 - movq 48(%rsi), %rcx - movq %r8, 40(%rdi) - sbbq 48(%rdx), %rcx - movq 56(%rsi), %r8 - movq %rcx, 48(%rdi) - sbbq 56(%rdx), %r8 - movq 64(%rsi), %rcx - movq %r8, 56(%rdi) - sbbq 64(%rdx), %rcx - movq 72(%rsi), %r8 - movq %rcx, 64(%rdi) - sbbq 72(%rdx), %r8 - movq 80(%rsi), %rcx - movq %r8, 72(%rdi) - sbbq 80(%rdx), %rcx - movq 88(%rsi), %r8 - movq %rcx, 80(%rdi) - sbbq 88(%rdx), %r8 - movq 96(%rsi), %rcx - movq %r8, 88(%rdi) - sbbq 96(%rdx), %rcx - movq 104(%rsi), %r8 - movq %rcx, 96(%rdi) - sbbq 104(%rdx), %r8 - movq 112(%rsi), %rcx - movq %r8, 104(%rdi) - sbbq 112(%rdx), %rcx - movq 120(%rsi), %r8 - movq %rcx, 112(%rdi) - sbbq 120(%rdx), %r8 - movq 128(%rsi), %rcx - movq %r8, 120(%rdi) - sbbq 128(%rdx), %rcx - movq 136(%rsi), %r8 - movq %rcx, 128(%rdi) - sbbq 136(%rdx), %r8 - movq 144(%rsi), %rcx - movq %r8, 136(%rdi) - sbbq 144(%rdx), %rcx - movq 152(%rsi), %r8 - movq %rcx, 144(%rdi) - sbbq 152(%rdx), %r8 - movq 160(%rsi), %rcx - movq %r8, 152(%rdi) - sbbq 160(%rdx), %rcx - movq 168(%rsi), %r8 - movq %rcx, 160(%rdi) - sbbq 168(%rdx), %r8 - movq 176(%rsi), %rcx - movq %r8, 168(%rdi) - sbbq 176(%rdx), %rcx - movq 184(%rsi), %r8 - movq %rcx, 176(%rdi) - sbbq 184(%rdx), %r8 - movq 192(%rsi), %rcx - movq %r8, 184(%rdi) - sbbq 192(%rdx), %rcx - movq 200(%rsi), %r8 - movq %rcx, 192(%rdi) - sbbq 200(%rdx), %r8 - movq 208(%rsi), %rcx - movq %r8, 200(%rdi) - sbbq 208(%rdx), %rcx - movq 216(%rsi), %r8 - movq %rcx, 208(%rdi) - sbbq 216(%rdx), %r8 - movq 224(%rsi), %rcx - movq %r8, 216(%rdi) - sbbq 224(%rdx), %rcx - movq 232(%rsi), %r8 - movq %rcx, 224(%rdi) - sbbq 232(%rdx), %r8 - movq 240(%rsi), %rcx - movq %r8, 232(%rdi) - sbbq 240(%rdx), %rcx - movq 248(%rsi), %r8 - movq %rcx, 240(%rdi) - sbbq 248(%rdx), %r8 - movq 256(%rsi), %rcx - movq %r8, 248(%rdi) - sbbq 256(%rdx), %rcx - movq 264(%rsi), %r8 - movq %rcx, 256(%rdi) - sbbq 264(%rdx), %r8 - movq 272(%rsi), %rcx - movq %r8, 264(%rdi) - sbbq 272(%rdx), %rcx - movq 280(%rsi), %r8 - movq %rcx, 272(%rdi) - sbbq 280(%rdx), %r8 - movq 288(%rsi), %rcx - movq %r8, 280(%rdi) - sbbq 288(%rdx), %rcx - movq 296(%rsi), %r8 - movq %rcx, 288(%rdi) - sbbq 296(%rdx), %r8 - movq 304(%rsi), %rcx - movq %r8, 296(%rdi) - sbbq 304(%rdx), %rcx - movq 312(%rsi), %r8 - movq %rcx, 304(%rdi) - sbbq 312(%rdx), %r8 - movq 320(%rsi), %rcx - movq %r8, 312(%rdi) - sbbq 320(%rdx), %rcx - movq 328(%rsi), %r8 - movq %rcx, 320(%rdi) - sbbq 328(%rdx), %r8 - movq 336(%rsi), %rcx - movq %r8, 328(%rdi) - sbbq 336(%rdx), %rcx - movq 344(%rsi), %r8 - movq %rcx, 336(%rdi) - sbbq 344(%rdx), %r8 - movq 352(%rsi), %rcx - movq %r8, 344(%rdi) - sbbq 352(%rdx), %rcx - movq 360(%rsi), %r8 - movq %rcx, 352(%rdi) - sbbq 360(%rdx), %r8 - movq 368(%rsi), %rcx - movq %r8, 360(%rdi) - sbbq 368(%rdx), %rcx - movq 376(%rsi), %r8 - movq %rcx, 368(%rdi) - sbbq 376(%rdx), %r8 - movq 384(%rsi), %rcx - movq %r8, 376(%rdi) - sbbq 384(%rdx), %rcx - movq 392(%rsi), %r8 - movq %rcx, 384(%rdi) - sbbq 392(%rdx), %r8 - movq 
400(%rsi), %rcx - movq %r8, 392(%rdi) - sbbq 400(%rdx), %rcx - movq 408(%rsi), %r8 - movq %rcx, 400(%rdi) - sbbq 408(%rdx), %r8 - movq 416(%rsi), %rcx - movq %r8, 408(%rdi) - sbbq 416(%rdx), %rcx - movq 424(%rsi), %r8 - movq %rcx, 416(%rdi) - sbbq 424(%rdx), %r8 - movq 432(%rsi), %rcx - movq %r8, 424(%rdi) - sbbq 432(%rdx), %rcx - movq 440(%rsi), %r8 - movq %rcx, 432(%rdi) - sbbq 440(%rdx), %r8 - movq 448(%rsi), %rcx - movq %r8, 440(%rdi) - sbbq 448(%rdx), %rcx - movq 456(%rsi), %r8 - movq %rcx, 448(%rdi) - sbbq 456(%rdx), %r8 - movq 464(%rsi), %rcx - movq %r8, 456(%rdi) - sbbq 464(%rdx), %rcx - movq 472(%rsi), %r8 - movq %rcx, 464(%rdi) - sbbq 472(%rdx), %r8 - movq 480(%rsi), %rcx - movq %r8, 472(%rdi) - sbbq 480(%rdx), %rcx - movq 488(%rsi), %r8 - movq %rcx, 480(%rdi) - sbbq 488(%rdx), %r8 - movq 496(%rsi), %rcx - movq %r8, 488(%rdi) - sbbq 496(%rdx), %rcx - movq 504(%rsi), %r8 - movq %rcx, 496(%rdi) - sbbq 504(%rdx), %r8 - movq %r8, 504(%rdi) - sbbq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_4096_sub_64,.-sp_4096_sub_64 -#endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 4096 bits using Montgomery reduction. * diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index f56accebd..b4574864e 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -10780,185 +10780,115 @@ ENDIF ret sp_2048_mont_reduce_32 ENDP _text ENDS -IFDEF HAVE_INTEL_AVX2 -; /* Conditionally subtract b from a using the mask m. -; * m is -1 to subtract and 0 when not copying. +; /* Sub b from a into r. (r = a - b) ; * -; * r A single precision number representing condition subtract result. -; * a A single precision number to subtract from. -; * b A single precision number to subtract. -; * m Mask value to apply. +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. 
; */ _text SEGMENT READONLY PARA -sp_2048_cond_sub_avx2_32 PROC - push r12 - mov rax, 0 - mov r12, QWORD PTR [r8] - mov r10, QWORD PTR [rdx] - pext r12, r12, r9 - sub r10, r12 - mov r12, QWORD PTR [r8+8] - mov r11, QWORD PTR [rdx+8] - pext r12, r12, r9 - mov QWORD PTR [rcx], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+16] - mov r12, QWORD PTR [rdx+16] - pext r10, r10, r9 - mov QWORD PTR [rcx+8], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+24] +sp_2048_sub_32 PROC + mov r9, QWORD PTR [rdx] + xor rax, rax + sub r9, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r9 + sbb r10, QWORD PTR [r8+8] + mov r9, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r10 + sbb r9, QWORD PTR [r8+16] mov r10, QWORD PTR [rdx+24] - pext r11, r11, r9 - mov QWORD PTR [rcx+16], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+32] - mov r11, QWORD PTR [rdx+32] - pext r12, r12, r9 + mov QWORD PTR [rcx+16], r9 + sbb r10, QWORD PTR [r8+24] + mov r9, QWORD PTR [rdx+32] mov QWORD PTR [rcx+24], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+40] - mov r12, QWORD PTR [rdx+40] - pext r10, r10, r9 - mov QWORD PTR [rcx+32], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+48] - pext r11, r11, r9 - mov QWORD PTR [rcx+40], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+56] - mov r11, QWORD PTR [rdx+56] - pext r12, r12, r9 - mov QWORD PTR [rcx+48], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+64] - mov r12, QWORD PTR [rdx+64] - pext r10, r10, r9 - mov QWORD PTR [rcx+56], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+72] + sbb r9, QWORD PTR [r8+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r9 + sbb r10, QWORD PTR [r8+40] + mov r9, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r10 + sbb r9, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r9 + sbb r10, QWORD PTR [r8+56] + mov r9, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r10 + sbb r9, QWORD PTR [r8+64] mov r10, QWORD PTR [rdx+72] - pext r11, r11, r9 - mov QWORD PTR [rcx+64], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+80] - mov r11, QWORD PTR [rdx+80] - pext r12, r12, r9 + mov QWORD PTR [rcx+64], r9 + sbb r10, QWORD PTR [r8+72] + mov r9, QWORD PTR [rdx+80] mov QWORD PTR [rcx+72], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+88] - mov r12, QWORD PTR [rdx+88] - pext r10, r10, r9 - mov QWORD PTR [rcx+80], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+96] - pext r11, r11, r9 - mov QWORD PTR [rcx+88], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+104] - mov r11, QWORD PTR [rdx+104] - pext r12, r12, r9 - mov QWORD PTR [rcx+96], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+112] - mov r12, QWORD PTR [rdx+112] - pext r10, r10, r9 - mov QWORD PTR [rcx+104], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+120] + sbb r9, QWORD PTR [r8+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r9 + sbb r10, QWORD PTR [r8+88] + mov r9, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r10 + sbb r9, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r9 + sbb r10, QWORD PTR [r8+104] + mov r9, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r10 + sbb r9, QWORD PTR [r8+112] mov r10, QWORD PTR [rdx+120] - pext r11, r11, r9 - mov QWORD PTR [rcx+112], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+128] - mov r11, QWORD PTR [rdx+128] - pext r12, r12, r9 + mov QWORD PTR [rcx+112], r9 + sbb r10, QWORD PTR [r8+120] + mov r9, QWORD PTR [rdx+128] mov QWORD PTR [rcx+120], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+136] - mov r12, QWORD PTR [rdx+136] - pext r10, r10, r9 - mov QWORD PTR [rcx+128], r11 - sbb 
r12, r10 - mov r11, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+144] - pext r11, r11, r9 - mov QWORD PTR [rcx+136], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+152] - mov r11, QWORD PTR [rdx+152] - pext r12, r12, r9 - mov QWORD PTR [rcx+144], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+160] - mov r12, QWORD PTR [rdx+160] - pext r10, r10, r9 - mov QWORD PTR [rcx+152], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+168] + sbb r9, QWORD PTR [r8+128] + mov r10, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r9 + sbb r10, QWORD PTR [r8+136] + mov r9, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r10 + sbb r9, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r9 + sbb r10, QWORD PTR [r8+152] + mov r9, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r10 + sbb r9, QWORD PTR [r8+160] mov r10, QWORD PTR [rdx+168] - pext r11, r11, r9 - mov QWORD PTR [rcx+160], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+176] - mov r11, QWORD PTR [rdx+176] - pext r12, r12, r9 + mov QWORD PTR [rcx+160], r9 + sbb r10, QWORD PTR [r8+168] + mov r9, QWORD PTR [rdx+176] mov QWORD PTR [rcx+168], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+184] - mov r12, QWORD PTR [rdx+184] - pext r10, r10, r9 - mov QWORD PTR [rcx+176], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+192] - pext r11, r11, r9 - mov QWORD PTR [rcx+184], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+200] - mov r11, QWORD PTR [rdx+200] - pext r12, r12, r9 - mov QWORD PTR [rcx+192], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+208] - mov r12, QWORD PTR [rdx+208] - pext r10, r10, r9 - mov QWORD PTR [rcx+200], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+216] + sbb r9, QWORD PTR [r8+176] + mov r10, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r9 + sbb r10, QWORD PTR [r8+184] + mov r9, QWORD PTR [rdx+192] + mov QWORD PTR [rcx+184], r10 + sbb r9, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+200] + mov QWORD PTR [rcx+192], r9 + sbb r10, QWORD PTR [r8+200] + mov r9, QWORD PTR [rdx+208] + mov QWORD PTR [rcx+200], r10 + sbb r9, QWORD PTR [r8+208] mov r10, QWORD PTR [rdx+216] - pext r11, r11, r9 - mov QWORD PTR [rcx+208], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+224] - mov r11, QWORD PTR [rdx+224] - pext r12, r12, r9 + mov QWORD PTR [rcx+208], r9 + sbb r10, QWORD PTR [r8+216] + mov r9, QWORD PTR [rdx+224] mov QWORD PTR [rcx+216], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+232] - mov r12, QWORD PTR [rdx+232] - pext r10, r10, r9 - mov QWORD PTR [rcx+224], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+240] - pext r11, r11, r9 - mov QWORD PTR [rcx+232], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+248] - mov r11, QWORD PTR [rdx+248] - pext r12, r12, r9 - mov QWORD PTR [rcx+240], r10 - sbb r11, r12 - mov QWORD PTR [rcx+248], r11 + sbb r9, QWORD PTR [r8+224] + mov r10, QWORD PTR [rdx+232] + mov QWORD PTR [rcx+224], r9 + sbb r10, QWORD PTR [r8+232] + mov r9, QWORD PTR [rdx+240] + mov QWORD PTR [rcx+232], r10 + sbb r9, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+240], r9 + sbb r10, QWORD PTR [r8+248] + mov QWORD PTR [rcx+248], r10 sbb rax, 0 - pop r12 ret -sp_2048_cond_sub_avx2_32 ENDP +sp_2048_sub_32 ENDP _text ENDS -ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Mul a by digit b into r. (r = a * b) ; * @@ -11188,6 +11118,185 @@ div_2048_word_asm_32 PROC div_2048_word_asm_32 ENDP _text ENDS ENDIF +IFDEF HAVE_INTEL_AVX2 +; /* Conditionally subtract b from a using the mask m. +; * m is -1 to subtract and 0 when not copying. 
+; * +; * r A single precision number representing condition subtract result. +; * a A single precision number to subtract from. +; * b A single precision number to subtract. +; * m Mask value to apply. +; */ +_text SEGMENT READONLY PARA +sp_2048_cond_sub_avx2_32 PROC + push r12 + mov rax, 0 + mov r12, QWORD PTR [r8] + mov r10, QWORD PTR [rdx] + pext r12, r12, r9 + sub r10, r12 + mov r12, QWORD PTR [r8+8] + mov r11, QWORD PTR [rdx+8] + pext r12, r12, r9 + mov QWORD PTR [rcx], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+16] + mov r12, QWORD PTR [rdx+16] + pext r10, r10, r9 + mov QWORD PTR [rcx+8], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+24] + mov r10, QWORD PTR [rdx+24] + pext r11, r11, r9 + mov QWORD PTR [rcx+16], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+32] + mov r11, QWORD PTR [rdx+32] + pext r12, r12, r9 + mov QWORD PTR [rcx+24], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+40] + mov r12, QWORD PTR [rdx+40] + pext r10, r10, r9 + mov QWORD PTR [rcx+32], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+48] + pext r11, r11, r9 + mov QWORD PTR [rcx+40], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+56] + mov r11, QWORD PTR [rdx+56] + pext r12, r12, r9 + mov QWORD PTR [rcx+48], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+64] + mov r12, QWORD PTR [rdx+64] + pext r10, r10, r9 + mov QWORD PTR [rcx+56], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+72] + mov r10, QWORD PTR [rdx+72] + pext r11, r11, r9 + mov QWORD PTR [rcx+64], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+80] + mov r11, QWORD PTR [rdx+80] + pext r12, r12, r9 + mov QWORD PTR [rcx+72], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+88] + mov r12, QWORD PTR [rdx+88] + pext r10, r10, r9 + mov QWORD PTR [rcx+80], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+96] + pext r11, r11, r9 + mov QWORD PTR [rcx+88], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+104] + mov r11, QWORD PTR [rdx+104] + pext r12, r12, r9 + mov QWORD PTR [rcx+96], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+112] + mov r12, QWORD PTR [rdx+112] + pext r10, r10, r9 + mov QWORD PTR [rcx+104], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+120] + mov r10, QWORD PTR [rdx+120] + pext r11, r11, r9 + mov QWORD PTR [rcx+112], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+128] + mov r11, QWORD PTR [rdx+128] + pext r12, r12, r9 + mov QWORD PTR [rcx+120], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+136] + mov r12, QWORD PTR [rdx+136] + pext r10, r10, r9 + mov QWORD PTR [rcx+128], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+144] + pext r11, r11, r9 + mov QWORD PTR [rcx+136], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+152] + mov r11, QWORD PTR [rdx+152] + pext r12, r12, r9 + mov QWORD PTR [rcx+144], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+160] + mov r12, QWORD PTR [rdx+160] + pext r10, r10, r9 + mov QWORD PTR [rcx+152], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+168] + mov r10, QWORD PTR [rdx+168] + pext r11, r11, r9 + mov QWORD PTR [rcx+160], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+176] + mov r11, QWORD PTR [rdx+176] + pext r12, r12, r9 + mov QWORD PTR [rcx+168], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+184] + mov r12, QWORD PTR [rdx+184] + pext r10, r10, r9 + mov QWORD PTR [rcx+176], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+192] + pext r11, r11, r9 + mov QWORD PTR [rcx+184], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+200] + mov r11, QWORD PTR [rdx+200] + pext r12, r12, r9 + mov QWORD PTR [rcx+192], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+208] + mov r12, 
QWORD PTR [rdx+208] + pext r10, r10, r9 + mov QWORD PTR [rcx+200], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+216] + mov r10, QWORD PTR [rdx+216] + pext r11, r11, r9 + mov QWORD PTR [rcx+208], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+224] + mov r11, QWORD PTR [rdx+224] + pext r12, r12, r9 + mov QWORD PTR [rcx+216], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+232] + mov r12, QWORD PTR [rdx+232] + pext r10, r10, r9 + mov QWORD PTR [rcx+224], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+240] + pext r11, r11, r9 + mov QWORD PTR [rcx+232], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+248] + mov r11, QWORD PTR [rdx+248] + pext r12, r12, r9 + mov QWORD PTR [rcx+240], r10 + sbb r11, r12 + mov QWORD PTR [rcx+248], r11 + sbb rax, 0 + pop r12 + ret +sp_2048_cond_sub_avx2_32 ENDP +_text ENDS +ENDIF ; /* Compare a with b in constant time. ; * ; * a A single precision integer. @@ -11463,115 +11572,6 @@ sp_2048_cmp_32 PROC ret sp_2048_cmp_32 ENDP _text ENDS -; /* Sub b from a into r. (r = a - b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_sub_32 PROC - mov r9, QWORD PTR [rdx] - xor rax, rax - sub r9, QWORD PTR [r8] - mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 - sbb r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r10 - sbb r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - sbb r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - sbb r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - sbb r10, QWORD PTR [r8+40] - mov r9, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r10 - sbb r9, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r9 - sbb r10, QWORD PTR [r8+56] - mov r9, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r10 - sbb r9, QWORD PTR [r8+64] - mov r10, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r9 - sbb r10, QWORD PTR [r8+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r10 - sbb r9, QWORD PTR [r8+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r9 - sbb r10, QWORD PTR [r8+88] - mov r9, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r10 - sbb r9, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r9 - sbb r10, QWORD PTR [r8+104] - mov r9, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r10 - sbb r9, QWORD PTR [r8+112] - mov r10, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r9 - sbb r10, QWORD PTR [r8+120] - mov r9, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r10 - sbb r9, QWORD PTR [r8+128] - mov r10, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r9 - sbb r10, QWORD PTR [r8+136] - mov r9, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r10 - sbb r9, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r9 - sbb r10, QWORD PTR [r8+152] - mov r9, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r10 - sbb r9, QWORD PTR [r8+160] - mov r10, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r9 - sbb r10, QWORD PTR [r8+168] - mov r9, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r10 - sbb r9, QWORD PTR [r8+176] - mov r10, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r9 - sbb r10, QWORD PTR [r8+184] - mov r9, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r10 - sbb r9, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r9 - sbb r10, QWORD PTR [r8+200] - mov r9, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r10 - sbb r9, 
QWORD PTR [r8+208] - mov r10, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r9 - sbb r10, QWORD PTR [r8+216] - mov r9, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r10 - sbb r9, QWORD PTR [r8+224] - mov r10, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r9 - sbb r10, QWORD PTR [r8+232] - mov r9, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r10 - sbb r9, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r9 - sbb r10, QWORD PTR [r8+248] - mov QWORD PTR [rcx+248], r10 - sbb rax, 0 - ret -sp_2048_sub_32 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 2048 bits using Montgomery reduction. ; * @@ -24990,265 +24990,163 @@ ENDIF ret sp_3072_mont_reduce_48 ENDP _text ENDS -IFDEF HAVE_INTEL_AVX2 -; /* Conditionally subtract b from a using the mask m. -; * m is -1 to subtract and 0 when not copying. +; /* Sub b from a into r. (r = a - b) ; * -; * r A single precision number representing condition subtract result. -; * a A single precision number to subtract from. -; * b A single precision number to subtract. -; * m Mask value to apply. +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. ; */ _text SEGMENT READONLY PARA -sp_3072_cond_sub_avx2_48 PROC - push r12 - mov rax, 0 - mov r12, QWORD PTR [r8] - mov r10, QWORD PTR [rdx] - pext r12, r12, r9 - sub r10, r12 - mov r12, QWORD PTR [r8+8] - mov r11, QWORD PTR [rdx+8] - pext r12, r12, r9 - mov QWORD PTR [rcx], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+16] - mov r12, QWORD PTR [rdx+16] - pext r10, r10, r9 - mov QWORD PTR [rcx+8], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+24] +sp_3072_sub_48 PROC + mov r9, QWORD PTR [rdx] + xor rax, rax + sub r9, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r9 + sbb r10, QWORD PTR [r8+8] + mov r9, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r10 + sbb r9, QWORD PTR [r8+16] mov r10, QWORD PTR [rdx+24] - pext r11, r11, r9 - mov QWORD PTR [rcx+16], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+32] - mov r11, QWORD PTR [rdx+32] - pext r12, r12, r9 + mov QWORD PTR [rcx+16], r9 + sbb r10, QWORD PTR [r8+24] + mov r9, QWORD PTR [rdx+32] mov QWORD PTR [rcx+24], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+40] - mov r12, QWORD PTR [rdx+40] - pext r10, r10, r9 - mov QWORD PTR [rcx+32], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+48] - pext r11, r11, r9 - mov QWORD PTR [rcx+40], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+56] - mov r11, QWORD PTR [rdx+56] - pext r12, r12, r9 - mov QWORD PTR [rcx+48], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+64] - mov r12, QWORD PTR [rdx+64] - pext r10, r10, r9 - mov QWORD PTR [rcx+56], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+72] + sbb r9, QWORD PTR [r8+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r9 + sbb r10, QWORD PTR [r8+40] + mov r9, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r10 + sbb r9, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r9 + sbb r10, QWORD PTR [r8+56] + mov r9, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r10 + sbb r9, QWORD PTR [r8+64] mov r10, QWORD PTR [rdx+72] - pext r11, r11, r9 - mov QWORD PTR [rcx+64], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+80] - mov r11, QWORD PTR [rdx+80] - pext r12, r12, r9 + mov QWORD PTR [rcx+64], r9 + sbb r10, QWORD PTR [r8+72] + mov r9, QWORD PTR [rdx+80] mov QWORD PTR [rcx+72], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+88] - mov r12, QWORD PTR [rdx+88] - pext r10, r10, r9 - mov QWORD PTR [rcx+80], r11 - sbb r12, r10 - mov r11, QWORD 
PTR [r8+96] - mov r10, QWORD PTR [rdx+96] - pext r11, r11, r9 - mov QWORD PTR [rcx+88], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+104] - mov r11, QWORD PTR [rdx+104] - pext r12, r12, r9 - mov QWORD PTR [rcx+96], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+112] - mov r12, QWORD PTR [rdx+112] - pext r10, r10, r9 - mov QWORD PTR [rcx+104], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+120] + sbb r9, QWORD PTR [r8+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r9 + sbb r10, QWORD PTR [r8+88] + mov r9, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r10 + sbb r9, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r9 + sbb r10, QWORD PTR [r8+104] + mov r9, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r10 + sbb r9, QWORD PTR [r8+112] mov r10, QWORD PTR [rdx+120] - pext r11, r11, r9 - mov QWORD PTR [rcx+112], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+128] - mov r11, QWORD PTR [rdx+128] - pext r12, r12, r9 + mov QWORD PTR [rcx+112], r9 + sbb r10, QWORD PTR [r8+120] + mov r9, QWORD PTR [rdx+128] mov QWORD PTR [rcx+120], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+136] - mov r12, QWORD PTR [rdx+136] - pext r10, r10, r9 - mov QWORD PTR [rcx+128], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+144] - pext r11, r11, r9 - mov QWORD PTR [rcx+136], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+152] - mov r11, QWORD PTR [rdx+152] - pext r12, r12, r9 - mov QWORD PTR [rcx+144], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+160] - mov r12, QWORD PTR [rdx+160] - pext r10, r10, r9 - mov QWORD PTR [rcx+152], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+168] + sbb r9, QWORD PTR [r8+128] + mov r10, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r9 + sbb r10, QWORD PTR [r8+136] + mov r9, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r10 + sbb r9, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r9 + sbb r10, QWORD PTR [r8+152] + mov r9, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r10 + sbb r9, QWORD PTR [r8+160] mov r10, QWORD PTR [rdx+168] - pext r11, r11, r9 - mov QWORD PTR [rcx+160], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+176] - mov r11, QWORD PTR [rdx+176] - pext r12, r12, r9 + mov QWORD PTR [rcx+160], r9 + sbb r10, QWORD PTR [r8+168] + mov r9, QWORD PTR [rdx+176] mov QWORD PTR [rcx+168], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+184] - mov r12, QWORD PTR [rdx+184] - pext r10, r10, r9 - mov QWORD PTR [rcx+176], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+192] - pext r11, r11, r9 - mov QWORD PTR [rcx+184], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+200] - mov r11, QWORD PTR [rdx+200] - pext r12, r12, r9 - mov QWORD PTR [rcx+192], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+208] - mov r12, QWORD PTR [rdx+208] - pext r10, r10, r9 - mov QWORD PTR [rcx+200], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+216] + sbb r9, QWORD PTR [r8+176] + mov r10, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r9 + sbb r10, QWORD PTR [r8+184] + mov r9, QWORD PTR [rdx+192] + mov QWORD PTR [rcx+184], r10 + sbb r9, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+200] + mov QWORD PTR [rcx+192], r9 + sbb r10, QWORD PTR [r8+200] + mov r9, QWORD PTR [rdx+208] + mov QWORD PTR [rcx+200], r10 + sbb r9, QWORD PTR [r8+208] mov r10, QWORD PTR [rdx+216] - pext r11, r11, r9 - mov QWORD PTR [rcx+208], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+224] - mov r11, QWORD PTR [rdx+224] - pext r12, r12, r9 + mov QWORD PTR [rcx+208], r9 + sbb r10, QWORD PTR [r8+216] + mov r9, QWORD PTR [rdx+224] mov QWORD PTR [rcx+216], r10 - sbb 
r11, r12 - mov r10, QWORD PTR [r8+232] - mov r12, QWORD PTR [rdx+232] - pext r10, r10, r9 - mov QWORD PTR [rcx+224], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+240] - pext r11, r11, r9 - mov QWORD PTR [rcx+232], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+248] - mov r11, QWORD PTR [rdx+248] - pext r12, r12, r9 - mov QWORD PTR [rcx+240], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+256] - mov r12, QWORD PTR [rdx+256] - pext r10, r10, r9 - mov QWORD PTR [rcx+248], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+264] + sbb r9, QWORD PTR [r8+224] + mov r10, QWORD PTR [rdx+232] + mov QWORD PTR [rcx+224], r9 + sbb r10, QWORD PTR [r8+232] + mov r9, QWORD PTR [rdx+240] + mov QWORD PTR [rcx+232], r10 + sbb r9, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+240], r9 + sbb r10, QWORD PTR [r8+248] + mov r9, QWORD PTR [rdx+256] + mov QWORD PTR [rcx+248], r10 + sbb r9, QWORD PTR [r8+256] mov r10, QWORD PTR [rdx+264] - pext r11, r11, r9 - mov QWORD PTR [rcx+256], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+272] - mov r11, QWORD PTR [rdx+272] - pext r12, r12, r9 + mov QWORD PTR [rcx+256], r9 + sbb r10, QWORD PTR [r8+264] + mov r9, QWORD PTR [rdx+272] mov QWORD PTR [rcx+264], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+280] - mov r12, QWORD PTR [rdx+280] - pext r10, r10, r9 - mov QWORD PTR [rcx+272], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+288] - mov r10, QWORD PTR [rdx+288] - pext r11, r11, r9 - mov QWORD PTR [rcx+280], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+296] - mov r11, QWORD PTR [rdx+296] - pext r12, r12, r9 - mov QWORD PTR [rcx+288], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+304] - mov r12, QWORD PTR [rdx+304] - pext r10, r10, r9 - mov QWORD PTR [rcx+296], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+312] + sbb r9, QWORD PTR [r8+272] + mov r10, QWORD PTR [rdx+280] + mov QWORD PTR [rcx+272], r9 + sbb r10, QWORD PTR [r8+280] + mov r9, QWORD PTR [rdx+288] + mov QWORD PTR [rcx+280], r10 + sbb r9, QWORD PTR [r8+288] + mov r10, QWORD PTR [rdx+296] + mov QWORD PTR [rcx+288], r9 + sbb r10, QWORD PTR [r8+296] + mov r9, QWORD PTR [rdx+304] + mov QWORD PTR [rcx+296], r10 + sbb r9, QWORD PTR [r8+304] mov r10, QWORD PTR [rdx+312] - pext r11, r11, r9 - mov QWORD PTR [rcx+304], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+320] - mov r11, QWORD PTR [rdx+320] - pext r12, r12, r9 + mov QWORD PTR [rcx+304], r9 + sbb r10, QWORD PTR [r8+312] + mov r9, QWORD PTR [rdx+320] mov QWORD PTR [rcx+312], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+328] - mov r12, QWORD PTR [rdx+328] - pext r10, r10, r9 - mov QWORD PTR [rcx+320], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+336] - mov r10, QWORD PTR [rdx+336] - pext r11, r11, r9 - mov QWORD PTR [rcx+328], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+344] - mov r11, QWORD PTR [rdx+344] - pext r12, r12, r9 - mov QWORD PTR [rcx+336], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+352] - mov r12, QWORD PTR [rdx+352] - pext r10, r10, r9 - mov QWORD PTR [rcx+344], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+360] + sbb r9, QWORD PTR [r8+320] + mov r10, QWORD PTR [rdx+328] + mov QWORD PTR [rcx+320], r9 + sbb r10, QWORD PTR [r8+328] + mov r9, QWORD PTR [rdx+336] + mov QWORD PTR [rcx+328], r10 + sbb r9, QWORD PTR [r8+336] + mov r10, QWORD PTR [rdx+344] + mov QWORD PTR [rcx+336], r9 + sbb r10, QWORD PTR [r8+344] + mov r9, QWORD PTR [rdx+352] + mov QWORD PTR [rcx+344], r10 + sbb r9, QWORD PTR [r8+352] mov r10, QWORD PTR [rdx+360] - pext r11, r11, r9 - mov QWORD PTR [rcx+352], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+368] - mov r11, 
QWORD PTR [rdx+368] - pext r12, r12, r9 + mov QWORD PTR [rcx+352], r9 + sbb r10, QWORD PTR [r8+360] + mov r9, QWORD PTR [rdx+368] mov QWORD PTR [rcx+360], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+376] - mov r12, QWORD PTR [rdx+376] - pext r10, r10, r9 - mov QWORD PTR [rcx+368], r11 - sbb r12, r10 - mov QWORD PTR [rcx+376], r12 + sbb r9, QWORD PTR [r8+368] + mov r10, QWORD PTR [rdx+376] + mov QWORD PTR [rcx+368], r9 + sbb r10, QWORD PTR [r8+376] + mov QWORD PTR [rcx+376], r10 sbb rax, 0 - pop r12 ret -sp_3072_cond_sub_avx2_48 ENDP +sp_3072_sub_48 ENDP _text ENDS -ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Mul a by digit b into r. (r = a * b) ; * @@ -25574,6 +25472,265 @@ div_3072_word_asm_48 PROC div_3072_word_asm_48 ENDP _text ENDS ENDIF +IFDEF HAVE_INTEL_AVX2 +; /* Conditionally subtract b from a using the mask m. +; * m is -1 to subtract and 0 when not copying. +; * +; * r A single precision number representing condition subtract result. +; * a A single precision number to subtract from. +; * b A single precision number to subtract. +; * m Mask value to apply. +; */ +_text SEGMENT READONLY PARA +sp_3072_cond_sub_avx2_48 PROC + push r12 + mov rax, 0 + mov r12, QWORD PTR [r8] + mov r10, QWORD PTR [rdx] + pext r12, r12, r9 + sub r10, r12 + mov r12, QWORD PTR [r8+8] + mov r11, QWORD PTR [rdx+8] + pext r12, r12, r9 + mov QWORD PTR [rcx], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+16] + mov r12, QWORD PTR [rdx+16] + pext r10, r10, r9 + mov QWORD PTR [rcx+8], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+24] + mov r10, QWORD PTR [rdx+24] + pext r11, r11, r9 + mov QWORD PTR [rcx+16], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+32] + mov r11, QWORD PTR [rdx+32] + pext r12, r12, r9 + mov QWORD PTR [rcx+24], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+40] + mov r12, QWORD PTR [rdx+40] + pext r10, r10, r9 + mov QWORD PTR [rcx+32], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+48] + pext r11, r11, r9 + mov QWORD PTR [rcx+40], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+56] + mov r11, QWORD PTR [rdx+56] + pext r12, r12, r9 + mov QWORD PTR [rcx+48], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+64] + mov r12, QWORD PTR [rdx+64] + pext r10, r10, r9 + mov QWORD PTR [rcx+56], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+72] + mov r10, QWORD PTR [rdx+72] + pext r11, r11, r9 + mov QWORD PTR [rcx+64], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+80] + mov r11, QWORD PTR [rdx+80] + pext r12, r12, r9 + mov QWORD PTR [rcx+72], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+88] + mov r12, QWORD PTR [rdx+88] + pext r10, r10, r9 + mov QWORD PTR [rcx+80], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+96] + pext r11, r11, r9 + mov QWORD PTR [rcx+88], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+104] + mov r11, QWORD PTR [rdx+104] + pext r12, r12, r9 + mov QWORD PTR [rcx+96], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+112] + mov r12, QWORD PTR [rdx+112] + pext r10, r10, r9 + mov QWORD PTR [rcx+104], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+120] + mov r10, QWORD PTR [rdx+120] + pext r11, r11, r9 + mov QWORD PTR [rcx+112], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+128] + mov r11, QWORD PTR [rdx+128] + pext r12, r12, r9 + mov QWORD PTR [rcx+120], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+136] + mov r12, QWORD PTR [rdx+136] + pext r10, r10, r9 + mov QWORD PTR [rcx+128], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+144] + pext r11, r11, r9 + mov QWORD PTR [rcx+136], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+152] + mov r11, QWORD PTR [rdx+152] 
+ pext r12, r12, r9 + mov QWORD PTR [rcx+144], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+160] + mov r12, QWORD PTR [rdx+160] + pext r10, r10, r9 + mov QWORD PTR [rcx+152], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+168] + mov r10, QWORD PTR [rdx+168] + pext r11, r11, r9 + mov QWORD PTR [rcx+160], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+176] + mov r11, QWORD PTR [rdx+176] + pext r12, r12, r9 + mov QWORD PTR [rcx+168], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+184] + mov r12, QWORD PTR [rdx+184] + pext r10, r10, r9 + mov QWORD PTR [rcx+176], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+192] + pext r11, r11, r9 + mov QWORD PTR [rcx+184], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+200] + mov r11, QWORD PTR [rdx+200] + pext r12, r12, r9 + mov QWORD PTR [rcx+192], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+208] + mov r12, QWORD PTR [rdx+208] + pext r10, r10, r9 + mov QWORD PTR [rcx+200], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+216] + mov r10, QWORD PTR [rdx+216] + pext r11, r11, r9 + mov QWORD PTR [rcx+208], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+224] + mov r11, QWORD PTR [rdx+224] + pext r12, r12, r9 + mov QWORD PTR [rcx+216], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+232] + mov r12, QWORD PTR [rdx+232] + pext r10, r10, r9 + mov QWORD PTR [rcx+224], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+240] + pext r11, r11, r9 + mov QWORD PTR [rcx+232], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+248] + mov r11, QWORD PTR [rdx+248] + pext r12, r12, r9 + mov QWORD PTR [rcx+240], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+256] + mov r12, QWORD PTR [rdx+256] + pext r10, r10, r9 + mov QWORD PTR [rcx+248], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+264] + mov r10, QWORD PTR [rdx+264] + pext r11, r11, r9 + mov QWORD PTR [rcx+256], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+272] + mov r11, QWORD PTR [rdx+272] + pext r12, r12, r9 + mov QWORD PTR [rcx+264], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+280] + mov r12, QWORD PTR [rdx+280] + pext r10, r10, r9 + mov QWORD PTR [rcx+272], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+288] + mov r10, QWORD PTR [rdx+288] + pext r11, r11, r9 + mov QWORD PTR [rcx+280], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+296] + mov r11, QWORD PTR [rdx+296] + pext r12, r12, r9 + mov QWORD PTR [rcx+288], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+304] + mov r12, QWORD PTR [rdx+304] + pext r10, r10, r9 + mov QWORD PTR [rcx+296], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+312] + mov r10, QWORD PTR [rdx+312] + pext r11, r11, r9 + mov QWORD PTR [rcx+304], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+320] + mov r11, QWORD PTR [rdx+320] + pext r12, r12, r9 + mov QWORD PTR [rcx+312], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+328] + mov r12, QWORD PTR [rdx+328] + pext r10, r10, r9 + mov QWORD PTR [rcx+320], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+336] + mov r10, QWORD PTR [rdx+336] + pext r11, r11, r9 + mov QWORD PTR [rcx+328], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+344] + mov r11, QWORD PTR [rdx+344] + pext r12, r12, r9 + mov QWORD PTR [rcx+336], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+352] + mov r12, QWORD PTR [rdx+352] + pext r10, r10, r9 + mov QWORD PTR [rcx+344], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+360] + mov r10, QWORD PTR [rdx+360] + pext r11, r11, r9 + mov QWORD PTR [rcx+352], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+368] + mov r11, QWORD PTR [rdx+368] + pext r12, r12, r9 + mov QWORD PTR [rcx+360], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+376] + mov r12, QWORD PTR 
[rdx+376] + pext r10, r10, r9 + mov QWORD PTR [rcx+368], r11 + sbb r12, r10 + mov QWORD PTR [rcx+376], r12 + sbb rax, 0 + pop r12 + ret +sp_3072_cond_sub_avx2_48 ENDP +_text ENDS +ENDIF ; /* Compare a with b in constant time. ; * ; * a A single precision integer. @@ -25977,163 +26134,6 @@ sp_3072_cmp_48 PROC ret sp_3072_cmp_48 ENDP _text ENDS -; /* Sub b from a into r. (r = a - b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_sub_48 PROC - mov r9, QWORD PTR [rdx] - xor rax, rax - sub r9, QWORD PTR [r8] - mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 - sbb r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r10 - sbb r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - sbb r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - sbb r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - sbb r10, QWORD PTR [r8+40] - mov r9, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r10 - sbb r9, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r9 - sbb r10, QWORD PTR [r8+56] - mov r9, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r10 - sbb r9, QWORD PTR [r8+64] - mov r10, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r9 - sbb r10, QWORD PTR [r8+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r10 - sbb r9, QWORD PTR [r8+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r9 - sbb r10, QWORD PTR [r8+88] - mov r9, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r10 - sbb r9, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r9 - sbb r10, QWORD PTR [r8+104] - mov r9, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r10 - sbb r9, QWORD PTR [r8+112] - mov r10, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r9 - sbb r10, QWORD PTR [r8+120] - mov r9, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r10 - sbb r9, QWORD PTR [r8+128] - mov r10, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r9 - sbb r10, QWORD PTR [r8+136] - mov r9, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r10 - sbb r9, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r9 - sbb r10, QWORD PTR [r8+152] - mov r9, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r10 - sbb r9, QWORD PTR [r8+160] - mov r10, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r9 - sbb r10, QWORD PTR [r8+168] - mov r9, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r10 - sbb r9, QWORD PTR [r8+176] - mov r10, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r9 - sbb r10, QWORD PTR [r8+184] - mov r9, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r10 - sbb r9, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r9 - sbb r10, QWORD PTR [r8+200] - mov r9, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r10 - sbb r9, QWORD PTR [r8+208] - mov r10, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r9 - sbb r10, QWORD PTR [r8+216] - mov r9, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r10 - sbb r9, QWORD PTR [r8+224] - mov r10, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r9 - sbb r10, QWORD PTR [r8+232] - mov r9, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r10 - sbb r9, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r9 - sbb r10, QWORD PTR [r8+248] - mov r9, QWORD PTR [rdx+256] - mov QWORD PTR [rcx+248], r10 - sbb r9, QWORD PTR [r8+256] - mov r10, QWORD PTR [rdx+264] - mov QWORD PTR [rcx+256], r9 - sbb r10, QWORD PTR 
[r8+264] - mov r9, QWORD PTR [rdx+272] - mov QWORD PTR [rcx+264], r10 - sbb r9, QWORD PTR [r8+272] - mov r10, QWORD PTR [rdx+280] - mov QWORD PTR [rcx+272], r9 - sbb r10, QWORD PTR [r8+280] - mov r9, QWORD PTR [rdx+288] - mov QWORD PTR [rcx+280], r10 - sbb r9, QWORD PTR [r8+288] - mov r10, QWORD PTR [rdx+296] - mov QWORD PTR [rcx+288], r9 - sbb r10, QWORD PTR [r8+296] - mov r9, QWORD PTR [rdx+304] - mov QWORD PTR [rcx+296], r10 - sbb r9, QWORD PTR [r8+304] - mov r10, QWORD PTR [rdx+312] - mov QWORD PTR [rcx+304], r9 - sbb r10, QWORD PTR [r8+312] - mov r9, QWORD PTR [rdx+320] - mov QWORD PTR [rcx+312], r10 - sbb r9, QWORD PTR [r8+320] - mov r10, QWORD PTR [rdx+328] - mov QWORD PTR [rcx+320], r9 - sbb r10, QWORD PTR [r8+328] - mov r9, QWORD PTR [rdx+336] - mov QWORD PTR [rcx+328], r10 - sbb r9, QWORD PTR [r8+336] - mov r10, QWORD PTR [rdx+344] - mov QWORD PTR [rcx+336], r9 - sbb r10, QWORD PTR [r8+344] - mov r9, QWORD PTR [rdx+352] - mov QWORD PTR [rcx+344], r10 - sbb r9, QWORD PTR [r8+352] - mov r10, QWORD PTR [rdx+360] - mov QWORD PTR [rcx+352], r9 - sbb r10, QWORD PTR [r8+360] - mov r9, QWORD PTR [rdx+368] - mov QWORD PTR [rcx+360], r10 - sbb r9, QWORD PTR [r8+368] - mov r10, QWORD PTR [rdx+376] - mov QWORD PTR [rcx+368], r9 - sbb r10, QWORD PTR [r8+376] - mov QWORD PTR [rcx+376], r10 - sbb rax, 0 - ret -sp_3072_sub_48 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 3072 bits using Montgomery reduction. ; * @@ -34638,345 +34638,211 @@ ENDIF ret sp_4096_mont_reduce_64 ENDP _text ENDS -IFDEF HAVE_INTEL_AVX2 -; /* Conditionally subtract b from a using the mask m. -; * m is -1 to subtract and 0 when not copying. +; /* Sub b from a into r. (r = a - b) ; * -; * r A single precision number representing condition subtract result. -; * a A single precision number to subtract from. -; * b A single precision number to subtract. -; * m Mask value to apply. +; * r A single precision integer. +; * a A single precision integer. +; * b A single precision integer. 
; */ _text SEGMENT READONLY PARA -sp_4096_cond_sub_avx2_64 PROC - push r12 - mov rax, 0 - mov r12, QWORD PTR [r8] - mov r10, QWORD PTR [rdx] - pext r12, r12, r9 - sub r10, r12 - mov r12, QWORD PTR [r8+8] - mov r11, QWORD PTR [rdx+8] - pext r12, r12, r9 - mov QWORD PTR [rcx], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+16] - mov r12, QWORD PTR [rdx+16] - pext r10, r10, r9 - mov QWORD PTR [rcx+8], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+24] +sp_4096_sub_64 PROC + mov r9, QWORD PTR [rdx] + xor rax, rax + sub r9, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+8] + mov QWORD PTR [rcx], r9 + sbb r10, QWORD PTR [r8+8] + mov r9, QWORD PTR [rdx+16] + mov QWORD PTR [rcx+8], r10 + sbb r9, QWORD PTR [r8+16] mov r10, QWORD PTR [rdx+24] - pext r11, r11, r9 - mov QWORD PTR [rcx+16], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+32] - mov r11, QWORD PTR [rdx+32] - pext r12, r12, r9 + mov QWORD PTR [rcx+16], r9 + sbb r10, QWORD PTR [r8+24] + mov r9, QWORD PTR [rdx+32] mov QWORD PTR [rcx+24], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+40] - mov r12, QWORD PTR [rdx+40] - pext r10, r10, r9 - mov QWORD PTR [rcx+32], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+48] - pext r11, r11, r9 - mov QWORD PTR [rcx+40], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+56] - mov r11, QWORD PTR [rdx+56] - pext r12, r12, r9 - mov QWORD PTR [rcx+48], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+64] - mov r12, QWORD PTR [rdx+64] - pext r10, r10, r9 - mov QWORD PTR [rcx+56], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+72] + sbb r9, QWORD PTR [r8+32] + mov r10, QWORD PTR [rdx+40] + mov QWORD PTR [rcx+32], r9 + sbb r10, QWORD PTR [r8+40] + mov r9, QWORD PTR [rdx+48] + mov QWORD PTR [rcx+40], r10 + sbb r9, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+56] + mov QWORD PTR [rcx+48], r9 + sbb r10, QWORD PTR [r8+56] + mov r9, QWORD PTR [rdx+64] + mov QWORD PTR [rcx+56], r10 + sbb r9, QWORD PTR [r8+64] mov r10, QWORD PTR [rdx+72] - pext r11, r11, r9 - mov QWORD PTR [rcx+64], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+80] - mov r11, QWORD PTR [rdx+80] - pext r12, r12, r9 + mov QWORD PTR [rcx+64], r9 + sbb r10, QWORD PTR [r8+72] + mov r9, QWORD PTR [rdx+80] mov QWORD PTR [rcx+72], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+88] - mov r12, QWORD PTR [rdx+88] - pext r10, r10, r9 - mov QWORD PTR [rcx+80], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+96] - pext r11, r11, r9 - mov QWORD PTR [rcx+88], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+104] - mov r11, QWORD PTR [rdx+104] - pext r12, r12, r9 - mov QWORD PTR [rcx+96], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+112] - mov r12, QWORD PTR [rdx+112] - pext r10, r10, r9 - mov QWORD PTR [rcx+104], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+120] + sbb r9, QWORD PTR [r8+80] + mov r10, QWORD PTR [rdx+88] + mov QWORD PTR [rcx+80], r9 + sbb r10, QWORD PTR [r8+88] + mov r9, QWORD PTR [rdx+96] + mov QWORD PTR [rcx+88], r10 + sbb r9, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+104] + mov QWORD PTR [rcx+96], r9 + sbb r10, QWORD PTR [r8+104] + mov r9, QWORD PTR [rdx+112] + mov QWORD PTR [rcx+104], r10 + sbb r9, QWORD PTR [r8+112] mov r10, QWORD PTR [rdx+120] - pext r11, r11, r9 - mov QWORD PTR [rcx+112], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+128] - mov r11, QWORD PTR [rdx+128] - pext r12, r12, r9 + mov QWORD PTR [rcx+112], r9 + sbb r10, QWORD PTR [r8+120] + mov r9, QWORD PTR [rdx+128] mov QWORD PTR [rcx+120], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+136] - mov r12, QWORD PTR [rdx+136] - pext r10, r10, r9 - mov QWORD PTR [rcx+128], r11 - sbb 
r12, r10 - mov r11, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+144] - pext r11, r11, r9 - mov QWORD PTR [rcx+136], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+152] - mov r11, QWORD PTR [rdx+152] - pext r12, r12, r9 - mov QWORD PTR [rcx+144], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+160] - mov r12, QWORD PTR [rdx+160] - pext r10, r10, r9 - mov QWORD PTR [rcx+152], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+168] + sbb r9, QWORD PTR [r8+128] + mov r10, QWORD PTR [rdx+136] + mov QWORD PTR [rcx+128], r9 + sbb r10, QWORD PTR [r8+136] + mov r9, QWORD PTR [rdx+144] + mov QWORD PTR [rcx+136], r10 + sbb r9, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+152] + mov QWORD PTR [rcx+144], r9 + sbb r10, QWORD PTR [r8+152] + mov r9, QWORD PTR [rdx+160] + mov QWORD PTR [rcx+152], r10 + sbb r9, QWORD PTR [r8+160] mov r10, QWORD PTR [rdx+168] - pext r11, r11, r9 - mov QWORD PTR [rcx+160], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+176] - mov r11, QWORD PTR [rdx+176] - pext r12, r12, r9 + mov QWORD PTR [rcx+160], r9 + sbb r10, QWORD PTR [r8+168] + mov r9, QWORD PTR [rdx+176] mov QWORD PTR [rcx+168], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+184] - mov r12, QWORD PTR [rdx+184] - pext r10, r10, r9 - mov QWORD PTR [rcx+176], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+192] - pext r11, r11, r9 - mov QWORD PTR [rcx+184], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+200] - mov r11, QWORD PTR [rdx+200] - pext r12, r12, r9 - mov QWORD PTR [rcx+192], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+208] - mov r12, QWORD PTR [rdx+208] - pext r10, r10, r9 - mov QWORD PTR [rcx+200], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+216] + sbb r9, QWORD PTR [r8+176] + mov r10, QWORD PTR [rdx+184] + mov QWORD PTR [rcx+176], r9 + sbb r10, QWORD PTR [r8+184] + mov r9, QWORD PTR [rdx+192] + mov QWORD PTR [rcx+184], r10 + sbb r9, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+200] + mov QWORD PTR [rcx+192], r9 + sbb r10, QWORD PTR [r8+200] + mov r9, QWORD PTR [rdx+208] + mov QWORD PTR [rcx+200], r10 + sbb r9, QWORD PTR [r8+208] mov r10, QWORD PTR [rdx+216] - pext r11, r11, r9 - mov QWORD PTR [rcx+208], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+224] - mov r11, QWORD PTR [rdx+224] - pext r12, r12, r9 + mov QWORD PTR [rcx+208], r9 + sbb r10, QWORD PTR [r8+216] + mov r9, QWORD PTR [rdx+224] mov QWORD PTR [rcx+216], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+232] - mov r12, QWORD PTR [rdx+232] - pext r10, r10, r9 - mov QWORD PTR [rcx+224], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+240] - pext r11, r11, r9 - mov QWORD PTR [rcx+232], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+248] - mov r11, QWORD PTR [rdx+248] - pext r12, r12, r9 - mov QWORD PTR [rcx+240], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+256] - mov r12, QWORD PTR [rdx+256] - pext r10, r10, r9 - mov QWORD PTR [rcx+248], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+264] + sbb r9, QWORD PTR [r8+224] + mov r10, QWORD PTR [rdx+232] + mov QWORD PTR [rcx+224], r9 + sbb r10, QWORD PTR [r8+232] + mov r9, QWORD PTR [rdx+240] + mov QWORD PTR [rcx+232], r10 + sbb r9, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+248] + mov QWORD PTR [rcx+240], r9 + sbb r10, QWORD PTR [r8+248] + mov r9, QWORD PTR [rdx+256] + mov QWORD PTR [rcx+248], r10 + sbb r9, QWORD PTR [r8+256] mov r10, QWORD PTR [rdx+264] - pext r11, r11, r9 - mov QWORD PTR [rcx+256], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+272] - mov r11, QWORD PTR [rdx+272] - pext r12, r12, r9 + mov QWORD PTR [rcx+256], r9 + sbb r10, QWORD PTR [r8+264] + mov r9, QWORD PTR [rdx+272] 
mov QWORD PTR [rcx+264], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+280] - mov r12, QWORD PTR [rdx+280] - pext r10, r10, r9 - mov QWORD PTR [rcx+272], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+288] - mov r10, QWORD PTR [rdx+288] - pext r11, r11, r9 - mov QWORD PTR [rcx+280], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+296] - mov r11, QWORD PTR [rdx+296] - pext r12, r12, r9 - mov QWORD PTR [rcx+288], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+304] - mov r12, QWORD PTR [rdx+304] - pext r10, r10, r9 - mov QWORD PTR [rcx+296], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+312] + sbb r9, QWORD PTR [r8+272] + mov r10, QWORD PTR [rdx+280] + mov QWORD PTR [rcx+272], r9 + sbb r10, QWORD PTR [r8+280] + mov r9, QWORD PTR [rdx+288] + mov QWORD PTR [rcx+280], r10 + sbb r9, QWORD PTR [r8+288] + mov r10, QWORD PTR [rdx+296] + mov QWORD PTR [rcx+288], r9 + sbb r10, QWORD PTR [r8+296] + mov r9, QWORD PTR [rdx+304] + mov QWORD PTR [rcx+296], r10 + sbb r9, QWORD PTR [r8+304] mov r10, QWORD PTR [rdx+312] - pext r11, r11, r9 - mov QWORD PTR [rcx+304], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+320] - mov r11, QWORD PTR [rdx+320] - pext r12, r12, r9 + mov QWORD PTR [rcx+304], r9 + sbb r10, QWORD PTR [r8+312] + mov r9, QWORD PTR [rdx+320] mov QWORD PTR [rcx+312], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+328] - mov r12, QWORD PTR [rdx+328] - pext r10, r10, r9 - mov QWORD PTR [rcx+320], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+336] - mov r10, QWORD PTR [rdx+336] - pext r11, r11, r9 - mov QWORD PTR [rcx+328], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+344] - mov r11, QWORD PTR [rdx+344] - pext r12, r12, r9 - mov QWORD PTR [rcx+336], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+352] - mov r12, QWORD PTR [rdx+352] - pext r10, r10, r9 - mov QWORD PTR [rcx+344], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+360] + sbb r9, QWORD PTR [r8+320] + mov r10, QWORD PTR [rdx+328] + mov QWORD PTR [rcx+320], r9 + sbb r10, QWORD PTR [r8+328] + mov r9, QWORD PTR [rdx+336] + mov QWORD PTR [rcx+328], r10 + sbb r9, QWORD PTR [r8+336] + mov r10, QWORD PTR [rdx+344] + mov QWORD PTR [rcx+336], r9 + sbb r10, QWORD PTR [r8+344] + mov r9, QWORD PTR [rdx+352] + mov QWORD PTR [rcx+344], r10 + sbb r9, QWORD PTR [r8+352] mov r10, QWORD PTR [rdx+360] - pext r11, r11, r9 - mov QWORD PTR [rcx+352], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+368] - mov r11, QWORD PTR [rdx+368] - pext r12, r12, r9 + mov QWORD PTR [rcx+352], r9 + sbb r10, QWORD PTR [r8+360] + mov r9, QWORD PTR [rdx+368] mov QWORD PTR [rcx+360], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+376] - mov r12, QWORD PTR [rdx+376] - pext r10, r10, r9 - mov QWORD PTR [rcx+368], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+384] - mov r10, QWORD PTR [rdx+384] - pext r11, r11, r9 - mov QWORD PTR [rcx+376], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+392] - mov r11, QWORD PTR [rdx+392] - pext r12, r12, r9 - mov QWORD PTR [rcx+384], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+400] - mov r12, QWORD PTR [rdx+400] - pext r10, r10, r9 - mov QWORD PTR [rcx+392], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+408] + sbb r9, QWORD PTR [r8+368] + mov r10, QWORD PTR [rdx+376] + mov QWORD PTR [rcx+368], r9 + sbb r10, QWORD PTR [r8+376] + mov r9, QWORD PTR [rdx+384] + mov QWORD PTR [rcx+376], r10 + sbb r9, QWORD PTR [r8+384] + mov r10, QWORD PTR [rdx+392] + mov QWORD PTR [rcx+384], r9 + sbb r10, QWORD PTR [r8+392] + mov r9, QWORD PTR [rdx+400] + mov QWORD PTR [rcx+392], r10 + sbb r9, QWORD PTR [r8+400] mov r10, QWORD PTR [rdx+408] - pext r11, r11, r9 - mov QWORD PTR [rcx+400], r12 - sbb r10, r11 - mov 
r12, QWORD PTR [r8+416] - mov r11, QWORD PTR [rdx+416] - pext r12, r12, r9 + mov QWORD PTR [rcx+400], r9 + sbb r10, QWORD PTR [r8+408] + mov r9, QWORD PTR [rdx+416] mov QWORD PTR [rcx+408], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+424] - mov r12, QWORD PTR [rdx+424] - pext r10, r10, r9 - mov QWORD PTR [rcx+416], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+432] - mov r10, QWORD PTR [rdx+432] - pext r11, r11, r9 - mov QWORD PTR [rcx+424], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+440] - mov r11, QWORD PTR [rdx+440] - pext r12, r12, r9 - mov QWORD PTR [rcx+432], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+448] - mov r12, QWORD PTR [rdx+448] - pext r10, r10, r9 - mov QWORD PTR [rcx+440], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+456] + sbb r9, QWORD PTR [r8+416] + mov r10, QWORD PTR [rdx+424] + mov QWORD PTR [rcx+416], r9 + sbb r10, QWORD PTR [r8+424] + mov r9, QWORD PTR [rdx+432] + mov QWORD PTR [rcx+424], r10 + sbb r9, QWORD PTR [r8+432] + mov r10, QWORD PTR [rdx+440] + mov QWORD PTR [rcx+432], r9 + sbb r10, QWORD PTR [r8+440] + mov r9, QWORD PTR [rdx+448] + mov QWORD PTR [rcx+440], r10 + sbb r9, QWORD PTR [r8+448] mov r10, QWORD PTR [rdx+456] - pext r11, r11, r9 - mov QWORD PTR [rcx+448], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+464] - mov r11, QWORD PTR [rdx+464] - pext r12, r12, r9 + mov QWORD PTR [rcx+448], r9 + sbb r10, QWORD PTR [r8+456] + mov r9, QWORD PTR [rdx+464] mov QWORD PTR [rcx+456], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+472] - mov r12, QWORD PTR [rdx+472] - pext r10, r10, r9 - mov QWORD PTR [rcx+464], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+480] - mov r10, QWORD PTR [rdx+480] - pext r11, r11, r9 - mov QWORD PTR [rcx+472], r12 - sbb r10, r11 - mov r12, QWORD PTR [r8+488] - mov r11, QWORD PTR [rdx+488] - pext r12, r12, r9 - mov QWORD PTR [rcx+480], r10 - sbb r11, r12 - mov r10, QWORD PTR [r8+496] - mov r12, QWORD PTR [rdx+496] - pext r10, r10, r9 - mov QWORD PTR [rcx+488], r11 - sbb r12, r10 - mov r11, QWORD PTR [r8+504] + sbb r9, QWORD PTR [r8+464] + mov r10, QWORD PTR [rdx+472] + mov QWORD PTR [rcx+464], r9 + sbb r10, QWORD PTR [r8+472] + mov r9, QWORD PTR [rdx+480] + mov QWORD PTR [rcx+472], r10 + sbb r9, QWORD PTR [r8+480] + mov r10, QWORD PTR [rdx+488] + mov QWORD PTR [rcx+480], r9 + sbb r10, QWORD PTR [r8+488] + mov r9, QWORD PTR [rdx+496] + mov QWORD PTR [rcx+488], r10 + sbb r9, QWORD PTR [r8+496] mov r10, QWORD PTR [rdx+504] - pext r11, r11, r9 - mov QWORD PTR [rcx+496], r12 - sbb r10, r11 + mov QWORD PTR [rcx+496], r9 + sbb r10, QWORD PTR [r8+504] mov QWORD PTR [rcx+504], r10 sbb rax, 0 - pop r12 ret -sp_4096_cond_sub_avx2_64 ENDP +sp_4096_sub_64 ENDP _text ENDS -ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Mul a by digit b into r. (r = a * b) ; * @@ -35398,6 +35264,345 @@ div_4096_word_asm_64 PROC div_4096_word_asm_64 ENDP _text ENDS ENDIF +IFDEF HAVE_INTEL_AVX2 +; /* Conditionally subtract b from a using the mask m. +; * m is -1 to subtract and 0 when not copying. +; * +; * r A single precision number representing condition subtract result. +; * a A single precision number to subtract from. +; * b A single precision number to subtract. +; * m Mask value to apply. 
+; */ +_text SEGMENT READONLY PARA +sp_4096_cond_sub_avx2_64 PROC + push r12 + mov rax, 0 + mov r12, QWORD PTR [r8] + mov r10, QWORD PTR [rdx] + pext r12, r12, r9 + sub r10, r12 + mov r12, QWORD PTR [r8+8] + mov r11, QWORD PTR [rdx+8] + pext r12, r12, r9 + mov QWORD PTR [rcx], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+16] + mov r12, QWORD PTR [rdx+16] + pext r10, r10, r9 + mov QWORD PTR [rcx+8], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+24] + mov r10, QWORD PTR [rdx+24] + pext r11, r11, r9 + mov QWORD PTR [rcx+16], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+32] + mov r11, QWORD PTR [rdx+32] + pext r12, r12, r9 + mov QWORD PTR [rcx+24], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+40] + mov r12, QWORD PTR [rdx+40] + pext r10, r10, r9 + mov QWORD PTR [rcx+32], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+48] + mov r10, QWORD PTR [rdx+48] + pext r11, r11, r9 + mov QWORD PTR [rcx+40], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+56] + mov r11, QWORD PTR [rdx+56] + pext r12, r12, r9 + mov QWORD PTR [rcx+48], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+64] + mov r12, QWORD PTR [rdx+64] + pext r10, r10, r9 + mov QWORD PTR [rcx+56], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+72] + mov r10, QWORD PTR [rdx+72] + pext r11, r11, r9 + mov QWORD PTR [rcx+64], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+80] + mov r11, QWORD PTR [rdx+80] + pext r12, r12, r9 + mov QWORD PTR [rcx+72], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+88] + mov r12, QWORD PTR [rdx+88] + pext r10, r10, r9 + mov QWORD PTR [rcx+80], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+96] + mov r10, QWORD PTR [rdx+96] + pext r11, r11, r9 + mov QWORD PTR [rcx+88], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+104] + mov r11, QWORD PTR [rdx+104] + pext r12, r12, r9 + mov QWORD PTR [rcx+96], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+112] + mov r12, QWORD PTR [rdx+112] + pext r10, r10, r9 + mov QWORD PTR [rcx+104], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+120] + mov r10, QWORD PTR [rdx+120] + pext r11, r11, r9 + mov QWORD PTR [rcx+112], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+128] + mov r11, QWORD PTR [rdx+128] + pext r12, r12, r9 + mov QWORD PTR [rcx+120], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+136] + mov r12, QWORD PTR [rdx+136] + pext r10, r10, r9 + mov QWORD PTR [rcx+128], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+144] + mov r10, QWORD PTR [rdx+144] + pext r11, r11, r9 + mov QWORD PTR [rcx+136], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+152] + mov r11, QWORD PTR [rdx+152] + pext r12, r12, r9 + mov QWORD PTR [rcx+144], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+160] + mov r12, QWORD PTR [rdx+160] + pext r10, r10, r9 + mov QWORD PTR [rcx+152], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+168] + mov r10, QWORD PTR [rdx+168] + pext r11, r11, r9 + mov QWORD PTR [rcx+160], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+176] + mov r11, QWORD PTR [rdx+176] + pext r12, r12, r9 + mov QWORD PTR [rcx+168], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+184] + mov r12, QWORD PTR [rdx+184] + pext r10, r10, r9 + mov QWORD PTR [rcx+176], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+192] + mov r10, QWORD PTR [rdx+192] + pext r11, r11, r9 + mov QWORD PTR [rcx+184], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+200] + mov r11, QWORD PTR [rdx+200] + pext r12, r12, r9 + mov QWORD PTR [rcx+192], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+208] + mov r12, QWORD PTR [rdx+208] + pext r10, r10, r9 + mov QWORD PTR [rcx+200], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+216] + mov r10, QWORD PTR [rdx+216] + pext r11, r11, r9 + mov QWORD PTR [rcx+208], r12 + sbb 
r10, r11 + mov r12, QWORD PTR [r8+224] + mov r11, QWORD PTR [rdx+224] + pext r12, r12, r9 + mov QWORD PTR [rcx+216], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+232] + mov r12, QWORD PTR [rdx+232] + pext r10, r10, r9 + mov QWORD PTR [rcx+224], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+240] + mov r10, QWORD PTR [rdx+240] + pext r11, r11, r9 + mov QWORD PTR [rcx+232], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+248] + mov r11, QWORD PTR [rdx+248] + pext r12, r12, r9 + mov QWORD PTR [rcx+240], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+256] + mov r12, QWORD PTR [rdx+256] + pext r10, r10, r9 + mov QWORD PTR [rcx+248], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+264] + mov r10, QWORD PTR [rdx+264] + pext r11, r11, r9 + mov QWORD PTR [rcx+256], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+272] + mov r11, QWORD PTR [rdx+272] + pext r12, r12, r9 + mov QWORD PTR [rcx+264], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+280] + mov r12, QWORD PTR [rdx+280] + pext r10, r10, r9 + mov QWORD PTR [rcx+272], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+288] + mov r10, QWORD PTR [rdx+288] + pext r11, r11, r9 + mov QWORD PTR [rcx+280], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+296] + mov r11, QWORD PTR [rdx+296] + pext r12, r12, r9 + mov QWORD PTR [rcx+288], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+304] + mov r12, QWORD PTR [rdx+304] + pext r10, r10, r9 + mov QWORD PTR [rcx+296], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+312] + mov r10, QWORD PTR [rdx+312] + pext r11, r11, r9 + mov QWORD PTR [rcx+304], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+320] + mov r11, QWORD PTR [rdx+320] + pext r12, r12, r9 + mov QWORD PTR [rcx+312], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+328] + mov r12, QWORD PTR [rdx+328] + pext r10, r10, r9 + mov QWORD PTR [rcx+320], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+336] + mov r10, QWORD PTR [rdx+336] + pext r11, r11, r9 + mov QWORD PTR [rcx+328], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+344] + mov r11, QWORD PTR [rdx+344] + pext r12, r12, r9 + mov QWORD PTR [rcx+336], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+352] + mov r12, QWORD PTR [rdx+352] + pext r10, r10, r9 + mov QWORD PTR [rcx+344], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+360] + mov r10, QWORD PTR [rdx+360] + pext r11, r11, r9 + mov QWORD PTR [rcx+352], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+368] + mov r11, QWORD PTR [rdx+368] + pext r12, r12, r9 + mov QWORD PTR [rcx+360], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+376] + mov r12, QWORD PTR [rdx+376] + pext r10, r10, r9 + mov QWORD PTR [rcx+368], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+384] + mov r10, QWORD PTR [rdx+384] + pext r11, r11, r9 + mov QWORD PTR [rcx+376], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+392] + mov r11, QWORD PTR [rdx+392] + pext r12, r12, r9 + mov QWORD PTR [rcx+384], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+400] + mov r12, QWORD PTR [rdx+400] + pext r10, r10, r9 + mov QWORD PTR [rcx+392], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+408] + mov r10, QWORD PTR [rdx+408] + pext r11, r11, r9 + mov QWORD PTR [rcx+400], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+416] + mov r11, QWORD PTR [rdx+416] + pext r12, r12, r9 + mov QWORD PTR [rcx+408], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+424] + mov r12, QWORD PTR [rdx+424] + pext r10, r10, r9 + mov QWORD PTR [rcx+416], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+432] + mov r10, QWORD PTR [rdx+432] + pext r11, r11, r9 + mov QWORD PTR [rcx+424], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+440] + mov r11, QWORD PTR [rdx+440] + pext r12, r12, r9 + mov QWORD PTR [rcx+432], r10 + 
sbb r11, r12 + mov r10, QWORD PTR [r8+448] + mov r12, QWORD PTR [rdx+448] + pext r10, r10, r9 + mov QWORD PTR [rcx+440], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+456] + mov r10, QWORD PTR [rdx+456] + pext r11, r11, r9 + mov QWORD PTR [rcx+448], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+464] + mov r11, QWORD PTR [rdx+464] + pext r12, r12, r9 + mov QWORD PTR [rcx+456], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+472] + mov r12, QWORD PTR [rdx+472] + pext r10, r10, r9 + mov QWORD PTR [rcx+464], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+480] + mov r10, QWORD PTR [rdx+480] + pext r11, r11, r9 + mov QWORD PTR [rcx+472], r12 + sbb r10, r11 + mov r12, QWORD PTR [r8+488] + mov r11, QWORD PTR [rdx+488] + pext r12, r12, r9 + mov QWORD PTR [rcx+480], r10 + sbb r11, r12 + mov r10, QWORD PTR [r8+496] + mov r12, QWORD PTR [rdx+496] + pext r10, r10, r9 + mov QWORD PTR [rcx+488], r11 + sbb r12, r10 + mov r11, QWORD PTR [r8+504] + mov r10, QWORD PTR [rdx+504] + pext r11, r11, r9 + mov QWORD PTR [rcx+496], r12 + sbb r10, r11 + mov QWORD PTR [rcx+504], r10 + sbb rax, 0 + pop r12 + ret +sp_4096_cond_sub_avx2_64 ENDP +_text ENDS +ENDIF ; /* Compare a with b in constant time. ; * ; * a A single precision integer. @@ -35929,211 +36134,6 @@ sp_4096_cmp_64 PROC ret sp_4096_cmp_64 ENDP _text ENDS -; /* Sub b from a into r. (r = a - b) -; * -; * r A single precision integer. -; * a A single precision integer. -; * b A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_4096_sub_64 PROC - mov r9, QWORD PTR [rdx] - xor rax, rax - sub r9, QWORD PTR [r8] - mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 - sbb r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r10 - sbb r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - sbb r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - sbb r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - sbb r10, QWORD PTR [r8+40] - mov r9, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r10 - sbb r9, QWORD PTR [r8+48] - mov r10, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r9 - sbb r10, QWORD PTR [r8+56] - mov r9, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r10 - sbb r9, QWORD PTR [r8+64] - mov r10, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r9 - sbb r10, QWORD PTR [r8+72] - mov r9, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r10 - sbb r9, QWORD PTR [r8+80] - mov r10, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r9 - sbb r10, QWORD PTR [r8+88] - mov r9, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r10 - sbb r9, QWORD PTR [r8+96] - mov r10, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r9 - sbb r10, QWORD PTR [r8+104] - mov r9, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r10 - sbb r9, QWORD PTR [r8+112] - mov r10, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r9 - sbb r10, QWORD PTR [r8+120] - mov r9, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r10 - sbb r9, QWORD PTR [r8+128] - mov r10, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r9 - sbb r10, QWORD PTR [r8+136] - mov r9, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r10 - sbb r9, QWORD PTR [r8+144] - mov r10, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r9 - sbb r10, QWORD PTR [r8+152] - mov r9, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r10 - sbb r9, QWORD PTR [r8+160] - mov r10, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r9 - sbb r10, QWORD PTR [r8+168] - mov r9, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r10 - sbb r9, QWORD PTR [r8+176] - mov r10, QWORD PTR [rdx+184] - mov 
QWORD PTR [rcx+176], r9 - sbb r10, QWORD PTR [r8+184] - mov r9, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r10 - sbb r9, QWORD PTR [r8+192] - mov r10, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r9 - sbb r10, QWORD PTR [r8+200] - mov r9, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r10 - sbb r9, QWORD PTR [r8+208] - mov r10, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r9 - sbb r10, QWORD PTR [r8+216] - mov r9, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r10 - sbb r9, QWORD PTR [r8+224] - mov r10, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r9 - sbb r10, QWORD PTR [r8+232] - mov r9, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r10 - sbb r9, QWORD PTR [r8+240] - mov r10, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r9 - sbb r10, QWORD PTR [r8+248] - mov r9, QWORD PTR [rdx+256] - mov QWORD PTR [rcx+248], r10 - sbb r9, QWORD PTR [r8+256] - mov r10, QWORD PTR [rdx+264] - mov QWORD PTR [rcx+256], r9 - sbb r10, QWORD PTR [r8+264] - mov r9, QWORD PTR [rdx+272] - mov QWORD PTR [rcx+264], r10 - sbb r9, QWORD PTR [r8+272] - mov r10, QWORD PTR [rdx+280] - mov QWORD PTR [rcx+272], r9 - sbb r10, QWORD PTR [r8+280] - mov r9, QWORD PTR [rdx+288] - mov QWORD PTR [rcx+280], r10 - sbb r9, QWORD PTR [r8+288] - mov r10, QWORD PTR [rdx+296] - mov QWORD PTR [rcx+288], r9 - sbb r10, QWORD PTR [r8+296] - mov r9, QWORD PTR [rdx+304] - mov QWORD PTR [rcx+296], r10 - sbb r9, QWORD PTR [r8+304] - mov r10, QWORD PTR [rdx+312] - mov QWORD PTR [rcx+304], r9 - sbb r10, QWORD PTR [r8+312] - mov r9, QWORD PTR [rdx+320] - mov QWORD PTR [rcx+312], r10 - sbb r9, QWORD PTR [r8+320] - mov r10, QWORD PTR [rdx+328] - mov QWORD PTR [rcx+320], r9 - sbb r10, QWORD PTR [r8+328] - mov r9, QWORD PTR [rdx+336] - mov QWORD PTR [rcx+328], r10 - sbb r9, QWORD PTR [r8+336] - mov r10, QWORD PTR [rdx+344] - mov QWORD PTR [rcx+336], r9 - sbb r10, QWORD PTR [r8+344] - mov r9, QWORD PTR [rdx+352] - mov QWORD PTR [rcx+344], r10 - sbb r9, QWORD PTR [r8+352] - mov r10, QWORD PTR [rdx+360] - mov QWORD PTR [rcx+352], r9 - sbb r10, QWORD PTR [r8+360] - mov r9, QWORD PTR [rdx+368] - mov QWORD PTR [rcx+360], r10 - sbb r9, QWORD PTR [r8+368] - mov r10, QWORD PTR [rdx+376] - mov QWORD PTR [rcx+368], r9 - sbb r10, QWORD PTR [r8+376] - mov r9, QWORD PTR [rdx+384] - mov QWORD PTR [rcx+376], r10 - sbb r9, QWORD PTR [r8+384] - mov r10, QWORD PTR [rdx+392] - mov QWORD PTR [rcx+384], r9 - sbb r10, QWORD PTR [r8+392] - mov r9, QWORD PTR [rdx+400] - mov QWORD PTR [rcx+392], r10 - sbb r9, QWORD PTR [r8+400] - mov r10, QWORD PTR [rdx+408] - mov QWORD PTR [rcx+400], r9 - sbb r10, QWORD PTR [r8+408] - mov r9, QWORD PTR [rdx+416] - mov QWORD PTR [rcx+408], r10 - sbb r9, QWORD PTR [r8+416] - mov r10, QWORD PTR [rdx+424] - mov QWORD PTR [rcx+416], r9 - sbb r10, QWORD PTR [r8+424] - mov r9, QWORD PTR [rdx+432] - mov QWORD PTR [rcx+424], r10 - sbb r9, QWORD PTR [r8+432] - mov r10, QWORD PTR [rdx+440] - mov QWORD PTR [rcx+432], r9 - sbb r10, QWORD PTR [r8+440] - mov r9, QWORD PTR [rdx+448] - mov QWORD PTR [rcx+440], r10 - sbb r9, QWORD PTR [r8+448] - mov r10, QWORD PTR [rdx+456] - mov QWORD PTR [rcx+448], r9 - sbb r10, QWORD PTR [r8+456] - mov r9, QWORD PTR [rdx+464] - mov QWORD PTR [rcx+456], r10 - sbb r9, QWORD PTR [r8+464] - mov r10, QWORD PTR [rdx+472] - mov QWORD PTR [rcx+464], r9 - sbb r10, QWORD PTR [r8+472] - mov r9, QWORD PTR [rdx+480] - mov QWORD PTR [rcx+472], r10 - sbb r9, QWORD PTR [r8+480] - mov r10, QWORD PTR [rdx+488] - mov QWORD PTR [rcx+480], r9 - sbb r10, QWORD PTR [r8+488] - mov r9, QWORD PTR [rdx+496] - mov QWORD PTR [rcx+488], r10 
- sbb r9, QWORD PTR [r8+496] - mov r10, QWORD PTR [rdx+504] - mov QWORD PTR [rcx+496], r9 - sbb r10, QWORD PTR [r8+504] - mov QWORD PTR [rcx+504], r10 - sbb rax, 0 - ret -sp_4096_sub_64 ENDP -_text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 4096 bits using Montgomery reduction. ; * diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index f94634558..7d9806fa0 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -14122,8 +14122,10 @@ exit_rsa_even_mod: (void)out; (void)outSz; +#ifndef WOLFSSL_RSA_PUBLIC_ONLY (void)plain; (void)plainSz; +#endif (void)inLen; (void)rng; diff --git a/wolfssl/wolfcrypt/sp_int.h b/wolfssl/wolfcrypt/sp_int.h index 0024f965f..4ea51f29d 100644 --- a/wolfssl/wolfcrypt/sp_int.h +++ b/wolfssl/wolfcrypt/sp_int.h @@ -806,7 +806,8 @@ MP_API int sp_add_d(sp_int* a, sp_int_digit d, sp_int* r); MP_API int sp_sub_d(sp_int* a, sp_int_digit d, sp_int* r); MP_API int sp_mul_d(sp_int* a, sp_int_digit d, sp_int* r); #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \ - defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) + defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \ + defined(WC_MP_TO_RADIX) MP_API int sp_div_d(sp_int* a, sp_int_digit d, sp_int* r, sp_int_digit* rem); #endif #if defined(WOLFSSL_SP_MATH_ALL) || (defined(HAVE_ECC) && \
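
For reference, the hand-written x86_64 routines this patch shuffles between the AVX2-guarded and unguarded regions implement two word-array primitives: a plain multi-word subtract with borrow (sp_3072_sub_48 / sp_4096_sub_64) and a branch-free conditional subtract driven by an all-ones/zero mask (sp_3072_cond_sub_avx2_48 / sp_4096_cond_sub_avx2_64, where PEXT against the mask yields either b[i] or 0). The sketch below is a minimal portable C rendering of those semantics only; the helper names, the fixed uint64_t digit width and the loop form are illustrative assumptions, not the generated wolfSSL code.

/* Sketch only: portable equivalents of the fixed-size assembly subtract
 * routines above. sp_digit is assumed to be a 64-bit word here. */
#include <stdint.h>

typedef uint64_t sp_digit;

/* r = a - b over n words; returns 0 on no borrow and all-ones on borrow,
 * mirroring the assembly's trailing "sbb rax, 0". */
static sp_digit word_sub(sp_digit* r, const sp_digit* a, const sp_digit* b,
                         int n)
{
    sp_digit borrow = 0;
    for (int i = 0; i < n; i++) {
        sp_digit bi = b[i] + borrow;          /* wraps to 0 only if b[i] is all-ones */
        sp_digit nb = (sp_digit)(bi < b[i]);  /* borrow from that wrap ...           */
        nb |= (sp_digit)(a[i] < bi);          /* ... or from the subtract itself     */
        r[i] = a[i] - bi;
        borrow = nb;
    }
    return (sp_digit)0 - borrow;
}

/* r = a - (b & m) over n words; m is 0 or all-ones, so the subtrahend is
 * selected without branching (the AVX2 assembly uses PEXT with m for the
 * same effect). */
static sp_digit word_cond_sub(sp_digit* r, const sp_digit* a, const sp_digit* b,
                              int n, sp_digit m)
{
    sp_digit borrow = 0;
    for (int i = 0; i < n; i++) {
        sp_digit bm = b[i] & m;               /* b[i] when m is all-ones, else 0 */
        sp_digit bi = bm + borrow;
        sp_digit nb = (sp_digit)(bi < bm);
        nb |= (sp_digit)(a[i] < bi);
        r[i] = a[i] - bi;
        borrow = nb;
    }
    return (sp_digit)0 - borrow;
}

With m = 0 the conditional variant degenerates to copying a into r and returning 0, which is the behaviour the Montgomery-reduction callers rely on when no final subtraction is required.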