From 605d70111372766c3a386998b9cd1dba0094878a Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 25 Feb 2022 10:25:45 +1000 Subject: [PATCH] SP P521: hash len needs special attention when 521 bits Need to right shift number down when hash is more than 521 bits. Previously handled at a byte level and now at bit level. Always return err from sp_*_ecc_mulmod_add_only_*(). When ECC add and double points are public and only have SP implementation, check that the point ordinates are the right size. --- wolfcrypt/src/ecc.c | 37 +- wolfcrypt/src/sp_arm32.c | 218 ++- wolfcrypt/src/sp_arm64.c | 42 +- wolfcrypt/src/sp_armthumb.c | 740 ++++---- wolfcrypt/src/sp_c32.c | 82 +- wolfcrypt/src/sp_c64.c | 58 +- wolfcrypt/src/sp_cortexm.c | 220 ++- wolfcrypt/src/sp_x86_64.c | 73 +- wolfcrypt/src/sp_x86_64_asm.S | 3159 ++++++++++++++++--------------- wolfcrypt/src/sp_x86_64_asm.asm | 3093 +++++++++++++++--------------- 10 files changed, 4052 insertions(+), 3670 deletions(-) diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c index 643d440a4..84da56266 100644 --- a/wolfcrypt/src/ecc.c +++ b/wolfcrypt/src/ecc.c @@ -1971,27 +1971,41 @@ done: return err; #else + int modBits; + if (P == NULL || Q == NULL || R == NULL || modulus == NULL) { return ECC_BAD_ARG_E; } + modBits = mp_count_bits(modulus); +#ifdef WOLFSSL_PUBLIC_ECC_ADD_DBL + if ((mp_count_bits(P->x) > modBits) || + (mp_count_bits(P->y) > modBits) || + (mp_count_bits(P->z) > modBits) || + (mp_count_bits(Q->x) > modBits) || + (mp_count_bits(Q->y) > modBits) || + (mp_count_bits(Q->z) > modBits)) { + return ECC_OUT_OF_RANGE_E; + } +#endif + (void)a; (void)mp; #ifndef WOLFSSL_SP_NO_256 - if (mp_count_bits(modulus) == 256) { + if (modBits == 256) { return sp_ecc_proj_add_point_256(P->x, P->y, P->z, Q->x, Q->y, Q->z, R->x, R->y, R->z); } #endif #ifdef WOLFSSL_SP_384 - if (mp_count_bits(modulus) == 384) { + if (modBits == 384) { return sp_ecc_proj_add_point_384(P->x, P->y, P->z, Q->x, Q->y, Q->z, R->x, R->y, R->z); } #endif #ifdef WOLFSSL_SP_521 - if (mp_count_bits(modulus) == 521) { + if (modBits == 521) { return sp_ecc_proj_add_point_521(P->x, P->y, P->z, Q->x, Q->y, Q->z, R->x, R->y, R->z); } @@ -2315,24 +2329,35 @@ int ecc_projective_dbl_point(ecc_point *P, ecc_point *R, mp_int* a, return err; #else + int modBits; + if (P == NULL || R == NULL || modulus == NULL) return ECC_BAD_ARG_E; + modBits = mp_count_bits(modulus); +#ifdef WOLFSSL_PUBLIC_ECC_ADD_DBL + if ((mp_count_bits(P->x) > modBits) || + (mp_count_bits(P->y) > modBits) || + (mp_count_bits(P->z) > modBits)) { + return ECC_OUT_OF_RANGE_E; + } +#endif + (void)a; (void)mp; #ifndef WOLFSSL_SP_NO_256 - if (mp_count_bits(modulus) == 256) { + if (modBits == 256) { return sp_ecc_proj_dbl_point_256(P->x, P->y, P->z, R->x, R->y, R->z); } #endif #ifdef WOLFSSL_SP_384 - if (mp_count_bits(modulus) == 384) { + if (modBits == 384) { return sp_ecc_proj_dbl_point_384(P->x, P->y, P->z, R->x, R->y, R->z); } #endif #ifdef WOLFSSL_SP_521 - if (mp_count_bits(modulus) == 521) { + if (modBits == 521) { return sp_ecc_proj_dbl_point_521(P->x, P->y, P->z, R->x, R->y, R->z); } #endif diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index bd4a31b2b..296541be6 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -34578,6 +34578,8 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) 
|| defined(HAVE_ECC_VERIFY) @@ -43653,6 +43655,8 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -55681,6 +55685,99 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "rsb r6, %[n], #32\n\t" + "ldrd r2, r3, [%[a]]\n\t" + "lsr r2, r2, %[n]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r2, [%[r], #0]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r3, [%[r], #4]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r4, [%[r], #8]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r2, [%[r], #12]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r3, [%[r], #16]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r4, [%[r], #20]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r2, [%[r], #24]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r3, [%[r], #28]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r4, [%[r], #32]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r2, [%[r], #36]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r3, [%[r], #40]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r4, [%[r], #44]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r2, [%[r], #48]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r3, [%[r], #52]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #64]\n\t" + "str r4, [%[r], #56]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "strd r2, r3, [%[r], #60]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -56008,97 +56105,6 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n) ); } -static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) -{ - __asm__ __volatile__ ( - "rsb r6, %[n], #32\n\t" - "ldrd r2, r3, [%[a]]\n\t" - "lsr r2, r2, %[n]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], 
#16]\n\t" - "str r4, [%[r], #8]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r3, [%[r], #40]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r4, [%[r], #44]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r2, [%[r], #48]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r3, [%[r], #52]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #64]\n\t" - "str r4, [%[r], #56]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "strd r2, r3, [%[r], #60]\n\t" - : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" - ); -} - #ifdef WOLFSSL_SP_SMALL /* Sub b from a into a. 
(a -= b) * @@ -56816,8 +56822,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -56855,6 +56861,9 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 17, priv); sp_521_from_bin(ctx->e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->e, ctx->e, 7); + } ctx->state = 4; break; } @@ -56991,8 +57000,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 17; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -57021,6 +57030,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 17, priv); sp_521_from_bin(e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(e, e, 7); + } + err = sp_521_calc_s_17(s, r, k, x, e, tmp); } @@ -57634,8 +57647,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 17, hash, (int)hashLen); @@ -57644,6 +57657,9 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 17, pX); sp_521_from_mp(ctx->p2.y, 17, pY); sp_521_from_mp(ctx->p2.z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->u1, ctx->u1, 7); + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -57795,8 +57811,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 17; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 17, hash, (int)hashLen); @@ -57806,6 +57822,10 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 17, pY); sp_521_from_mp(p2->z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(u1, u1, 7); + } + err = sp_521_calc_vfy_point_17(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 33941598b..bda5f232b 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -37174,7 +37174,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P256 by the scalar and return the result. @@ -37609,6 +37609,8 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -63031,7 +63033,7 @@ static int sp_384_ecc_mulmod_add_only_6(sp_point_384* r, const sp_point_384* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P384 by the scalar and return the result. 
@@ -63470,6 +63472,8 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -107875,7 +107879,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P521 by the scalar and return the result. @@ -108326,6 +108330,8 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -108619,8 +108625,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -108658,6 +108664,9 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 9, priv); sp_521_from_bin(ctx->e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->e, ctx->e, 7); + } ctx->state = 4; break; } @@ -108794,8 +108803,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 9; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -108824,6 +108833,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 9, priv); sp_521_from_bin(e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(e, e, 7); + } + err = sp_521_calc_s_9(s, r, k, x, e, tmp); } @@ -109199,8 +109212,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 9, hash, (int)hashLen); @@ -109209,6 +109222,9 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 9, pX); sp_521_from_mp(ctx->p2.y, 9, pY); sp_521_from_mp(ctx->p2.z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->u1, ctx->u1, 7); + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -109360,8 +109376,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 9; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 9, hash, (int)hashLen); @@ -109371,6 +109387,10 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 9, pY); sp_521_from_mp(p2->z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(u1, u1, 7); + } + err = sp_521_calc_vfy_point_9(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index d3bc95a4a..b482379a7 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -103265,6 +103265,8 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -113731,6 +113733,8 @@ int 
sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -127934,6 +127938,360 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +/* Right shift a by n bits into r. (r = a >> n) + * + * r A single precision integer. + * a A single precision integer. + * n Integer representing number of bits to shift. + */ +static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "movs r7, #32\n\t" +#ifdef WOLFSSL_KEIL + "subs r7, r7, %[n]\n\t" +#else +#ifdef __clang__ + "subs r7, r7, %[n]\n\t" +#else + "sub r7, r7, %[n]\n\t" +#endif +#endif + "ldr r3, [%[a]]\n\t" + "ldr r4, [%[a], #4]\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "ldr r5, [%[a], #8]\n\t" + "str r3, [%[r]]\n\t" + "movs r6, r5\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, r5, %[n]\n\t" +#else + "lsr r5, r5, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r4, r4, r6\n\t" +#elif defined(__clang__) + "orrs r4, r6\n\t" +#else + "orr r4, r6\n\t" +#endif + "ldr r3, [%[a], #12]\n\t" + "str r4, [%[r], #4]\n\t" + "movs r6, r3\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r5, r5, r6\n\t" +#elif defined(__clang__) + "orrs r5, r6\n\t" +#else + "orr r5, r6\n\t" +#endif + "ldr r4, [%[a], #16]\n\t" + "str r5, [%[r], #8]\n\t" + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "ldr r5, [%[a], #20]\n\t" + "str r3, [%[r], #12]\n\t" + "movs r6, r5\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, r5, %[n]\n\t" +#else + "lsr r5, r5, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r4, r4, r6\n\t" +#elif defined(__clang__) + "orrs r4, r6\n\t" +#else + "orr r4, r6\n\t" +#endif + "ldr r3, [%[a], #24]\n\t" + "str r4, [%[r], #16]\n\t" + "movs r6, r3\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r5, r5, r6\n\t" +#elif defined(__clang__) + "orrs r5, r6\n\t" +#else 
+ "orr r5, r6\n\t" +#endif + "ldr r4, [%[a], #28]\n\t" + "str r5, [%[r], #20]\n\t" + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "ldr r5, [%[a], #32]\n\t" + "str r3, [%[r], #24]\n\t" + "movs r6, r5\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, r5, %[n]\n\t" +#else + "lsr r5, r5, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r4, r4, r6\n\t" +#elif defined(__clang__) + "orrs r4, r6\n\t" +#else + "orr r4, r6\n\t" +#endif + "ldr r3, [%[a], #36]\n\t" + "str r4, [%[r], #28]\n\t" + "movs r6, r3\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r5, r5, r6\n\t" +#elif defined(__clang__) + "orrs r5, r6\n\t" +#else + "orr r5, r6\n\t" +#endif + "ldr r4, [%[a], #40]\n\t" + "str r5, [%[r], #32]\n\t" + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "ldr r5, [%[a], #44]\n\t" + "str r3, [%[r], #36]\n\t" + "movs r6, r5\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, r5, %[n]\n\t" +#else + "lsr r5, r5, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r4, r4, r6\n\t" +#elif defined(__clang__) + "orrs r4, r6\n\t" +#else + "orr r4, r6\n\t" +#endif + "ldr r3, [%[a], #48]\n\t" + "str r4, [%[r], #40]\n\t" + "movs r6, r3\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r5, r5, r6\n\t" +#elif defined(__clang__) + "orrs r5, r6\n\t" +#else + "orr r5, r6\n\t" +#endif + "ldr r4, [%[a], #52]\n\t" + "str r5, [%[r], #44]\n\t" + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "ldr r5, [%[a], #56]\n\t" + "str r3, [%[r], #48]\n\t" + "movs r6, r5\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r5, r5, %[n]\n\t" +#else + "lsr r5, r5, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r4, r4, r6\n\t" +#elif defined(__clang__) + "orrs r4, r6\n\t" +#else + "orr r4, r6\n\t" +#endif + "ldr r3, [%[a], #60]\n\t" + "str r4, [%[r], #52]\n\t" + "movs r6, r3\n\t" +#if defined(__clang__) || 
defined(WOLFSSL_KEIL) + "lsrs r3, r3, %[n]\n\t" +#else + "lsr r3, r3, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r5, r5, r6\n\t" +#elif defined(__clang__) + "orrs r5, r6\n\t" +#else + "orr r5, r6\n\t" +#endif + "ldr r4, [%[a], #64]\n\t" + "str r5, [%[r], #56]\n\t" + "movs r6, r4\n\t" +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsrs r4, r4, %[n]\n\t" +#else + "lsr r4, r4, %[n]\n\t" +#endif +#if defined(__clang__) || defined(WOLFSSL_KEIL) + "lsls r6, r6, r7\n\t" +#else + "lsl r6, r6, r7\n\t" +#endif +#ifdef WOLFSSL_KEIL + "orrs r3, r3, r6\n\t" +#elif defined(__clang__) + "orrs r3, r6\n\t" +#else + "orr r3, r6\n\t" +#endif + "str r3, [%[r], #60]\n\t" + "str r4, [%[r], #64]\n\t" + : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n) + : + : "memory", "r3", "r4", "r5", "r6", "r7" + ); +} + +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -129221,358 +129579,6 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n) ); } -/* Right shift a by n bits into r. (r = a >> n) - * - * r A single precision integer. - * a A single precision integer. - * n Integer representing number of bits to shift. - */ -static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) -{ - __asm__ __volatile__ ( - "movs r7, #32\n\t" -#ifdef WOLFSSL_KEIL - "subs r7, r7, %[n]\n\t" -#else -#ifdef __clang__ - "subs r7, r7, %[n]\n\t" -#else - "sub r7, r7, %[n]\n\t" -#endif -#endif - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "ldr r5, [%[a], #8]\n\t" - "str r3, [%[r]]\n\t" - "movs r6, r5\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r5, r5, %[n]\n\t" -#else - "lsr r5, r5, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r4, r4, r6\n\t" -#elif defined(__clang__) - "orrs r4, r6\n\t" -#else - "orr r4, r6\n\t" -#endif - "ldr r3, [%[a], #12]\n\t" - "str r4, [%[r], #4]\n\t" - "movs r6, r3\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r5, r5, r6\n\t" -#elif defined(__clang__) - "orrs r5, r6\n\t" -#else - "orr r5, r6\n\t" -#endif - "ldr r4, [%[a], #16]\n\t" - "str r5, [%[r], #8]\n\t" - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "ldr r5, [%[a], #20]\n\t" - "str r3, [%[r], #12]\n\t" - "movs r6, r5\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r5, r5, %[n]\n\t" -#else - "lsr 
r5, r5, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r4, r4, r6\n\t" -#elif defined(__clang__) - "orrs r4, r6\n\t" -#else - "orr r4, r6\n\t" -#endif - "ldr r3, [%[a], #24]\n\t" - "str r4, [%[r], #16]\n\t" - "movs r6, r3\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r5, r5, r6\n\t" -#elif defined(__clang__) - "orrs r5, r6\n\t" -#else - "orr r5, r6\n\t" -#endif - "ldr r4, [%[a], #28]\n\t" - "str r5, [%[r], #20]\n\t" - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "ldr r5, [%[a], #32]\n\t" - "str r3, [%[r], #24]\n\t" - "movs r6, r5\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r5, r5, %[n]\n\t" -#else - "lsr r5, r5, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r4, r4, r6\n\t" -#elif defined(__clang__) - "orrs r4, r6\n\t" -#else - "orr r4, r6\n\t" -#endif - "ldr r3, [%[a], #36]\n\t" - "str r4, [%[r], #28]\n\t" - "movs r6, r3\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r5, r5, r6\n\t" -#elif defined(__clang__) - "orrs r5, r6\n\t" -#else - "orr r5, r6\n\t" -#endif - "ldr r4, [%[a], #40]\n\t" - "str r5, [%[r], #32]\n\t" - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "ldr r5, [%[a], #44]\n\t" - "str r3, [%[r], #36]\n\t" - "movs r6, r5\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r5, r5, %[n]\n\t" -#else - "lsr r5, r5, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r4, r4, r6\n\t" -#elif defined(__clang__) - "orrs r4, r6\n\t" -#else - "orr r4, r6\n\t" -#endif - "ldr r3, [%[a], #48]\n\t" - "str r4, [%[r], #40]\n\t" - "movs r6, r3\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r5, r5, r6\n\t" -#elif defined(__clang__) - "orrs r5, r6\n\t" -#else - "orr r5, r6\n\t" -#endif - "ldr r4, [%[a], #52]\n\t" - "str r5, [%[r], #44]\n\t" - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif 
-#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "ldr r5, [%[a], #56]\n\t" - "str r3, [%[r], #48]\n\t" - "movs r6, r5\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r5, r5, %[n]\n\t" -#else - "lsr r5, r5, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r4, r4, r6\n\t" -#elif defined(__clang__) - "orrs r4, r6\n\t" -#else - "orr r4, r6\n\t" -#endif - "ldr r3, [%[a], #60]\n\t" - "str r4, [%[r], #52]\n\t" - "movs r6, r3\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r3, r3, %[n]\n\t" -#else - "lsr r3, r3, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r5, r5, r6\n\t" -#elif defined(__clang__) - "orrs r5, r6\n\t" -#else - "orr r5, r6\n\t" -#endif - "ldr r4, [%[a], #64]\n\t" - "str r5, [%[r], #56]\n\t" - "movs r6, r4\n\t" -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsrs r4, r4, %[n]\n\t" -#else - "lsr r4, r4, %[n]\n\t" -#endif -#if defined(__clang__) || defined(WOLFSSL_KEIL) - "lsls r6, r6, r7\n\t" -#else - "lsl r6, r6, r7\n\t" -#endif -#ifdef WOLFSSL_KEIL - "orrs r3, r3, r6\n\t" -#elif defined(__clang__) - "orrs r3, r6\n\t" -#else - "orr r3, r6\n\t" -#endif - "str r3, [%[r], #60]\n\t" - "str r4, [%[r], #64]\n\t" - : [r] "+l" (r), [a] "+l" (a), [n] "+l" (n) - : - : "memory", "r3", "r4", "r5", "r6", "r7" - ); -} - #ifdef WOLFSSL_SP_SMALL /* Sub b from a into a. (a -= b) * @@ -130892,8 +130898,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -130931,6 +130937,9 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 17, priv); sp_521_from_bin(ctx->e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->e, ctx->e, 7); + } ctx->state = 4; break; } @@ -131067,8 +131076,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 17; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -131097,6 +131106,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 17, priv); sp_521_from_bin(e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(e, e, 7); + } + err = sp_521_calc_s_17(s, r, k, x, e, tmp); } @@ -133705,8 +133718,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 17, hash, (int)hashLen); @@ -133715,6 +133728,9 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 17, pX); sp_521_from_mp(ctx->p2.y, 17, pY); sp_521_from_mp(ctx->p2.z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->u1, ctx->u1, 7); + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -133866,8 +133882,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 17; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 17, hash, (int)hashLen); @@ -133877,6 +133893,10 @@ int sp_ecc_verify_521(const byte* hash, 
word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 17, pY); sp_521_from_mp(p2->z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(u1, u1, 7); + } + err = sp_521_calc_vfy_point_17(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 6d6e334f4..0f4ed9320 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -17517,7 +17517,7 @@ SP_NOINLINE static void sp_4096_rshift_81(sp_digit* r, const sp_digit* a, { int i; - for (i=0; i<72; i += 8) { + for (i=0; i<80; i += 8) { r[i+0] = (a[i+0] >> n) | ((a[i+1] << (26 - n)) & 0x3ffffff); r[i+1] = (a[i+1] >> n) | ((a[i+2] << (26 - n)) & 0x3ffffff); r[i+2] = (a[i+2] >> n) | ((a[i+3] << (26 - n)) & 0x3ffffff); @@ -17527,14 +17527,6 @@ SP_NOINLINE static void sp_4096_rshift_81(sp_digit* r, const sp_digit* a, r[i+6] = (a[i+6] >> n) | ((a[i+7] << (26 - n)) & 0x3ffffff); r[i+7] = (a[i+7] >> n) | ((a[i+8] << (26 - n)) & 0x3ffffff); } - r[72] = (a[72] >> n) | ((a[73] << (26 - n)) & 0x3ffffff); - r[73] = (a[73] >> n) | ((a[74] << (26 - n)) & 0x3ffffff); - r[74] = (a[74] >> n) | ((a[75] << (26 - n)) & 0x3ffffff); - r[75] = (a[75] >> n) | ((a[76] << (26 - n)) & 0x3ffffff); - r[76] = (a[76] >> n) | ((a[77] << (26 - n)) & 0x3ffffff); - r[77] = (a[77] >> n) | ((a[78] << (26 - n)) & 0x3ffffff); - r[78] = (a[78] >> n) | ((a[79] << (26 - n)) & 0x3ffffff); - r[79] = (a[79] >> n) | ((a[80] << (26 - n)) & 0x3ffffff); r[80] = a[80] >> n; } @@ -25420,7 +25412,7 @@ SP_NOINLINE static void sp_256_rshift_9(sp_digit* r, const sp_digit* a, r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff; } #else - for (i=0; i<0; i += 8) { + for (i=0; i<8; i += 8) { r[i+0] = (a[i+0] >> n) | ((a[i+1] << (29 - n)) & 0x1fffffff); r[i+1] = (a[i+1] >> n) | ((a[i+2] << (29 - n)) & 0x1fffffff); r[i+2] = (a[i+2] >> n) | ((a[i+3] << (29 - n)) & 0x1fffffff); @@ -25430,14 +25422,6 @@ SP_NOINLINE static void sp_256_rshift_9(sp_digit* r, const sp_digit* a, r[i+6] = (a[i+6] >> n) | ((a[i+7] << (29 - n)) & 0x1fffffff); r[i+7] = (a[i+7] >> n) | ((a[i+8] << (29 - n)) & 0x1fffffff); } - r[0] = (a[0] >> n) | ((a[1] << (29 - n)) & 0x1fffffff); - r[1] = (a[1] >> n) | ((a[2] << (29 - n)) & 0x1fffffff); - r[2] = (a[2] >> n) | ((a[3] << (29 - n)) & 0x1fffffff); - r[3] = (a[3] >> n) | ((a[4] << (29 - n)) & 0x1fffffff); - r[4] = (a[4] >> n) | ((a[5] << (29 - n)) & 0x1fffffff); - r[5] = (a[5] >> n) | ((a[6] << (29 - n)) & 0x1fffffff); - r[6] = (a[6] >> n) | ((a[7] << (29 - n)) & 0x1fffffff); - r[7] = (a[7] >> n) | ((a[8] << (29 - n)) & 0x1fffffff); #endif /* WOLFSSL_SP_SMALL */ r[8] = a[8] >> n; } @@ -41230,6 +41214,34 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_521_rshift_21(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<20; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (25 - n))) & 0x1ffffff; + } +#else + for (i=0; i<16; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (25 - n)) & 0x1ffffff); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (25 - n)) & 0x1ffffff); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (25 - n)) & 0x1ffffff); + r[i+3] = (a[i+3] >> n) | ((a[i+4] << (25 - n)) & 0x1ffffff); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (25 - n)) & 0x1ffffff); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (25 - n)) & 0x1ffffff); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (25 - n)) & 0x1ffffff); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (25 - n)) & 0x1ffffff); + } + 
r[16] = (a[16] >> n) | ((a[17] << (25 - n)) & 0x1ffffff); + r[17] = (a[17] >> n) | ((a[18] << (25 - n)) & 0x1ffffff); + r[18] = (a[18] >> n) | ((a[19] << (25 - n)) & 0x1ffffff); + r[19] = (a[19] >> n) | ((a[20] << (25 - n)) & 0x1ffffff); +#endif /* WOLFSSL_SP_SMALL */ + r[20] = a[20] >> n; +} + #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) /* Multiply a by scalar b into r. (r = a * b) @@ -41738,8 +41750,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -41777,6 +41789,10 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 21, priv); sp_521_from_bin(ctx->e, 21, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_21(ctx->e, ctx->e, 7); + ctx->e[20] |= ((sp_digit)hash[0]) << 13; + } ctx->state = 4; break; } @@ -41913,8 +41929,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 21; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -41943,6 +41959,11 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 21, priv); sp_521_from_bin(e, 21, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_21(e, e, 7); + e[20] |= ((sp_digit)hash[0]) << 13; + } + err = sp_521_calc_s_21(s, r, k, x, e, tmp); } @@ -42292,8 +42313,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 21, hash, (int)hashLen); @@ -42302,6 +42323,10 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 21, pX); sp_521_from_mp(ctx->p2.y, 21, pY); sp_521_from_mp(ctx->p2.z, 21, pZ); + if (hashLen == 66U) { + sp_521_rshift_21(ctx->u1, ctx->u1, 7); + ctx->u1[20] |= ((sp_digit)hash[0]) << 13; + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -42453,8 +42478,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 21; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 21, hash, (int)hashLen); @@ -42464,6 +42489,11 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 21, pY); sp_521_from_mp(p2->z, 21, pZ); + if (hashLen == 66U) { + sp_521_rshift_21(u1, u1, 7); + u1[20] |= ((sp_digit)hash[0]) << 13; + } + err = sp_521_calc_vfy_point_21(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 58ce44572..dcf495917 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -41249,6 +41249,30 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_521_rshift_9(sp_digit* r, const sp_digit* a, + byte n) +{ + int i; + +#ifdef WOLFSSL_SP_SMALL + for (i=0; i<8; i++) { + r[i] = ((a[i] >> n) | (a[i + 1] << (58 - n))) & 0x3ffffffffffffffL; + } +#else + for (i=0; i<8; i += 8) { + r[i+0] = (a[i+0] >> n) | ((a[i+1] << (58 - n)) & 0x3ffffffffffffffL); + r[i+1] = (a[i+1] >> n) | ((a[i+2] << (58 - n)) & 0x3ffffffffffffffL); + r[i+2] = (a[i+2] >> n) | ((a[i+3] << (58 - n)) & 0x3ffffffffffffffL); + r[i+3] = (a[i+3] 
>> n) | ((a[i+4] << (58 - n)) & 0x3ffffffffffffffL); + r[i+4] = (a[i+4] >> n) | ((a[i+5] << (58 - n)) & 0x3ffffffffffffffL); + r[i+5] = (a[i+5] >> n) | ((a[i+6] << (58 - n)) & 0x3ffffffffffffffL); + r[i+6] = (a[i+6] >> n) | ((a[i+7] << (58 - n)) & 0x3ffffffffffffffL); + r[i+7] = (a[i+7] >> n) | ((a[i+8] << (58 - n)) & 0x3ffffffffffffffL); + } +#endif /* WOLFSSL_SP_SMALL */ + r[8] = a[8] >> n; +} + #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) /* Multiply a by scalar b into r. (r = a * b) @@ -41803,8 +41827,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -41842,6 +41866,10 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 9, priv); sp_521_from_bin(ctx->e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->e, ctx->e, 7); + ctx->e[8] |= ((sp_digit)hash[0]) << 49; + } ctx->state = 4; break; } @@ -41978,8 +42006,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 9; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -42008,6 +42036,11 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 9, priv); sp_521_from_bin(e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(e, e, 7); + e[8] |= ((sp_digit)hash[0]) << 49; + } + err = sp_521_calc_s_9(s, r, k, x, e, tmp); } @@ -42350,8 +42383,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 9, hash, (int)hashLen); @@ -42360,6 +42393,10 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 9, pX); sp_521_from_mp(ctx->p2.y, 9, pY); sp_521_from_mp(ctx->p2.z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->u1, ctx->u1, 7); + ctx->u1[8] |= ((sp_digit)hash[0]) << 49; + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -42511,8 +42548,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 9; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 9, hash, (int)hashLen); @@ -42522,6 +42559,11 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 9, pY); sp_521_from_mp(p2->z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(u1, u1, 7); + u1[8] |= ((sp_digit)hash[0]) << 49; + } + err = sp_521_calc_vfy_point_9(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index e829f2a91..e373269c9 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -22198,6 +22198,8 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -29364,6 +29366,8 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if 
defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -37686,6 +37690,100 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, } #endif /* HAVE_ECC_DHE */ +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +SP_NOINLINE static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) +{ + __asm__ __volatile__ ( + "mov r6, #32\n\t" + "sub r6, r6, %[n]\n\t" + "ldrd r2, r3, [%[a]]\n\t" + "lsr r2, r2, %[n]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #8]\n\t" + "str r2, [%[r], #0]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #12]\n\t" + "str r3, [%[r], #4]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #16]\n\t" + "str r4, [%[r], #8]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #20]\n\t" + "str r2, [%[r], #12]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #24]\n\t" + "str r3, [%[r], #16]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #28]\n\t" + "str r4, [%[r], #20]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #32]\n\t" + "str r2, [%[r], #24]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #36]\n\t" + "str r3, [%[r], #28]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #40]\n\t" + "str r4, [%[r], #32]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #44]\n\t" + "str r2, [%[r], #36]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #48]\n\t" + "str r3, [%[r], #40]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #52]\n\t" + "str r4, [%[r], #44]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "ldr r4, [%[a], #56]\n\t" + "str r2, [%[r], #48]\n\t" + "lsl r5, r4, r6\n\t" + "lsr r4, r4, %[n]\n\t" + "orr r3, r3, r5\n\t" + "ldr r2, [%[a], #60]\n\t" + "str r3, [%[r], #52]\n\t" + "lsl r5, r2, r6\n\t" + "lsr r2, r2, %[n]\n\t" + "orr r4, r4, r5\n\t" + "ldr r3, [%[a], #64]\n\t" + "str r4, [%[r], #56]\n\t" + "lsl r5, r3, r6\n\t" + "lsr r3, r3, %[n]\n\t" + "orr r2, r2, r5\n\t" + "strd r2, r3, [%[r], #60]\n\t" + : + : [r] "r" (r), [a] "r" (a), [n] "r" (n) + : "memory", "r2", "r3", "r4", "r5", "r6" + ); +} + +#endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) @@ -38017,98 +38115,6 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n) ); } -SP_NOINLINE static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) -{ - __asm__ __volatile__ ( - "mov r6, #32\n\t" - "sub r6, r6, %[n]\n\t" - "ldrd r2, r3, [%[a]]\n\t" - "lsr r2, r2, %[n]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r4, [%[r], #8]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, 
r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r3, [%[r], #40]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r4, [%[r], #44]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r2, [%[r], #48]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r3, [%[r], #52]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #64]\n\t" - "str r4, [%[r], #56]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "strd r2, r3, [%[r], #60]\n\t" - : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" - ); -} - #ifdef WOLFSSL_SP_SMALL /* Sub b from a into a. (a -= b) * @@ -38695,8 +38701,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -38734,6 +38740,9 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 17, priv); sp_521_from_bin(ctx->e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->e, ctx->e, 7); + } ctx->state = 4; break; } @@ -38870,8 +38879,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 17; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -38900,6 +38909,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 17, priv); sp_521_from_bin(e, 17, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_17(e, e, 7); + } + err = sp_521_calc_s_17(s, r, k, x, e, tmp); } @@ -39512,8 +39525,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 17, hash, (int)hashLen); @@ -39522,6 +39535,9 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 17, pX); sp_521_from_mp(ctx->p2.y, 17, pY); sp_521_from_mp(ctx->p2.z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(ctx->u1, ctx->u1, 7); + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -39673,8 +39689,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 17; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 17, hash, (int)hashLen); @@ -39684,6 +39700,10 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, 
sp_521_from_mp(p2->y, 17, pY); sp_521_from_mp(p2->z, 17, pZ); + if (hashLen == 66U) { + sp_521_rshift_17(u1, u1, 7); + } + err = sp_521_calc_vfy_point_17(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 0ef833fce..8826657b0 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -7057,7 +7057,13 @@ static const sp_digit p256_b[4] = { #endif extern void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b); +#ifdef HAVE_INTEL_AVX2 +extern void sp_256_mul_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* b); +#endif /* HAVE_INTEL_AVX2 */ extern void sp_256_sqr_4(sp_digit* r, const sp_digit* a); +#ifdef HAVE_INTEL_AVX2 +extern void sp_256_sqr_avx2_4(sp_digit* r, const sp_digit* a); +#endif /* HAVE_INTEL_AVX2 */ extern sp_digit sp_256_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b); extern sp_digit sp_256_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b); /* Multiply a number by Montgomery normalizer mod modulus (prime). @@ -22980,7 +22986,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P256 by the scalar and return the result. @@ -23114,7 +23120,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P256 by the scalar and return the result. @@ -23557,9 +23563,6 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -#ifdef HAVE_INTEL_AVX2 -extern void sp_256_mul_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* b); -#endif /* HAVE_INTEL_AVX2 */ #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) extern sp_digit sp_256_sub_in_place_4(sp_digit* a, const sp_digit* b); @@ -25656,7 +25659,13 @@ static const sp_digit p384_b[6] = { #endif extern void sp_384_mul_6(sp_digit* r, const sp_digit* a, const sp_digit* b); +#ifdef HAVE_INTEL_AVX2 +extern void sp_384_mul_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* b); +#endif /* HAVE_INTEL_AVX2 */ extern void sp_384_sqr_6(sp_digit* r, const sp_digit* a); +#ifdef HAVE_INTEL_AVX2 +extern void sp_384_sqr_avx2_6(sp_digit* r, const sp_digit* a); +#endif /* HAVE_INTEL_AVX2 */ extern sp_digit sp_384_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b); extern sp_digit sp_384_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b); /* Multiply a number by Montgomery normalizer mod modulus (prime). @@ -27198,7 +27207,6 @@ static int sp_384_ecc_mulmod_win_add_sub_6(sp_point_384* r, const sp_point_384* #ifdef HAVE_INTEL_AVX2 #define sp_384_mod_mul_norm_avx2_6 sp_384_mod_mul_norm_6 #ifdef HAVE_INTEL_AVX2 -extern void sp_384_mul_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* b); #define sp_384_mont_reduce_avx2_6 sp_384_mont_reduce_6 extern void sp_384_mont_reduce_order_avx2_6(sp_digit* a, const sp_digit* m, sp_digit mp); /* Multiply two Montgomery form numbers mod the modulus (prime). @@ -27219,7 +27227,6 @@ static void sp_384_mont_mul_avx2_6(sp_digit* r, const sp_digit* a, #endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 -extern void sp_384_sqr_avx2_6(sp_digit* r, const sp_digit* a); /* Square the Montgomery form number. (r = a * a mod m) * * r Result of squaring. 
@@ -47595,7 +47602,7 @@ static int sp_384_ecc_mulmod_add_only_6(sp_point_384* r, const sp_point_384* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P384 by the scalar and return the result. @@ -47729,7 +47736,7 @@ static int sp_384_ecc_mulmod_add_only_avx2_6(sp_point_384* r, const sp_point_384 XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P384 by the scalar and return the result. @@ -48172,8 +48179,6 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -#ifdef HAVE_INTEL_AVX2 -#endif /* HAVE_INTEL_AVX2 */ #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) extern sp_digit sp_384_sub_in_place_6(sp_digit* a, const sp_digit* b); @@ -50344,7 +50349,13 @@ static const sp_digit p521_b[9] = { #endif extern void sp_521_mul_9(sp_digit* r, const sp_digit* a, const sp_digit* b); +#ifdef HAVE_INTEL_AVX2 +extern void sp_521_mul_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* b); +#endif /* HAVE_INTEL_AVX2 */ extern void sp_521_sqr_9(sp_digit* r, const sp_digit* a); +#ifdef HAVE_INTEL_AVX2 +extern void sp_521_sqr_avx2_9(sp_digit* r, const sp_digit* a); +#endif /* HAVE_INTEL_AVX2 */ extern sp_digit sp_521_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b); extern sp_digit sp_521_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b); /* Multiply a number by Montgomery normalizer mod modulus (prime). @@ -88242,7 +88253,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P521 by the scalar and return the result. @@ -88376,7 +88387,7 @@ static int sp_521_ecc_mulmod_add_only_avx2_9(sp_point_521* r, const sp_point_521 XFREE(rt, heap, DYNAMIC_TYPE_ECC); #endif - return MP_OKAY; + return err; } /* Multiply the base point of P521 by the scalar and return the result. @@ -88821,14 +88832,11 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -#ifdef HAVE_INTEL_AVX2 -extern void sp_521_mul_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* b); -#endif /* HAVE_INTEL_AVX2 */ +extern void sp_521_rshift_9(sp_digit* r, const sp_digit* a, int n); #endif #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) extern void sp_521_lshift_9(sp_digit* r, const sp_digit* a, int n); extern void sp_521_lshift_18(sp_digit* r, const sp_digit* a, int n); -extern void sp_521_rshift_9(sp_digit* r, const sp_digit* a, int n); extern sp_digit sp_521_sub_in_place_9(sp_digit* a, const sp_digit* b); extern void sp_521_mul_d_9(sp_digit* r, const sp_digit* a, sp_digit b); extern void sp_521_mul_d_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit b); @@ -89169,7 +89177,6 @@ static void sp_521_mont_inv_order_9(sp_digit* r, const sp_digit* a, #endif /* HAVE_ECC_SIGN || (HAVE_ECC_VERIFY && WOLFSSL_SP_SMALL) */ #ifdef HAVE_INTEL_AVX2 -extern void sp_521_sqr_avx2_9(sp_digit* r, const sp_digit* a); /* Multiply two number mod the order of P521 curve. (r = a * b mod order) * * r Result of the multiplication. 
@@ -89469,8 +89476,8 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W case 0: /* INIT */ ctx->s = ctx->e; ctx->kInv = ctx->k; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } ctx->i = SP_ECC_MAX_SIG_GEN; @@ -89508,6 +89515,9 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W sp_521_from_mp(ctx->x, 9, priv); sp_521_from_bin(ctx->e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->e, ctx->e, 7); + } ctx->state = 4; break; } @@ -89647,8 +89657,8 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, tmp = e + 8 * 9; s = e; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } } @@ -89682,6 +89692,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, sp_521_from_mp(x, 9, priv); sp_521_from_bin(e, 9, hash, (int)hashLen); + if (hashLen == 66U) { + sp_521_rshift_9(e, e, 7); + } + err = sp_521_calc_s_9(s, r, k, x, e, tmp); } @@ -90006,8 +90020,8 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, switch (ctx->state) { case 0: /* INIT */ - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(ctx->u1, 9, hash, (int)hashLen); @@ -90016,6 +90030,9 @@ int sp_ecc_verify_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, sp_521_from_mp(ctx->p2.x, 9, pX); sp_521_from_mp(ctx->p2.y, 9, pY); sp_521_from_mp(ctx->p2.z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(ctx->u1, ctx->u1, 7); + } ctx->state = 1; break; case 1: /* NORMS0 */ @@ -90170,8 +90187,8 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, tmp = u1 + 6 * 9; p2 = p1 + 1; - if (hashLen > 65U) { - hashLen = 65U; + if (hashLen > 66U) { + hashLen = 66U; } sp_521_from_bin(u1, 9, hash, (int)hashLen); @@ -90181,6 +90198,10 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_521_from_mp(p2->y, 9, pY); sp_521_from_mp(p2->z, 9, pZ); + if (hashLen == 66U) { + sp_521_rshift_9(u1, u1, 7); + } + err = sp_521_calc_vfy_point_9(p1, p2, s, u1, u2, tmp, heap); } if (err == MP_OKAY) { diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index 7d5a2a022..10b5c98ba 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -38820,6 +38820,127 @@ _sp_256_mul_4: #ifndef __APPLE__ .size sp_256_mul_4,.-sp_256_mul_4 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Multiply a and b into r. (r = a * b) + * + * r Result of multiplication. + * a First number to multiply. + * b Second number to multiply. 
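+ * Requires BMI2 (mulx) and ADX (adcx/adox) instruction support.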
+ */ +#ifndef __APPLE__ +.text +.globl sp_256_mul_avx2_4 +.type sp_256_mul_avx2_4,@function +.align 16 +sp_256_mul_avx2_4: +#else +.section __TEXT,__text +.globl _sp_256_mul_avx2_4 +.p2align 4 +_sp_256_mul_avx2_4: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbp + movq %rdx, %rbp + # A[0] * B[0] + movq (%rbp), %rdx + mulxq (%rsi), %r8, %r9 + # A[2] * B[0] + mulxq 16(%rsi), %r10, %r11 + # A[1] * B[0] + mulxq 8(%rsi), %rax, %rcx + xorq %r15, %r15 + adcxq %rax, %r9 + # A[1] * B[3] + movq 24(%rbp), %rdx + mulxq 8(%rsi), %r12, %r13 + adcxq %rcx, %r10 + # A[0] * B[1] + movq 8(%rbp), %rdx + mulxq (%rsi), %rax, %rcx + adoxq %rax, %r9 + # A[2] * B[1] + mulxq 16(%rsi), %rax, %r14 + adoxq %rcx, %r10 + adcxq %rax, %r11 + # A[1] * B[2] + movq 16(%rbp), %rdx + mulxq 8(%rsi), %rax, %rcx + adcxq %r14, %r12 + adoxq %rax, %r11 + adcxq %r15, %r13 + adoxq %rcx, %r12 + # A[0] * B[2] + mulxq (%rsi), %rax, %rcx + adoxq %r15, %r13 + xorq %r14, %r14 + adcxq %rax, %r10 + # A[1] * B[1] + movq 8(%rbp), %rdx + mulxq 8(%rsi), %rdx, %rax + adcxq %rcx, %r11 + adoxq %rdx, %r10 + # A[3] * B[1] + movq 8(%rbp), %rdx + adoxq %rax, %r11 + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r12 + # A[2] * B[2] + movq 16(%rbp), %rdx + mulxq 16(%rsi), %rdx, %rax + adcxq %rcx, %r13 + adoxq %rdx, %r12 + # A[3] * B[3] + movq 24(%rbp), %rdx + adoxq %rax, %r13 + mulxq 24(%rsi), %rax, %rcx + adoxq %r15, %r14 + adcxq %rax, %r14 + # A[0] * B[3] + mulxq (%rsi), %rdx, %rax + adcxq %rcx, %r15 + xorq %rcx, %rcx + adcxq %rdx, %r11 + # A[3] * B[0] + movq 24(%rsi), %rdx + adcxq %rax, %r12 + mulxq (%rbp), %rbx, %rax + adoxq %rbx, %r11 + adoxq %rax, %r12 + # A[3] * B[2] + mulxq 16(%rbp), %rdx, %rax + adcxq %rdx, %r13 + # A[2] * B[3] + movq 24(%rbp), %rdx + adcxq %rax, %r14 + mulxq 16(%rsi), %rax, %rdx + adcxq %rcx, %r15 + adoxq %rax, %r13 + adoxq %rdx, %r14 + adoxq %rcx, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 32(%rdi) + movq %r13, 40(%rdi) + movq %r14, 48(%rdi) + movq %r15, 56(%rdi) + popq %rbp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size sp_256_mul_avx2_4,.-sp_256_mul_avx2_4 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -38942,6 +39063,101 @@ _sp_256_sqr_4: #ifndef __APPLE__ .size sp_256_sqr_4,.-sp_256_sqr_4 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Square a and put result in r. (r = a * a) + * + * r Result of squaring. + * a Number to square in Montgomery form. 
+ */ +#ifndef __APPLE__ +.text +.globl sp_256_sqr_avx2_4 +.type sp_256_sqr_avx2_4,@function +.align 16 +sp_256_sqr_avx2_4: +#else +.section __TEXT,__text +.globl _sp_256_sqr_avx2_4 +.p2align 4 +_sp_256_sqr_avx2_4: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + # A[0] * A[1] + movq (%rsi), %rdx + movq 16(%rsi), %r15 + mulxq 8(%rsi), %r9, %r10 + # A[0] * A[3] + mulxq 24(%rsi), %r11, %r12 + # A[2] * A[1] + movq %r15, %rdx + mulxq 8(%rsi), %rcx, %rbx + # A[2] * A[3] + mulxq 24(%rsi), %r13, %r14 + xorq %r15, %r15 + adoxq %rcx, %r11 + adoxq %rbx, %r12 + # A[2] * A[0] + mulxq (%rsi), %rcx, %rbx + # A[1] * A[3] + movq 8(%rsi), %rdx + adoxq %r15, %r13 + mulxq 24(%rsi), %rax, %r8 + adcxq %rcx, %r10 + adoxq %r15, %r14 + adcxq %rbx, %r11 + adcxq %rax, %r12 + adcxq %r8, %r13 + adcxq %r15, %r14 + # Double with Carry Flag + xorq %r15, %r15 + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %r8, %rax + adcxq %r9, %r9 + adcxq %r10, %r10 + adoxq %rax, %r9 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rcx, %rbx + adcxq %r11, %r11 + adoxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rbx, %r11 + adcxq %r13, %r13 + adoxq %rax, %r12 + adcxq %r14, %r14 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rbx + adoxq %rcx, %r13 + adcxq %r15, %r15 + adoxq %rax, %r14 + adoxq %rbx, %r15 + movq %r8, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) + movq %r12, 32(%rdi) + movq %r13, 40(%rdi) + movq %r14, 48(%rdi) + movq %r15, 56(%rdi) + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_256_sqr_avx2_4,.-sp_256_sqr_avx2_4 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -41168,127 +41384,6 @@ _sp_256_to_bin_movbe_4: .size sp_256_to_bin_movbe_4,.-sp_256_to_bin_movbe_4 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ -#ifdef HAVE_INTEL_AVX2 -/* Multiply a and b into r. (r = a * b) - * - * r Result of multiplication. - * a First number to multiply. - * b Second number to multiply. 
- */ -#ifndef __APPLE__ -.text -.globl sp_256_mul_avx2_4 -.type sp_256_mul_avx2_4,@function -.align 16 -sp_256_mul_avx2_4: -#else -.section __TEXT,__text -.globl _sp_256_mul_avx2_4 -.p2align 4 -_sp_256_mul_avx2_4: -#endif /* __APPLE__ */ - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rdx, %rbp - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 - adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 - adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] - movq 24(%rsi), %rdx - adcxq %rax, %r12 - mulxq (%rbp), %rbx, %rax - adoxq %rbx, %r11 - adoxq %rax, %r12 - # A[3] * B[2] - mulxq 16(%rbp), %rdx, %rax - adcxq %rdx, %r13 - # A[2] * B[3] - movq 24(%rbp), %rdx - adcxq %rax, %r14 - mulxq 16(%rsi), %rax, %rdx - adcxq %rcx, %r15 - adoxq %rax, %r13 - adoxq %rdx, %r14 - adoxq %rcx, %r15 - movq %r8, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, 32(%rdi) - movq %r13, 40(%rdi) - movq %r14, 48(%rdi) - movq %r15, 56(%rdi) - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - repz retq -#ifndef __APPLE__ -.size sp_256_mul_avx2_4,.-sp_256_mul_avx2_4 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -42720,6 +42815,230 @@ _sp_384_mul_6: #ifndef __APPLE__ .size sp_384_mul_6,.-sp_384_mul_6 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Multiply a and b into r. (r = a * b) + * + * r Result of multiplication. + * a First number to multiply. + * b Second number to multiply. 
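+ * The five low result words are staged on the stack and written to r last,
+ * so the routine stays correct when r aliases a or b.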
+ */ +#ifndef __APPLE__ +.text +.globl sp_384_mul_avx2_6 +.type sp_384_mul_avx2_6,@function +.align 16 +sp_384_mul_avx2_6: +#else +.section __TEXT,__text +.globl _sp_384_mul_avx2_6 +.p2align 4 +_sp_384_mul_avx2_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + movq %rdx, %rax + subq $40, %rsp + xorq %rbx, %rbx + movq (%rsi), %rdx + # A[0] * B[0] + mulxq (%rax), %r9, %r10 + # A[0] * B[1] + mulxq 8(%rax), %rcx, %r11 + adcxq %rcx, %r10 + # A[0] * B[2] + mulxq 16(%rax), %rcx, %r12 + adcxq %rcx, %r11 + # A[0] * B[3] + mulxq 24(%rax), %rcx, %r13 + adcxq %rcx, %r12 + # A[0] * B[4] + mulxq 32(%rax), %rcx, %r14 + adcxq %rcx, %r13 + # A[0] * B[5] + mulxq 40(%rax), %rcx, %r15 + adcxq %rcx, %r14 + adcxq %rbx, %r15 + movq %r9, (%rsp) + movq $0x00, %r9 + adcxq %rbx, %r9 + xorq %rbx, %rbx + movq 8(%rsi), %rdx + # A[1] * B[0] + mulxq (%rax), %rcx, %r8 + adcxq %rcx, %r10 + adoxq %r8, %r11 + # A[1] * B[1] + mulxq 8(%rax), %rcx, %r8 + adcxq %rcx, %r11 + adoxq %r8, %r12 + # A[1] * B[2] + mulxq 16(%rax), %rcx, %r8 + adcxq %rcx, %r12 + adoxq %r8, %r13 + # A[1] * B[3] + mulxq 24(%rax), %rcx, %r8 + adcxq %rcx, %r13 + adoxq %r8, %r14 + # A[1] * B[4] + mulxq 32(%rax), %rcx, %r8 + adcxq %rcx, %r14 + adoxq %r8, %r15 + # A[1] * B[5] + mulxq 40(%rax), %rcx, %r8 + adcxq %rcx, %r15 + adoxq %r8, %r9 + adcxq %rbx, %r9 + movq %r10, 8(%rsp) + movq $0x00, %r10 + adcxq %rbx, %r10 + adoxq %rbx, %r10 + xorq %rbx, %rbx + movq 16(%rsi), %rdx + # A[2] * B[0] + mulxq (%rax), %rcx, %r8 + adcxq %rcx, %r11 + adoxq %r8, %r12 + # A[2] * B[1] + mulxq 8(%rax), %rcx, %r8 + adcxq %rcx, %r12 + adoxq %r8, %r13 + # A[2] * B[2] + mulxq 16(%rax), %rcx, %r8 + adcxq %rcx, %r13 + adoxq %r8, %r14 + # A[2] * B[3] + mulxq 24(%rax), %rcx, %r8 + adcxq %rcx, %r14 + adoxq %r8, %r15 + # A[2] * B[4] + mulxq 32(%rax), %rcx, %r8 + adcxq %rcx, %r15 + adoxq %r8, %r9 + # A[2] * B[5] + mulxq 40(%rax), %rcx, %r8 + adcxq %rcx, %r9 + adoxq %r8, %r10 + adcxq %rbx, %r10 + movq %r11, 16(%rsp) + movq $0x00, %r11 + adcxq %rbx, %r11 + adoxq %rbx, %r11 + xorq %rbx, %rbx + movq 24(%rsi), %rdx + # A[3] * B[0] + mulxq (%rax), %rcx, %r8 + adcxq %rcx, %r12 + adoxq %r8, %r13 + # A[3] * B[1] + mulxq 8(%rax), %rcx, %r8 + adcxq %rcx, %r13 + adoxq %r8, %r14 + # A[3] * B[2] + mulxq 16(%rax), %rcx, %r8 + adcxq %rcx, %r14 + adoxq %r8, %r15 + # A[3] * B[3] + mulxq 24(%rax), %rcx, %r8 + adcxq %rcx, %r15 + adoxq %r8, %r9 + # A[3] * B[4] + mulxq 32(%rax), %rcx, %r8 + adcxq %rcx, %r9 + adoxq %r8, %r10 + # A[3] * B[5] + mulxq 40(%rax), %rcx, %r8 + adcxq %rcx, %r10 + adoxq %r8, %r11 + adcxq %rbx, %r11 + movq %r12, 24(%rsp) + movq $0x00, %r12 + adcxq %rbx, %r12 + adoxq %rbx, %r12 + xorq %rbx, %rbx + movq 32(%rsi), %rdx + # A[4] * B[0] + mulxq (%rax), %rcx, %r8 + adcxq %rcx, %r13 + adoxq %r8, %r14 + # A[4] * B[1] + mulxq 8(%rax), %rcx, %r8 + adcxq %rcx, %r14 + adoxq %r8, %r15 + # A[4] * B[2] + mulxq 16(%rax), %rcx, %r8 + adcxq %rcx, %r15 + adoxq %r8, %r9 + # A[4] * B[3] + mulxq 24(%rax), %rcx, %r8 + adcxq %rcx, %r9 + adoxq %r8, %r10 + # A[4] * B[4] + mulxq 32(%rax), %rcx, %r8 + adcxq %rcx, %r10 + adoxq %r8, %r11 + # A[4] * B[5] + mulxq 40(%rax), %rcx, %r8 + adcxq %rcx, %r11 + adoxq %r8, %r12 + adcxq %rbx, %r12 + movq %r13, 32(%rsp) + movq 40(%rsi), %rdx + # A[5] * B[0] + mulxq (%rax), %rcx, %r8 + adcxq %rcx, %r14 + adoxq %r8, %r15 + # A[5] * B[1] + mulxq 8(%rax), %rcx, %r8 + adcxq %rcx, %r15 + adoxq %r8, %r9 + # A[5] * B[2] + mulxq 16(%rax), %rcx, %r8 + adcxq %rcx, %r9 + adoxq %r8, %r10 + # A[5] * B[3] + mulxq 24(%rax), %rcx, %r8 + adcxq %rcx, %r10 + 
adoxq %r8, %r11 + # A[5] * B[4] + mulxq 32(%rax), %rcx, %r8 + adcxq %rcx, %r11 + adoxq %r8, %r12 + # A[5] * B[5] + mulxq 40(%rax), %rcx, %r13 + adcxq %rcx, %r12 + adoxq %rbx, %r13 + adcxq %rbx, %r13 + movq %r14, 40(%rdi) + movq %r15, 48(%rdi) + movq %r9, 56(%rdi) + movq %r10, 64(%rdi) + movq %r11, 72(%rdi) + movq %r12, 80(%rdi) + movq %r13, 88(%rdi) + movq (%rsp), %r9 + movq 8(%rsp), %r10 + movq 16(%rsp), %r11 + movq 24(%rsp), %r12 + movq 32(%rsp), %r13 + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq %r12, 24(%rdi) + movq %r13, 32(%rdi) + addq $40, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mul_avx2_6,.-sp_384_mul_avx2_6 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -42944,6 +43263,170 @@ _sp_384_sqr_6: #ifndef __APPLE__ .size sp_384_sqr_6,.-sp_384_sqr_6 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Square a and put result in r. (r = a * a) + * + * r Result of squaring. + * a Number to square in Montgomery form. + */ +#ifndef __APPLE__ +.text +.globl sp_384_sqr_avx2_6 +.type sp_384_sqr_avx2_6,@function +.align 16 +sp_384_sqr_avx2_6: +#else +.section __TEXT,__text +.globl _sp_384_sqr_avx2_6 +.p2align 4 +_sp_384_sqr_avx2_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + pushq %rbp + push %rdi + xorq %rdi, %rdi + movq (%rsi), %rdx + movq 8(%rsi), %r15 + movq 16(%rsi), %rbx + movq 24(%rsi), %rbp + # Diagonal 0 + # A[1] * A[0] + mulxq 8(%rsi), %r8, %r9 + # A[2] * A[0] + mulxq 16(%rsi), %rax, %r10 + adcxq %rax, %r9 + # A[3] * A[0] + mulxq 24(%rsi), %rax, %r11 + adcxq %rax, %r10 + # A[4] * A[0] + mulxq 32(%rsi), %rax, %r12 + adcxq %rax, %r11 + # A[5] * A[0] + mulxq 40(%rsi), %rax, %r13 + adcxq %rax, %r12 + adcxq %rdi, %r13 + # Diagonal 1 + movq %r15, %rdx + # A[2] * A[1] + mulxq 16(%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[3] * A[1] + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + # A[4] * A[1] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # A[5] * A[1] + mulxq 40(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rdi, %r14 + movq %rbx, %rdx + # A[5] * A[2] + mulxq 40(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rdi, %r15 + adcxq %rdi, %r15 + adcxq %rdi, %rbx + # Diagonal 2 + # A[3] * A[2] + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # A[4] * A[2] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + movq %rbp, %rdx + # A[4] * A[3] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # A[5] * A[3] + mulxq 40(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rdi, %rbx + movq 32(%rsi), %rdx + # A[5] * A[4] + mulxq 40(%rsi), %rax, %rbp + adcxq %rax, %rbx + adoxq %rdi, %rbp + adcxq %rdi, %rbp + adcxq %rdi, %rdi + # Doubling previous result as we add in square words results + # A[0] * A[0] + movq (%rsi), %rdx + mulxq %rdx, %rax, %rcx + pop %rdx + movq %rax, (%rdx) + adoxq %r8, %r8 + push %rdx + adcxq %rcx, %r8 + # A[1] * A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r9, %r9 + adcxq %rax, %r9 + adoxq %r10, %r10 + adcxq %rcx, %r10 + # A[2] * A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r11, %r11 + adcxq %rax, %r11 + adoxq %r12, %r12 + adcxq %rcx, %r12 + # A[3] * A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r13, %r13 + adcxq %rax, %r13 + adoxq %r14, %r14 + adcxq %rcx, %r14 + # A[4] * A[4] + movq 32(%rsi), %rdx + mulxq %rdx, 
%rax, %rcx + adoxq %r15, %r15 + adcxq %rax, %r15 + adoxq %rbx, %rbx + adcxq %rcx, %rbx + # A[5] * A[5] + movq 40(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %rbp, %rbp + adcxq %rax, %rbp + adcxq %rdi, %rcx + movq $0x00, %rax + adoxq %rax, %rcx + pop %rdi + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + movq %r15, 64(%rdi) + movq %rbx, 72(%rdi) + movq %rbp, 80(%rdi) + movq %rcx, 88(%rdi) + popq %rbp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_sqr_avx2_6,.-sp_384_sqr_avx2_6 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -43881,230 +44364,6 @@ L_384_get_point_33_avx2_6_start: #endif /* HAVE_INTEL_AVX2 */ #endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 -/* Multiply a and b into r. (r = a * b) - * - * r Result of multiplication. - * a First number to multiply. - * b Second number to multiply. - */ -#ifndef __APPLE__ -.text -.globl sp_384_mul_avx2_6 -.type sp_384_mul_avx2_6,@function -.align 16 -sp_384_mul_avx2_6: -#else -.section __TEXT,__text -.globl _sp_384_mul_avx2_6 -.p2align 4 -_sp_384_mul_avx2_6: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - movq %rdx, %rax - subq $40, %rsp - xorq %rbx, %rbx - movq (%rsi), %rdx - # A[0] * B[0] - mulxq (%rax), %r9, %r10 - # A[0] * B[1] - mulxq 8(%rax), %rcx, %r11 - adcxq %rcx, %r10 - # A[0] * B[2] - mulxq 16(%rax), %rcx, %r12 - adcxq %rcx, %r11 - # A[0] * B[3] - mulxq 24(%rax), %rcx, %r13 - adcxq %rcx, %r12 - # A[0] * B[4] - mulxq 32(%rax), %rcx, %r14 - adcxq %rcx, %r13 - # A[0] * B[5] - mulxq 40(%rax), %rcx, %r15 - adcxq %rcx, %r14 - adcxq %rbx, %r15 - movq %r9, (%rsp) - movq $0x00, %r9 - adcxq %rbx, %r9 - xorq %rbx, %rbx - movq 8(%rsi), %rdx - # A[1] * B[0] - mulxq (%rax), %rcx, %r8 - adcxq %rcx, %r10 - adoxq %r8, %r11 - # A[1] * B[1] - mulxq 8(%rax), %rcx, %r8 - adcxq %rcx, %r11 - adoxq %r8, %r12 - # A[1] * B[2] - mulxq 16(%rax), %rcx, %r8 - adcxq %rcx, %r12 - adoxq %r8, %r13 - # A[1] * B[3] - mulxq 24(%rax), %rcx, %r8 - adcxq %rcx, %r13 - adoxq %r8, %r14 - # A[1] * B[4] - mulxq 32(%rax), %rcx, %r8 - adcxq %rcx, %r14 - adoxq %r8, %r15 - # A[1] * B[5] - mulxq 40(%rax), %rcx, %r8 - adcxq %rcx, %r15 - adoxq %r8, %r9 - adcxq %rbx, %r9 - movq %r10, 8(%rsp) - movq $0x00, %r10 - adcxq %rbx, %r10 - adoxq %rbx, %r10 - xorq %rbx, %rbx - movq 16(%rsi), %rdx - # A[2] * B[0] - mulxq (%rax), %rcx, %r8 - adcxq %rcx, %r11 - adoxq %r8, %r12 - # A[2] * B[1] - mulxq 8(%rax), %rcx, %r8 - adcxq %rcx, %r12 - adoxq %r8, %r13 - # A[2] * B[2] - mulxq 16(%rax), %rcx, %r8 - adcxq %rcx, %r13 - adoxq %r8, %r14 - # A[2] * B[3] - mulxq 24(%rax), %rcx, %r8 - adcxq %rcx, %r14 - adoxq %r8, %r15 - # A[2] * B[4] - mulxq 32(%rax), %rcx, %r8 - adcxq %rcx, %r15 - adoxq %r8, %r9 - # A[2] * B[5] - mulxq 40(%rax), %rcx, %r8 - adcxq %rcx, %r9 - adoxq %r8, %r10 - adcxq %rbx, %r10 - movq %r11, 16(%rsp) - movq $0x00, %r11 - adcxq %rbx, %r11 - adoxq %rbx, %r11 - xorq %rbx, %rbx - movq 24(%rsi), %rdx - # A[3] * B[0] - mulxq (%rax), %rcx, %r8 - adcxq %rcx, %r12 - adoxq %r8, %r13 - # A[3] * B[1] - mulxq 8(%rax), %rcx, %r8 - adcxq %rcx, %r13 - adoxq %r8, %r14 - # A[3] * B[2] - mulxq 16(%rax), %rcx, %r8 - adcxq %rcx, %r14 - adoxq %r8, %r15 - # A[3] * B[3] - mulxq 24(%rax), %rcx, %r8 - adcxq %rcx, %r15 - adoxq %r8, %r9 - # A[3] * B[4] - mulxq 32(%rax), %rcx, %r8 - adcxq %rcx, %r9 - adoxq %r8, %r10 - # A[3] * 
B[5] - mulxq 40(%rax), %rcx, %r8 - adcxq %rcx, %r10 - adoxq %r8, %r11 - adcxq %rbx, %r11 - movq %r12, 24(%rsp) - movq $0x00, %r12 - adcxq %rbx, %r12 - adoxq %rbx, %r12 - xorq %rbx, %rbx - movq 32(%rsi), %rdx - # A[4] * B[0] - mulxq (%rax), %rcx, %r8 - adcxq %rcx, %r13 - adoxq %r8, %r14 - # A[4] * B[1] - mulxq 8(%rax), %rcx, %r8 - adcxq %rcx, %r14 - adoxq %r8, %r15 - # A[4] * B[2] - mulxq 16(%rax), %rcx, %r8 - adcxq %rcx, %r15 - adoxq %r8, %r9 - # A[4] * B[3] - mulxq 24(%rax), %rcx, %r8 - adcxq %rcx, %r9 - adoxq %r8, %r10 - # A[4] * B[4] - mulxq 32(%rax), %rcx, %r8 - adcxq %rcx, %r10 - adoxq %r8, %r11 - # A[4] * B[5] - mulxq 40(%rax), %rcx, %r8 - adcxq %rcx, %r11 - adoxq %r8, %r12 - adcxq %rbx, %r12 - movq %r13, 32(%rsp) - movq 40(%rsi), %rdx - # A[5] * B[0] - mulxq (%rax), %rcx, %r8 - adcxq %rcx, %r14 - adoxq %r8, %r15 - # A[5] * B[1] - mulxq 8(%rax), %rcx, %r8 - adcxq %rcx, %r15 - adoxq %r8, %r9 - # A[5] * B[2] - mulxq 16(%rax), %rcx, %r8 - adcxq %rcx, %r9 - adoxq %r8, %r10 - # A[5] * B[3] - mulxq 24(%rax), %rcx, %r8 - adcxq %rcx, %r10 - adoxq %r8, %r11 - # A[5] * B[4] - mulxq 32(%rax), %rcx, %r8 - adcxq %rcx, %r11 - adoxq %r8, %r12 - # A[5] * B[5] - mulxq 40(%rax), %rcx, %r13 - adcxq %rcx, %r12 - adoxq %rbx, %r13 - adcxq %rbx, %r13 - movq %r14, 40(%rdi) - movq %r15, 48(%rdi) - movq %r9, 56(%rdi) - movq %r10, 64(%rdi) - movq %r11, 72(%rdi) - movq %r12, 80(%rdi) - movq %r13, 88(%rdi) - movq (%rsp), %r9 - movq 8(%rsp), %r10 - movq 16(%rsp), %r11 - movq 24(%rsp), %r12 - movq 32(%rsp), %r13 - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq %r12, 24(%rdi) - movq %r13, 32(%rdi) - addq $40, %rsp - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_384_mul_avx2_6,.-sp_384_mul_avx2_6 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ -#ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 384 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -44435,170 +44694,6 @@ L_mont_loop_order_avx2_6: #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 -/* Square a and put result in r. (r = a * a) - * - * r Result of squaring. - * a Number to square in Montgomery form. 
- */ -#ifndef __APPLE__ -.text -.globl sp_384_sqr_avx2_6 -.type sp_384_sqr_avx2_6,@function -.align 16 -sp_384_sqr_avx2_6: -#else -.section __TEXT,__text -.globl _sp_384_sqr_avx2_6 -.p2align 4 -_sp_384_sqr_avx2_6: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - pushq %rbp - push %rdi - xorq %rdi, %rdi - movq (%rsi), %rdx - movq 8(%rsi), %r15 - movq 16(%rsi), %rbx - movq 24(%rsi), %rbp - # Diagonal 0 - # A[1] * A[0] - mulxq 8(%rsi), %r8, %r9 - # A[2] * A[0] - mulxq 16(%rsi), %rax, %r10 - adcxq %rax, %r9 - # A[3] * A[0] - mulxq 24(%rsi), %rax, %r11 - adcxq %rax, %r10 - # A[4] * A[0] - mulxq 32(%rsi), %rax, %r12 - adcxq %rax, %r11 - # A[5] * A[0] - mulxq 40(%rsi), %rax, %r13 - adcxq %rax, %r12 - adcxq %rdi, %r13 - # Diagonal 1 - movq %r15, %rdx - # A[2] * A[1] - mulxq 16(%rsi), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[3] * A[1] - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r12 - # A[4] * A[1] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # A[5] * A[1] - mulxq 40(%rsi), %rax, %r14 - adcxq %rax, %r13 - adoxq %rdi, %r14 - movq %rbx, %rdx - # A[5] * A[2] - mulxq 40(%rsi), %rax, %r15 - adcxq %rax, %r14 - adoxq %rdi, %r15 - adcxq %rdi, %r15 - adcxq %rdi, %rbx - # Diagonal 2 - # A[3] * A[2] - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # A[4] * A[2] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - movq %rbp, %rdx - # A[4] * A[3] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # A[5] * A[3] - mulxq 40(%rsi), %rax, %rbx - adcxq %rax, %r15 - adoxq %rdi, %rbx - movq 32(%rsi), %rdx - # A[5] * A[4] - mulxq 40(%rsi), %rax, %rbp - adcxq %rax, %rbx - adoxq %rdi, %rbp - adcxq %rdi, %rbp - adcxq %rdi, %rdi - # Doubling previous result as we add in square words results - # A[0] * A[0] - movq (%rsi), %rdx - mulxq %rdx, %rax, %rcx - pop %rdx - movq %rax, (%rdx) - adoxq %r8, %r8 - push %rdx - adcxq %rcx, %r8 - # A[1] * A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r9, %r9 - adcxq %rax, %r9 - adoxq %r10, %r10 - adcxq %rcx, %r10 - # A[2] * A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r11, %r11 - adcxq %rax, %r11 - adoxq %r12, %r12 - adcxq %rcx, %r12 - # A[3] * A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r13, %r13 - adcxq %rax, %r13 - adoxq %r14, %r14 - adcxq %rcx, %r14 - # A[4] * A[4] - movq 32(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r15, %r15 - adcxq %rax, %r15 - adoxq %rbx, %rbx - adcxq %rcx, %rbx - # A[5] * A[5] - movq 40(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %rbp, %rbp - adcxq %rax, %rbp - adcxq %rdi, %rcx - movq $0x00, %rax - adoxq %rax, %rcx - pop %rdi - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq %r10, 24(%rdi) - movq %r11, 32(%rdi) - movq %r12, 40(%rdi) - movq %r13, 48(%rdi) - movq %r14, 56(%rdi) - movq %r15, 64(%rdi) - movq %rbx, 72(%rdi) - movq %rbp, 80(%rdi) - movq %rcx, 88(%rdi) - popq %rbp - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_384_sqr_avx2_6,.-sp_384_sqr_avx2_6 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ -#ifdef HAVE_INTEL_AVX2 /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -46210,6 +46305,590 @@ _sp_521_mul_9: #ifndef __APPLE__ .size sp_521_mul_9,.-sp_521_mul_9 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Multiply a and b into r. (r = a * b) + * + * r Result of multiplication. + * a First number to multiply. + * b Second number to multiply. 
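+ * When r aliases a or b, the nine low result words are built in a stack
+ * buffer and copied into r only after all of a and b has been read.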
+ */ +#ifndef __APPLE__ +.text +.globl sp_521_mul_avx2_9 +.type sp_521_mul_avx2_9,@function +.align 16 +sp_521_mul_avx2_9: +#else +.section __TEXT,__text +.globl _sp_521_mul_avx2_9 +.p2align 4 +_sp_521_mul_avx2_9: +#endif /* __APPLE__ */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + movq %rdx, %rbp + subq $0x48, %rsp + cmpq %rdi, %rsi + movq %rsp, %rbx + cmovne %rdi, %rbx + cmpq %rdi, %rbp + cmove %rsp, %rbx + addq $0x48, %rdi + xorq %r13, %r13 + movq (%rsi), %rdx + # A[0] * B[0] + mulx (%rbp), %r8, %r9 + # A[0] * B[1] + mulx 8(%rbp), %rax, %r10 + movq %r8, (%rbx) + adcxq %rax, %r9 + # A[0] * B[2] + mulx 16(%rbp), %rax, %r11 + movq %r9, 8(%rbx) + adcxq %rax, %r10 + movq %r10, 16(%rbx) + # A[0] * B[3] + mulx 24(%rbp), %rax, %r8 + adcxq %rax, %r11 + # A[0] * B[4] + mulx 32(%rbp), %rax, %r9 + movq %r11, 24(%rbx) + adcxq %rax, %r8 + # A[0] * B[5] + mulx 40(%rbp), %rax, %r10 + movq %r8, 32(%rbx) + adcxq %rax, %r9 + movq %r9, 40(%rbx) + # A[0] * B[6] + mulx 48(%rbp), %rax, %r11 + adcxq %rax, %r10 + # A[0] * B[7] + mulx 56(%rbp), %rax, %r8 + movq %r10, 48(%rbx) + adcxq %rax, %r11 + # A[0] * B[8] + mulx 64(%rbp), %rax, %r9 + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adcxq %r13, %r9 + movq %r13, %r12 + adcxq %r13, %r12 + movq %r8, 64(%rbx) + movq %r9, (%rdi) + movq 8(%rsi), %rdx + movq 8(%rbx), %r9 + movq 16(%rbx), %r10 + movq 24(%rbx), %r11 + movq 32(%rbx), %r8 + # A[1] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[1] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r9, 8(%rbx) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[1] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r10, 16(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + movq %r11, 24(%rbx) + movq 40(%rbx), %r9 + movq 48(%rbx), %r10 + movq 56(%rbx), %r11 + # A[1] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[1] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r8, 32(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[1] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r9, 40(%rbx) + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 48(%rbx) + movq 64(%rbx), %r8 + movq (%rdi), %r9 + # A[1] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[1] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[1] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + movq %r13, %r10 + adcxq %rax, %r9 + adoxq %rcx, %r10 + adcxq %r12, %r10 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r9, (%rdi) + movq %r10, 8(%rdi) + movq 16(%rsi), %rdx + movq 16(%rbx), %r10 + movq 24(%rbx), %r11 + movq 32(%rbx), %r8 + movq 40(%rbx), %r9 + # A[2] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[2] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r10, 16(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[2] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r11, 24(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 32(%rbx) + movq 48(%rbx), %r10 + movq 56(%rbx), %r11 + movq 64(%rbx), %r8 + # A[2] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[2] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r9, 40(%rbx) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[2] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r10, 48(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + movq %r11, 56(%rbx) + movq (%rdi), %r9 + movq 8(%rdi), %r10 + # A[2] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[2] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[2] * B[8] + mulx 
64(%rbp), %rax, %rcx + movq %r9, (%rdi) + movq %r13, %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + adcxq %r12, %r11 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r10, 8(%rdi) + movq %r11, 16(%rdi) + movq 24(%rsi), %rdx + movq 24(%rbx), %r11 + movq 32(%rbx), %r8 + movq 40(%rbx), %r9 + movq 48(%rbx), %r10 + # A[3] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[3] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r11, 24(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[3] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r8, 32(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + movq %r9, 40(%rbx) + movq 56(%rbx), %r11 + movq 64(%rbx), %r8 + movq (%rdi), %r9 + # A[3] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[3] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r10, 48(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[3] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 64(%rbx) + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + # A[3] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[3] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r9, (%rdi) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[3] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r10, 8(%rdi) + movq %r13, %r8 + adcxq %rax, %r11 + adoxq %rcx, %r8 + adcxq %r12, %r8 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r11, 16(%rdi) + movq %r8, 24(%rdi) + movq 32(%rsi), %rdx + movq 32(%rbx), %r8 + movq 40(%rbx), %r9 + movq 48(%rbx), %r10 + movq 56(%rbx), %r11 + # A[4] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[4] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r8, 32(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[4] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r9, 40(%rbx) + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 48(%rbx) + movq 64(%rbx), %r8 + movq (%rdi), %r9 + movq 8(%rdi), %r10 + # A[4] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[4] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[4] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + movq %r9, (%rdi) + movq 16(%rdi), %r11 + movq 24(%rdi), %r8 + # A[4] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[4] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r10, 8(%rdi) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[4] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r11, 16(%rdi) + movq %r13, %r9 + adcxq %rax, %r8 + adoxq %rcx, %r9 + adcxq %r12, %r9 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r8, 24(%rdi) + movq %r9, 32(%rdi) + movq 40(%rsi), %rdx + movq 40(%rbx), %r9 + movq 48(%rbx), %r10 + movq 56(%rbx), %r11 + movq 64(%rbx), %r8 + # A[5] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[5] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r9, 40(%rbx) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[5] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r10, 48(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + movq %r11, 56(%rbx) + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + # A[5] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[5] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[5] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r9, (%rdi) + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 8(%rdi) + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + # A[5] * B[6] + mulx 48(%rbp), 
%rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[5] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r11, 16(%rdi) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[5] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r8, 24(%rdi) + movq %r13, %r10 + adcxq %rax, %r9 + adoxq %rcx, %r10 + adcxq %r12, %r10 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + movq 48(%rsi), %rdx + movq 48(%rbx), %r10 + movq 56(%rbx), %r11 + movq 64(%rbx), %r8 + movq (%rdi), %r9 + # A[6] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[6] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r10, 48(%rbx) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[6] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 64(%rbx) + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + movq 24(%rdi), %r8 + # A[6] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[6] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r9, (%rdi) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[6] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r10, 8(%rdi) + adcxq %rax, %r11 + adoxq %rcx, %r8 + movq %r11, 16(%rdi) + movq 32(%rdi), %r9 + movq 40(%rdi), %r10 + # A[6] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[6] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r8, 24(%rdi) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[6] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r9, 32(%rdi) + movq %r13, %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + adcxq %r12, %r11 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r10, 40(%rdi) + movq %r11, 48(%rdi) + movq 56(%rsi), %rdx + movq 56(%rbx), %r11 + movq 64(%rbx), %r8 + movq (%rdi), %r9 + movq 8(%rdi), %r10 + # A[7] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[7] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r11, 56(%rbx) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[7] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + movq %r9, (%rdi) + movq 16(%rdi), %r11 + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + # A[7] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[7] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r10, 8(%rdi) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[7] * B[5] + mulx 40(%rbp), %rax, %rcx + movq %r11, 16(%rdi) + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 24(%rdi) + movq 40(%rdi), %r10 + movq 48(%rdi), %r11 + # A[7] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[7] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r9, 32(%rdi) + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[7] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r10, 40(%rdi) + movq %r13, %r8 + adcxq %rax, %r11 + adoxq %rcx, %r8 + adcxq %r12, %r8 + movq %r13, %r12 + adoxq %r13, %r12 + adcxq %r13, %r12 + movq %r11, 48(%rdi) + movq %r8, 56(%rdi) + movq 64(%rsi), %rdx + movq 64(%rbx), %r8 + movq (%rdi), %r9 + movq 8(%rdi), %r10 + movq 16(%rdi), %r11 + # A[8] * B[0] + mulx (%rbp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[8] * B[1] + mulx 8(%rbp), %rax, %rcx + movq %r8, 64(%rbx) + adcxq %rax, %r9 + adoxq %rcx, %r10 + # A[8] * B[2] + mulx 16(%rbp), %rax, %rcx + movq %r9, (%rdi) + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 8(%rdi) + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + movq 40(%rdi), %r10 + # A[8] * B[3] + mulx 24(%rbp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[8] * B[4] + mulx 32(%rbp), %rax, %rcx + movq %r11, 16(%rdi) + adcxq %rax, %r8 + adoxq %rcx, %r9 + # A[8] * B[5] + mulx 
40(%rbp), %rax, %rcx + movq %r8, 24(%rdi) + adcxq %rax, %r9 + adoxq %rcx, %r10 + movq %r9, 32(%rdi) + movq 48(%rdi), %r11 + movq 56(%rdi), %r8 + # A[8] * B[6] + mulx 48(%rbp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + # A[8] * B[7] + mulx 56(%rbp), %rax, %rcx + movq %r10, 40(%rdi) + adcxq %rax, %r11 + adoxq %rcx, %r8 + # A[8] * B[8] + mulx 64(%rbp), %rax, %rcx + movq %r11, 48(%rdi) + movq %r13, %r9 + adcxq %rax, %r8 + adoxq %rcx, %r9 + adcxq %r12, %r9 + movq %r8, 56(%rdi) + movq %r9, 64(%rdi) + subq $0x48, %rdi + cmpq %rdi, %rsi + je L_start_521_mul_avx2_9 + cmpq %rdi, %rbp + jne L_end_521_mul_avx2_9 +L_start_521_mul_avx2_9: + vmovdqu (%rbx), %xmm0 + vmovups %xmm0, (%rdi) + vmovdqu 16(%rbx), %xmm0 + vmovups %xmm0, 16(%rdi) + vmovdqu 32(%rbx), %xmm0 + vmovups %xmm0, 32(%rdi) + vmovdqu 48(%rbx), %xmm0 + vmovups %xmm0, 48(%rdi) + movq 64(%rbx), %rax + movq %rax, 64(%rdi) +L_end_521_mul_avx2_9: + addq $0x48, %rsp + popq %r13 + popq %r12 + popq %rbp + popq %rbx + repz retq +#ifndef __APPLE__ +.size sp_521_mul_avx2_9,.-sp_521_mul_avx2_9 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Square a and put result in r. (r = a * a) * * r A single precision integer. @@ -46632,6 +47311,410 @@ _sp_521_sqr_9: #ifndef __APPLE__ .size sp_521_sqr_9,.-sp_521_sqr_9 #endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX2 +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +#ifndef __APPLE__ +.text +.globl sp_521_sqr_avx2_9 +.type sp_521_sqr_avx2_9,@function +.align 16 +sp_521_sqr_avx2_9: +#else +.section __TEXT,__text +.globl _sp_521_sqr_avx2_9 +.p2align 4 +_sp_521_sqr_avx2_9: +#endif /* __APPLE__ */ + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + subq $0x48, %rsp + cmpq %rdi, %rsi + movq %rsp, %rbp + cmovne %rdi, %rbp + addq $0x48, %rdi + xorq %r10, %r10 + # Diagonal 1 + # Zero into %r9 + # A[1] x A[0] + movq (%rsi), %rdx + mulxq 8(%rsi), %r8, %r9 + movq %r8, 8(%rbp) + # Zero into %r8 + # A[2] x A[0] + mulxq 16(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 16(%rbp) + # No load %r12 - %r9 + # A[3] x A[0] + mulxq 24(%rsi), %rax, %r12 + adcxq %rax, %r8 + adoxq %r10, %r12 + movq %r8, 24(%rbp) + # No load %r13 - %r8 + # A[4] x A[0] + mulxq 32(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %r10, %r13 + # No store %r12 - %r9 + # No load %r14 - %r9 + # A[5] x A[0] + mulxq 40(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %r10, %r14 + # No store %r13 - %r8 + # No load %r15 - %r8 + # A[6] x A[0] + mulxq 48(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %r10, %r15 + # No store %r14 - %r9 + # No load %rbx - %r9 + # A[7] x A[0] + mulxq 56(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %r10, %rbx + # No store %r15 - %r8 + # Zero into %r8 + # A[8] x A[0] + mulxq 64(%rsi), %rax, %r8 + adcxq %rax, %rbx + adoxq %r10, %r8 + # No store %rbx - %r9 + # Zero into %r9 + # A[8] x A[1] + movq 8(%rsi), %rdx + mulxq 64(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, (%rdi) + # Carry + adcxq %r10, %r9 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r9, 8(%rdi) + # Diagonal 2 + movq 24(%rbp), %r9 + # No load %r12 - %r8 + # A[2] x A[1] + mulxq 16(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r12 + movq %r9, 24(%rbp) + # No load %r13 - %r9 + # A[3] x A[1] + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + # No store %r12 - %r8 + # No load %r14 - %r8 + # A[4] x A[1] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + # No store %r13 - %r9 + # No load 
%r15 - %r9 + # A[5] x A[1] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r8 + # No load %rbx - %r8 + # A[6] x A[1] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r9 + movq (%rdi), %r9 + # A[7] x A[1] + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r9 + # No store %rbx - %r8 + movq 8(%rdi), %r8 + # A[7] x A[2] + movq 16(%rsi), %rdx + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, (%rdi) + # Zero into %r9 + # A[7] x A[3] + movq 24(%rsi), %rdx + mulxq 56(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 8(%rdi) + # Zero into %r8 + # A[7] x A[4] + movq 32(%rsi), %rdx + mulxq 56(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 16(%rdi) + # Carry + adcxq %r11, %r8 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r8, 24(%rdi) + # Diagonal 3 + # No load %r14 - %r9 + # A[3] x A[2] + movq 16(%rsi), %rdx + mulxq 24(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + # No store %r13 - %r8 + # No load %r15 - %r8 + # A[4] x A[2] + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + # No store %r14 - %r9 + # No load %rbx - %r9 + # A[5] x A[2] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r8 + movq (%rdi), %r8 + # A[6] x A[2] + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r8 + # No store %rbx - %r9 + movq 8(%rdi), %r9 + # A[6] x A[3] + movq 24(%rsi), %rdx + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, (%rdi) + movq 16(%rdi), %r8 + # A[6] x A[4] + movq 32(%rsi), %rdx + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 8(%rdi) + movq 24(%rdi), %r9 + # A[6] x A[5] + movq 40(%rsi), %rdx + mulxq 48(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 16(%rdi) + # Zero into %r8 + # A[8] x A[4] + movq 32(%rsi), %rdx + mulxq 64(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 24(%rdi) + # Zero into %r9 + # A[8] x A[5] + movq 40(%rsi), %rdx + mulxq 64(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 32(%rdi) + # Carry + adcxq %r11, %r9 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r9, 40(%rdi) + # Diagonal 4 + # No load %rbx - %r8 + # A[4] x A[3] + movq 24(%rsi), %rdx + mulxq 32(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + # No store %r15 - %r9 + movq (%rdi), %r9 + # A[5] x A[3] + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %r9 + # No store %rbx - %r8 + movq 8(%rdi), %r8 + # A[5] x A[4] + movq 32(%rsi), %rdx + mulxq 40(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, (%rdi) + movq 16(%rdi), %r9 + # A[8] x A[2] + movq 16(%rsi), %rdx + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 8(%rdi) + movq 24(%rdi), %r8 + # A[8] x A[3] + movq 24(%rsi), %rdx + mulxq 64(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 16(%rdi) + movq 32(%rdi), %r9 + # A[7] x A[5] + movq 40(%rsi), %rdx + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movq %r8, 24(%rdi) + movq 40(%rdi), %r8 + # A[7] x A[6] + movq 48(%rsi), %rdx + mulxq 56(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r8 + movq %r9, 32(%rdi) + # Zero into %r9 + # A[8] x A[6] + mulxq 64(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %r10, %r9 + movq %r8, 40(%rdi) + # Zero into %r8 + # A[8] x A[7] + movq 56(%rsi), %rdx + mulxq 64(%rsi), %rax, %r8 + adcxq %rax, %r9 + adoxq %r10, %r8 + movq %r9, 48(%rdi) + # Carry 
+ adcxq %r11, %r8 + movq %r10, %r11 + adcxq %r10, %r11 + adoxq %r10, %r11 + movq %r8, 56(%rdi) + movq %r11, 64(%rdi) + # Double and Add in A[i] x A[i] + movq 8(%rbp), %r9 + # A[0] x A[0] + movq (%rsi), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (%rbp) + adoxq %r9, %r9 + adcxq %rcx, %r9 + movq %r9, 8(%rbp) + movq 16(%rbp), %r8 + movq 24(%rbp), %r9 + # A[1] x A[1] + movq 8(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 16(%rbp) + movq %r9, 24(%rbp) + # A[2] x A[2] + movq 16(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r12, %r12 + adoxq %r13, %r13 + adcxq %rax, %r12 + adcxq %rcx, %r13 + # A[3] x A[3] + movq 24(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r14, %r14 + adoxq %r15, %r15 + adcxq %rax, %r14 + adcxq %rcx, %r15 + movq (%rdi), %r9 + # A[4] x A[4] + movq 32(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %rbx, %rbx + adoxq %r9, %r9 + adcxq %rax, %rbx + adcxq %rcx, %r9 + movq %r9, (%rdi) + movq 8(%rdi), %r8 + movq 16(%rdi), %r9 + # A[5] x A[5] + movq 40(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq 24(%rdi), %r8 + movq 32(%rdi), %r9 + # A[6] x A[6] + movq 48(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 24(%rdi) + movq %r9, 32(%rdi) + movq 40(%rdi), %r8 + movq 48(%rdi), %r9 + # A[7] x A[7] + movq 56(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 40(%rdi) + movq %r9, 48(%rdi) + movq 56(%rdi), %r8 + movq 64(%rdi), %r9 + # A[8] x A[8] + movq 64(%rsi), %rdx + mulxq %rdx, %rax, %rcx + adoxq %r8, %r8 + adoxq %r9, %r9 + adcxq %rax, %r8 + adcxq %rcx, %r9 + movq %r8, 56(%rdi) + movq %r9, 64(%rdi) + movq %r12, -40(%rdi) + movq %r13, -32(%rdi) + movq %r14, -24(%rdi) + movq %r15, -16(%rdi) + movq %rbx, -8(%rdi) + subq $0x48, %rdi + cmpq %rdi, %rsi + jne L_end_521_sqr_avx2_9 + vmovdqu (%rbp), %xmm0 + vmovups %xmm0, (%rdi) + vmovdqu 16(%rbp), %xmm0 + vmovups %xmm0, 16(%rdi) +L_end_521_sqr_avx2_9: + addq $0x48, %rsp + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + repz retq +#ifndef __APPLE__ +.size sp_521_sqr_avx2_9,.-sp_521_sqr_avx2_9 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX2 */ /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -50930,590 +52013,55 @@ _sp_521_to_bin_movbe_9: .size sp_521_to_bin_movbe_9,.-sp_521_to_bin_movbe_9 #endif /* __APPLE__ */ #endif /* NO_MOVBE_SUPPORT */ -#ifdef HAVE_INTEL_AVX2 -/* Multiply a and b into r. (r = a * b) +/* Shift number right by 1 bit. (r = a >> 1) * - * r Result of multiplication. - * a First number to multiply. - * b Second number to multiply. + * r Result of right shift by 1. + * a Number to shift. 
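+ * n Number of bits to shift (the P-521 sign/verify code above passes 7).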
*/ #ifndef __APPLE__ .text -.globl sp_521_mul_avx2_9 -.type sp_521_mul_avx2_9,@function +.globl sp_521_rshift_9 +.type sp_521_rshift_9,@function .align 16 -sp_521_mul_avx2_9: +sp_521_rshift_9: #else .section __TEXT,__text -.globl _sp_521_mul_avx2_9 +.globl _sp_521_rshift_9 .p2align 4 -_sp_521_mul_avx2_9: +_sp_521_rshift_9: #endif /* __APPLE__ */ - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - movq %rdx, %rbp - subq $0x48, %rsp - cmpq %rdi, %rsi - movq %rsp, %rbx - cmovne %rdi, %rbx - cmpq %rdi, %rbp - cmove %rsp, %rbx - addq $0x48, %rdi - xorq %r13, %r13 + movq %rdx, %rcx movq (%rsi), %rdx - # A[0] * B[0] - mulx (%rbp), %r8, %r9 - # A[0] * B[1] - mulx 8(%rbp), %rax, %r10 - movq %r8, (%rbx) - adcxq %rax, %r9 - # A[0] * B[2] - mulx 16(%rbp), %rax, %r11 - movq %r9, 8(%rbx) - adcxq %rax, %r10 - movq %r10, 16(%rbx) - # A[0] * B[3] - mulx 24(%rbp), %rax, %r8 - adcxq %rax, %r11 - # A[0] * B[4] - mulx 32(%rbp), %rax, %r9 - movq %r11, 24(%rbx) - adcxq %rax, %r8 - # A[0] * B[5] - mulx 40(%rbp), %rax, %r10 - movq %r8, 32(%rbx) - adcxq %rax, %r9 - movq %r9, 40(%rbx) - # A[0] * B[6] - mulx 48(%rbp), %rax, %r11 - adcxq %rax, %r10 - # A[0] * B[7] - mulx 56(%rbp), %rax, %r8 - movq %r10, 48(%rbx) - adcxq %rax, %r11 - # A[0] * B[8] - mulx 64(%rbp), %rax, %r9 - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adcxq %r13, %r9 - movq %r13, %r12 - adcxq %r13, %r12 - movq %r8, 64(%rbx) - movq %r9, (%rdi) - movq 8(%rsi), %rdx - movq 8(%rbx), %r9 - movq 16(%rbx), %r10 - movq 24(%rbx), %r11 - movq 32(%rbx), %r8 - # A[1] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[1] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r9, 8(%rbx) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[1] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r10, 16(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - movq %r11, 24(%rbx) - movq 40(%rbx), %r9 - movq 48(%rbx), %r10 - movq 56(%rbx), %r11 - # A[1] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[1] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r8, 32(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[1] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r9, 40(%rbx) - adcxq %rax, %r10 - adoxq %rcx, %r11 - movq %r10, 48(%rbx) - movq 64(%rbx), %r8 - movq (%rdi), %r9 - # A[1] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[1] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[1] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r8, 64(%rbx) - movq %r13, %r10 - adcxq %rax, %r9 - adoxq %rcx, %r10 - adcxq %r12, %r10 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r9, (%rdi) - movq %r10, 8(%rdi) - movq 16(%rsi), %rdx - movq 16(%rbx), %r10 - movq 24(%rbx), %r11 - movq 32(%rbx), %r8 - movq 40(%rbx), %r9 - # A[2] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[2] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r10, 16(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[2] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r11, 24(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 32(%rbx) - movq 48(%rbx), %r10 - movq 56(%rbx), %r11 - movq 64(%rbx), %r8 - # A[2] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[2] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r9, 40(%rbx) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[2] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r10, 48(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - movq %r11, 56(%rbx) - movq (%rdi), %r9 - movq 8(%rdi), %r10 - # A[2] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # 
A[2] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r8, 64(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[2] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r9, (%rdi) - movq %r13, %r11 - adcxq %rax, %r10 - adoxq %rcx, %r11 - adcxq %r12, %r11 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r10, 8(%rdi) - movq %r11, 16(%rdi) - movq 24(%rsi), %rdx - movq 24(%rbx), %r11 - movq 32(%rbx), %r8 - movq 40(%rbx), %r9 - movq 48(%rbx), %r10 - # A[3] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[3] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r11, 24(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[3] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r8, 32(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - movq %r9, 40(%rbx) - movq 56(%rbx), %r11 - movq 64(%rbx), %r8 - movq (%rdi), %r9 - # A[3] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[3] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r10, 48(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[3] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 64(%rbx) - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - # A[3] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[3] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r9, (%rdi) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[3] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r10, 8(%rdi) - movq %r13, %r8 - adcxq %rax, %r11 - adoxq %rcx, %r8 - adcxq %r12, %r8 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r11, 16(%rdi) - movq %r8, 24(%rdi) - movq 32(%rsi), %rdx - movq 32(%rbx), %r8 - movq 40(%rbx), %r9 - movq 48(%rbx), %r10 - movq 56(%rbx), %r11 - # A[4] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[4] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r8, 32(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[4] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r9, 40(%rbx) - adcxq %rax, %r10 - adoxq %rcx, %r11 - movq %r10, 48(%rbx) - movq 64(%rbx), %r8 - movq (%rdi), %r9 - movq 8(%rdi), %r10 - # A[4] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[4] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[4] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r8, 64(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - movq %r9, (%rdi) - movq 16(%rdi), %r11 - movq 24(%rdi), %r8 - # A[4] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[4] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r10, 8(%rdi) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[4] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r11, 16(%rdi) - movq %r13, %r9 - adcxq %rax, %r8 - adoxq %rcx, %r9 - adcxq %r12, %r9 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r8, 24(%rdi) - movq %r9, 32(%rdi) - movq 40(%rsi), %rdx - movq 40(%rbx), %r9 - movq 48(%rbx), %r10 - movq 56(%rbx), %r11 - movq 64(%rbx), %r8 - # A[5] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[5] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r9, 40(%rbx) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[5] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r10, 48(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - movq %r11, 56(%rbx) - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - # A[5] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[5] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r8, 64(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[5] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r9, (%rdi) - adcxq %rax, 
%r10 - adoxq %rcx, %r11 - movq %r10, 8(%rdi) - movq 24(%rdi), %r8 - movq 32(%rdi), %r9 - # A[5] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[5] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r11, 16(%rdi) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[5] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r8, 24(%rdi) - movq %r13, %r10 - adcxq %rax, %r9 - adoxq %rcx, %r10 - adcxq %r12, %r10 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r9, 32(%rdi) - movq %r10, 40(%rdi) - movq 48(%rsi), %rdx - movq 48(%rbx), %r10 - movq 56(%rbx), %r11 - movq 64(%rbx), %r8 - movq (%rdi), %r9 - # A[6] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[6] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r10, 48(%rbx) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[6] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 64(%rbx) - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - movq 24(%rdi), %r8 - # A[6] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[6] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r9, (%rdi) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[6] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r10, 8(%rdi) - adcxq %rax, %r11 - adoxq %rcx, %r8 - movq %r11, 16(%rdi) - movq 32(%rdi), %r9 - movq 40(%rdi), %r10 - # A[6] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[6] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r8, 24(%rdi) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[6] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r9, 32(%rdi) - movq %r13, %r11 - adcxq %rax, %r10 - adoxq %rcx, %r11 - adcxq %r12, %r11 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r10, 40(%rdi) - movq %r11, 48(%rdi) - movq 56(%rsi), %rdx - movq 56(%rbx), %r11 - movq 64(%rbx), %r8 - movq (%rdi), %r9 - movq 8(%rdi), %r10 - # A[7] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[7] * B[1] - mulx 8(%rbp), %rax, %rcx - movq %r11, 56(%rbx) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[7] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r8, 64(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - movq %r9, (%rdi) - movq 16(%rdi), %r11 - movq 24(%rdi), %r8 - movq 32(%rdi), %r9 - # A[7] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[7] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r10, 8(%rdi) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[7] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r11, 16(%rdi) - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 24(%rdi) - movq 40(%rdi), %r10 - movq 48(%rdi), %r11 - # A[7] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[7] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r9, 32(%rdi) - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[7] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r10, 40(%rdi) - movq %r13, %r8 - adcxq %rax, %r11 - adoxq %rcx, %r8 - adcxq %r12, %r8 - movq %r13, %r12 - adoxq %r13, %r12 - adcxq %r13, %r12 - movq %r11, 48(%rdi) - movq %r8, 56(%rdi) + movq 8(%rsi), %rax + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq 32(%rsi), %r10 + shrdq %cl, %rax, %rdx + shrdq %cl, %r8, %rax + shrdq %cl, %r9, %r8 + shrdq %cl, %r10, %r9 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + movq 40(%rsi), %rax + movq 48(%rsi), %r8 + movq 56(%rsi), %r9 movq 64(%rsi), %rdx - movq 64(%rbx), %r8 - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - # A[8] * B[0] - mulx (%rbp), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[8] * B[1] - mulx 8(%rbp), %rax, %rcx - movq 
%r8, 64(%rbx) - adcxq %rax, %r9 - adoxq %rcx, %r10 - # A[8] * B[2] - mulx 16(%rbp), %rax, %rcx - movq %r9, (%rdi) - adcxq %rax, %r10 - adoxq %rcx, %r11 - movq %r10, 8(%rdi) - movq 24(%rdi), %r8 - movq 32(%rdi), %r9 - movq 40(%rdi), %r10 - # A[8] * B[3] - mulx 24(%rbp), %rax, %rcx - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[8] * B[4] - mulx 32(%rbp), %rax, %rcx - movq %r11, 16(%rdi) - adcxq %rax, %r8 - adoxq %rcx, %r9 - # A[8] * B[5] - mulx 40(%rbp), %rax, %rcx - movq %r8, 24(%rdi) - adcxq %rax, %r9 - adoxq %rcx, %r10 - movq %r9, 32(%rdi) - movq 48(%rdi), %r11 - movq 56(%rdi), %r8 - # A[8] * B[6] - mulx 48(%rbp), %rax, %rcx - adcxq %rax, %r10 - adoxq %rcx, %r11 - # A[8] * B[7] - mulx 56(%rbp), %rax, %rcx - movq %r10, 40(%rdi) - adcxq %rax, %r11 - adoxq %rcx, %r8 - # A[8] * B[8] - mulx 64(%rbp), %rax, %rcx - movq %r11, 48(%rdi) - movq %r13, %r9 - adcxq %rax, %r8 - adoxq %rcx, %r9 - adcxq %r12, %r9 - movq %r8, 56(%rdi) - movq %r9, 64(%rdi) - subq $0x48, %rdi - cmpq %rdi, %rsi - je L_start_521_mul_avx2_9 - cmpq %rdi, %rbp - jne L_end_521_mul_avx2_9 -L_start_521_mul_avx2_9: - vmovdqu (%rbx), %xmm0 - vmovups %xmm0, (%rdi) - vmovdqu 16(%rbx), %xmm0 - vmovups %xmm0, 16(%rdi) - vmovdqu 32(%rbx), %xmm0 - vmovups %xmm0, 32(%rdi) - vmovdqu 48(%rbx), %xmm0 - vmovups %xmm0, 48(%rdi) - movq 64(%rbx), %rax - movq %rax, 64(%rdi) -L_end_521_mul_avx2_9: - addq $0x48, %rsp - popq %r13 - popq %r12 - popq %rbp - popq %rbx + shrdq %cl, %rax, %r10 + shrdq %cl, %r8, %rax + shrdq %cl, %r9, %r8 + shrdq %cl, %rdx, %r9 + movq %r10, 32(%rdi) + movq %rax, 40(%rdi) + movq %r8, 48(%rdi) + movq %r9, 56(%rdi) + shrq %cl, %rdx + movq %rdx, 64(%rdi) repz retq #ifndef __APPLE__ -.size sp_521_mul_avx2_9,.-sp_521_mul_avx2_9 +.size sp_521_rshift_9,.-sp_521_rshift_9 #endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ /* Shift number left by n bit. (r = a << n) * * r Result of left shift by n. @@ -51647,55 +52195,6 @@ _sp_521_lshift_18: #ifndef __APPLE__ .size sp_521_lshift_18,.-sp_521_lshift_18 #endif /* __APPLE__ */ -/* Shift number right by 1 bit. (r = a >> 1) - * - * r Result of right shift by 1. - * a Number to shift. - */ -#ifndef __APPLE__ -.text -.globl sp_521_rshift_9 -.type sp_521_rshift_9,@function -.align 16 -sp_521_rshift_9: -#else -.section __TEXT,__text -.globl _sp_521_rshift_9 -.p2align 4 -_sp_521_rshift_9: -#endif /* __APPLE__ */ - movq %rdx, %rcx - movq (%rsi), %rdx - movq 8(%rsi), %rax - movq 16(%rsi), %r8 - movq 24(%rsi), %r9 - movq 32(%rsi), %r10 - shrdq %cl, %rax, %rdx - shrdq %cl, %r8, %rax - shrdq %cl, %r9, %r8 - shrdq %cl, %r10, %r9 - movq %rdx, (%rdi) - movq %rax, 8(%rdi) - movq %r8, 16(%rdi) - movq %r9, 24(%rdi) - movq 40(%rsi), %rax - movq 48(%rsi), %r8 - movq 56(%rsi), %r9 - movq 64(%rsi), %rdx - shrdq %cl, %rax, %r10 - shrdq %cl, %r8, %rax - shrdq %cl, %r9, %r8 - shrdq %cl, %rdx, %r9 - movq %r10, 32(%rdi) - movq %rax, 40(%rdi) - movq %r8, 48(%rdi) - movq %r9, 56(%rdi) - shrq %cl, %rdx - movq %rdx, 64(%rdi) - repz retq -#ifndef __APPLE__ -.size sp_521_rshift_9,.-sp_521_rshift_9 -#endif /* __APPLE__ */ /* Sub b from a into a. (a -= b) * * a A single precision integer and result. @@ -51948,410 +52447,6 @@ _div_521_word_asm_9: .size div_521_word_asm_9,.-div_521_word_asm_9 #endif /* __APPLE__ */ #endif /* _WIN64 */ -#ifdef HAVE_INTEL_AVX2 -/* Square a and put result in r. (r = a * a) - * - * r A single precision integer. - * a A single precision integer. 
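The sp_521_rshift_9 body above is just a shrd/shr chain that moves the whole 9-limb value down by n bits. For readers not fluent in x86-64, here is a minimal C sketch of the same word-array shift; the function name is hypothetical, the operand is fixed at nine 64-bit limbs in little-endian limb order, and the shift count is assumed to satisfy 0 < n < 64:

    #include <stdint.h>

    /* Illustrative only: shift a nine-limb number right by n bits.
     * Each result limb takes its low bits from a[i] and the vacated high
     * bits from a[i+1], which is exactly what the shrdq instructions do. */
    static void rshift_9_words(uint64_t* r, const uint64_t* a, unsigned n)
    {
        int i;
        for (i = 0; i < 8; i++)
            r[i] = (a[i] >> n) | (a[i + 1] << (64 - n));
        r[8] = a[8] >> n;   /* top limb: nothing above it to shift in */
    }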
- */ -#ifndef __APPLE__ -.text -.globl sp_521_sqr_avx2_9 -.type sp_521_sqr_avx2_9,@function -.align 16 -sp_521_sqr_avx2_9: -#else -.section __TEXT,__text -.globl _sp_521_sqr_avx2_9 -.p2align 4 -_sp_521_sqr_avx2_9: -#endif /* __APPLE__ */ - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbx - subq $0x48, %rsp - cmpq %rdi, %rsi - movq %rsp, %rbp - cmovne %rdi, %rbp - addq $0x48, %rdi - xorq %r10, %r10 - # Diagonal 1 - # Zero into %r9 - # A[1] x A[0] - movq (%rsi), %rdx - mulxq 8(%rsi), %r8, %r9 - movq %r8, 8(%rbp) - # Zero into %r8 - # A[2] x A[0] - mulxq 16(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 16(%rbp) - # No load %r12 - %r9 - # A[3] x A[0] - mulxq 24(%rsi), %rax, %r12 - adcxq %rax, %r8 - adoxq %r10, %r12 - movq %r8, 24(%rbp) - # No load %r13 - %r8 - # A[4] x A[0] - mulxq 32(%rsi), %rax, %r13 - adcxq %rax, %r12 - adoxq %r10, %r13 - # No store %r12 - %r9 - # No load %r14 - %r9 - # A[5] x A[0] - mulxq 40(%rsi), %rax, %r14 - adcxq %rax, %r13 - adoxq %r10, %r14 - # No store %r13 - %r8 - # No load %r15 - %r8 - # A[6] x A[0] - mulxq 48(%rsi), %rax, %r15 - adcxq %rax, %r14 - adoxq %r10, %r15 - # No store %r14 - %r9 - # No load %rbx - %r9 - # A[7] x A[0] - mulxq 56(%rsi), %rax, %rbx - adcxq %rax, %r15 - adoxq %r10, %rbx - # No store %r15 - %r8 - # Zero into %r8 - # A[8] x A[0] - mulxq 64(%rsi), %rax, %r8 - adcxq %rax, %rbx - adoxq %r10, %r8 - # No store %rbx - %r9 - # Zero into %r9 - # A[8] x A[1] - movq 8(%rsi), %rdx - mulxq 64(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, (%rdi) - # Carry - adcxq %r10, %r9 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r9, 8(%rdi) - # Diagonal 2 - movq 24(%rbp), %r9 - # No load %r12 - %r8 - # A[2] x A[1] - mulxq 16(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r12 - movq %r9, 24(%rbp) - # No load %r13 - %r9 - # A[3] x A[1] - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r12 - adoxq %rcx, %r13 - # No store %r12 - %r8 - # No load %r14 - %r8 - # A[4] x A[1] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - # No store %r13 - %r9 - # No load %r15 - %r9 - # A[5] x A[1] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r8 - # No load %rbx - %r8 - # A[6] x A[1] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r9 - movq (%rdi), %r9 - # A[7] x A[1] - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r9 - # No store %rbx - %r8 - movq 8(%rdi), %r8 - # A[7] x A[2] - movq 16(%rsi), %rdx - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, (%rdi) - # Zero into %r9 - # A[7] x A[3] - movq 24(%rsi), %rdx - mulxq 56(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 8(%rdi) - # Zero into %r8 - # A[7] x A[4] - movq 32(%rsi), %rdx - mulxq 56(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 16(%rdi) - # Carry - adcxq %r11, %r8 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r8, 24(%rdi) - # Diagonal 3 - # No load %r14 - %r9 - # A[3] x A[2] - movq 16(%rsi), %rdx - mulxq 24(%rsi), %rax, %rcx - adcxq %rax, %r13 - adoxq %rcx, %r14 - # No store %r13 - %r8 - # No load %r15 - %r8 - # A[4] x A[2] - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r14 - adoxq %rcx, %r15 - # No store %r14 - %r9 - # No load %rbx - %r9 - # A[5] x A[2] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r8 - movq (%rdi), %r8 - # A[6] x A[2] - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r8 - # No store %rbx 
- %r9 - movq 8(%rdi), %r9 - # A[6] x A[3] - movq 24(%rsi), %rdx - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, (%rdi) - movq 16(%rdi), %r8 - # A[6] x A[4] - movq 32(%rsi), %rdx - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 8(%rdi) - movq 24(%rdi), %r9 - # A[6] x A[5] - movq 40(%rsi), %rdx - mulxq 48(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 16(%rdi) - # Zero into %r8 - # A[8] x A[4] - movq 32(%rsi), %rdx - mulxq 64(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 24(%rdi) - # Zero into %r9 - # A[8] x A[5] - movq 40(%rsi), %rdx - mulxq 64(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 32(%rdi) - # Carry - adcxq %r11, %r9 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r9, 40(%rdi) - # Diagonal 4 - # No load %rbx - %r8 - # A[4] x A[3] - movq 24(%rsi), %rdx - mulxq 32(%rsi), %rax, %rcx - adcxq %rax, %r15 - adoxq %rcx, %rbx - # No store %r15 - %r9 - movq (%rdi), %r9 - # A[5] x A[3] - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %rbx - adoxq %rcx, %r9 - # No store %rbx - %r8 - movq 8(%rdi), %r8 - # A[5] x A[4] - movq 32(%rsi), %rdx - mulxq 40(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, (%rdi) - movq 16(%rdi), %r9 - # A[8] x A[2] - movq 16(%rsi), %rdx - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 8(%rdi) - movq 24(%rdi), %r8 - # A[8] x A[3] - movq 24(%rsi), %rdx - mulxq 64(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 16(%rdi) - movq 32(%rdi), %r9 - # A[7] x A[5] - movq 40(%rsi), %rdx - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r8 - adoxq %rcx, %r9 - movq %r8, 24(%rdi) - movq 40(%rdi), %r8 - # A[7] x A[6] - movq 48(%rsi), %rdx - mulxq 56(%rsi), %rax, %rcx - adcxq %rax, %r9 - adoxq %rcx, %r8 - movq %r9, 32(%rdi) - # Zero into %r9 - # A[8] x A[6] - mulxq 64(%rsi), %rax, %r9 - adcxq %rax, %r8 - adoxq %r10, %r9 - movq %r8, 40(%rdi) - # Zero into %r8 - # A[8] x A[7] - movq 56(%rsi), %rdx - mulxq 64(%rsi), %rax, %r8 - adcxq %rax, %r9 - adoxq %r10, %r8 - movq %r9, 48(%rdi) - # Carry - adcxq %r11, %r8 - movq %r10, %r11 - adcxq %r10, %r11 - adoxq %r10, %r11 - movq %r8, 56(%rdi) - movq %r11, 64(%rdi) - # Double and Add in A[i] x A[i] - movq 8(%rbp), %r9 - # A[0] x A[0] - movq (%rsi), %rdx - mulxq %rdx, %rax, %rcx - movq %rax, (%rbp) - adoxq %r9, %r9 - adcxq %rcx, %r9 - movq %r9, 8(%rbp) - movq 16(%rbp), %r8 - movq 24(%rbp), %r9 - # A[1] x A[1] - movq 8(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 16(%rbp) - movq %r9, 24(%rbp) - # A[2] x A[2] - movq 16(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r12, %r12 - adoxq %r13, %r13 - adcxq %rax, %r12 - adcxq %rcx, %r13 - # A[3] x A[3] - movq 24(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r14, %r14 - adoxq %r15, %r15 - adcxq %rax, %r14 - adcxq %rcx, %r15 - movq (%rdi), %r9 - # A[4] x A[4] - movq 32(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %rbx, %rbx - adoxq %r9, %r9 - adcxq %rax, %rbx - adcxq %rcx, %r9 - movq %r9, (%rdi) - movq 8(%rdi), %r8 - movq 16(%rdi), %r9 - # A[5] x A[5] - movq 40(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 8(%rdi) - movq %r9, 16(%rdi) - movq 24(%rdi), %r8 - movq 32(%rdi), %r9 - # A[6] x A[6] - movq 48(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 24(%rdi) - movq %r9, 32(%rdi) - movq 40(%rdi), %r8 - movq 48(%rdi), 
%r9 - # A[7] x A[7] - movq 56(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 40(%rdi) - movq %r9, 48(%rdi) - movq 56(%rdi), %r8 - movq 64(%rdi), %r9 - # A[8] x A[8] - movq 64(%rsi), %rdx - mulxq %rdx, %rax, %rcx - adoxq %r8, %r8 - adoxq %r9, %r9 - adcxq %rax, %r8 - adcxq %rcx, %r9 - movq %r8, 56(%rdi) - movq %r9, 64(%rdi) - movq %r12, -40(%rdi) - movq %r13, -32(%rdi) - movq %r14, -24(%rdi) - movq %r15, -16(%rdi) - movq %rbx, -8(%rdi) - subq $0x48, %rdi - cmpq %rdi, %rsi - jne L_end_521_sqr_avx2_9 - vmovdqu (%rbp), %xmm0 - vmovups %xmm0, (%rdi) - vmovdqu 16(%rbp), %xmm0 - vmovups %xmm0, 16(%rdi) -L_end_521_sqr_avx2_9: - addq $0x48, %rsp - popq %rbx - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - repz retq -#ifndef __APPLE__ -.size sp_521_sqr_avx2_9,.-sp_521_sqr_avx2_9 -#endif /* __APPLE__ */ -#endif /* HAVE_INTEL_AVX2 */ /* Shift number right by 1 bit. (r = a >> 1) * * r Result of right shift by 1. diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index c72586133..24e69c373 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -37699,6 +37699,121 @@ sp_256_mul_4 PROC ret sp_256_mul_4 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Multiply a and b into r. (r = a * b) +; * +; * r Result of multiplication. +; * a First number to multiply. +; * b Second number to multiply. +; */ +_text SEGMENT READONLY PARA +sp_256_mul_avx2_4 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + push rdi + push rsi + mov rbp, r8 + mov rdi, rdx + ; A[0] * B[0] + mov rdx, QWORD PTR [rbp] + mulx r9, r8, QWORD PTR [rdi] + ; A[2] * B[0] + mulx r11, r10, QWORD PTR [rdi+16] + ; A[1] * B[0] + mulx rsi, rax, QWORD PTR [rdi+8] + xor r15, r15 + adcx r9, rax + ; A[1] * B[3] + mov rdx, QWORD PTR [rbp+24] + mulx r13, r12, QWORD PTR [rdi+8] + adcx r10, rsi + ; A[0] * B[1] + mov rdx, QWORD PTR [rbp+8] + mulx rsi, rax, QWORD PTR [rdi] + adox r9, rax + ; A[2] * B[1] + mulx r14, rax, QWORD PTR [rdi+16] + adox r10, rsi + adcx r11, rax + ; A[1] * B[2] + mov rdx, QWORD PTR [rbp+16] + mulx rsi, rax, QWORD PTR [rdi+8] + adcx r12, r14 + adox r11, rax + adcx r13, r15 + adox r12, rsi + ; A[0] * B[2] + mulx rsi, rax, QWORD PTR [rdi] + adox r13, r15 + xor r14, r14 + adcx r10, rax + ; A[1] * B[1] + mov rdx, QWORD PTR [rbp+8] + mulx rax, rdx, QWORD PTR [rdi+8] + adcx r11, rsi + adox r10, rdx + ; A[3] * B[1] + mov rdx, QWORD PTR [rbp+8] + adox r11, rax + mulx rsi, rax, QWORD PTR [rdi+24] + adcx r12, rax + ; A[2] * B[2] + mov rdx, QWORD PTR [rbp+16] + mulx rax, rdx, QWORD PTR [rdi+16] + adcx r13, rsi + adox r12, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rbp+24] + adox r13, rax + mulx rsi, rax, QWORD PTR [rdi+24] + adox r14, r15 + adcx r14, rax + ; A[0] * B[3] + mulx rax, rdx, QWORD PTR [rdi] + adcx r15, rsi + xor rsi, rsi + adcx r11, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rdi+24] + adcx r12, rax + mulx rax, rbx, QWORD PTR [rbp] + adox r11, rbx + adox r12, rax + ; A[3] * B[2] + mulx rax, rdx, QWORD PTR [rbp+16] + adcx r13, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rbp+24] + adcx r14, rax + mulx rdx, rax, QWORD PTR [rdi+16] + adcx r15, rsi + adox r13, rax + adox r14, rdx + adox r15, rsi + mov QWORD PTR [rcx], r8 + mov QWORD PTR [rcx+8], r9 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + mov QWORD PTR [rcx+48], r14 + mov QWORD PTR [rcx+56], r15 + pop rsi + pop rdi + pop rbp + pop r15 + pop r14 + pop r13 + pop 
r12 + pop rbx + ret +sp_256_mul_avx2_4 ENDP +_text ENDS +ENDIF ; /* Square a and put result in r. (r = a * a) ; * ; * r A single precision integer. @@ -37815,6 +37930,95 @@ sp_256_sqr_4 PROC ret sp_256_sqr_4 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Square a and put result in r. (r = a * a) +; * +; * r Result of squaring. +; * a Number to square in Montgomery form. +; */ +_text SEGMENT READONLY PARA +sp_256_sqr_avx2_4 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rax, rdx + ; A[0] * A[1] + mov rdx, QWORD PTR [rax] + mov r15, QWORD PTR [rax+16] + mulx r10, r9, QWORD PTR [rax+8] + ; A[0] * A[3] + mulx r12, r11, QWORD PTR [rax+24] + ; A[2] * A[1] + mov rdx, r15 + mulx rbx, rsi, QWORD PTR [rax+8] + ; A[2] * A[3] + mulx r14, r13, QWORD PTR [rax+24] + xor r15, r15 + adox r11, rsi + adox r12, rbx + ; A[2] * A[0] + mulx rbx, rsi, QWORD PTR [rax] + ; A[1] * A[3] + mov rdx, QWORD PTR [rax+8] + adox r13, r15 + mulx r8, rdi, QWORD PTR [rax+24] + adcx r10, rsi + adox r14, r15 + adcx r11, rbx + adcx r12, rdi + adcx r13, r8 + adcx r14, r15 + ; Double with Carry Flag + xor r15, r15 + ; A[0] * A[0] + mov rdx, QWORD PTR [rax] + mulx rdi, r8, rdx + adcx r9, r9 + adcx r10, r10 + adox r9, rdi + ; A[1] * A[1] + mov rdx, QWORD PTR [rax+8] + mulx rbx, rsi, rdx + adcx r11, r11 + adox r10, rsi + ; A[2] * A[2] + mov rdx, QWORD PTR [rax+16] + mulx rsi, rdi, rdx + adcx r12, r12 + adox r11, rbx + adcx r13, r13 + adox r12, rdi + adcx r14, r14 + ; A[3] * A[3] + mov rdx, QWORD PTR [rax+24] + mulx rbx, rdi, rdx + adox r13, rsi + adcx r15, r15 + adox r14, rdi + adox r15, rbx + mov QWORD PTR [rcx], r8 + mov QWORD PTR [rcx+8], r9 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + mov QWORD PTR [rcx+48], r14 + mov QWORD PTR [rcx+56], r15 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_256_sqr_avx2_4 ENDP +_text ENDS +ENDIF ; /* Add b to a into r. (r = a + b) ; * ; * r A single precision integer. @@ -39793,121 +39997,6 @@ sp_256_to_bin_movbe_4 PROC sp_256_to_bin_movbe_4 ENDP _text ENDS ENDIF -IFDEF HAVE_INTEL_AVX2 -; /* Multiply a and b into r. (r = a * b) -; * -; * r Result of multiplication. -; * a First number to multiply. -; * b Second number to multiply. 
-; */ -_text SEGMENT READONLY PARA -sp_256_mul_avx2_4 PROC - push rbx - push r12 - push r13 - push r14 - push r15 - push rbp - push rdi - push rsi - mov rbp, r8 - mov rdi, rdx - ; A[0] * B[0] - mov rdx, QWORD PTR [rbp] - mulx r9, r8, QWORD PTR [rdi] - ; A[2] * B[0] - mulx r11, r10, QWORD PTR [rdi+16] - ; A[1] * B[0] - mulx rsi, rax, QWORD PTR [rdi+8] - xor r15, r15 - adcx r9, rax - ; A[1] * B[3] - mov rdx, QWORD PTR [rbp+24] - mulx r13, r12, QWORD PTR [rdi+8] - adcx r10, rsi - ; A[0] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rsi, rax, QWORD PTR [rdi] - adox r9, rax - ; A[2] * B[1] - mulx r14, rax, QWORD PTR [rdi+16] - adox r10, rsi - adcx r11, rax - ; A[1] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rsi, rax, QWORD PTR [rdi+8] - adcx r12, r14 - adox r11, rax - adcx r13, r15 - adox r12, rsi - ; A[0] * B[2] - mulx rsi, rax, QWORD PTR [rdi] - adox r13, r15 - xor r14, r14 - adcx r10, rax - ; A[1] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rax, rdx, QWORD PTR [rdi+8] - adcx r11, rsi - adox r10, rdx - ; A[3] * B[1] - mov rdx, QWORD PTR [rbp+8] - adox r11, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adcx r12, rax - ; A[2] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rax, rdx, QWORD PTR [rdi+16] - adcx r13, rsi - adox r12, rdx - ; A[3] * B[3] - mov rdx, QWORD PTR [rbp+24] - adox r13, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adox r14, r15 - adcx r14, rax - ; A[0] * B[3] - mulx rax, rdx, QWORD PTR [rdi] - adcx r15, rsi - xor rsi, rsi - adcx r11, rdx - ; A[3] * B[0] - mov rdx, QWORD PTR [rdi+24] - adcx r12, rax - mulx rax, rbx, QWORD PTR [rbp] - adox r11, rbx - adox r12, rax - ; A[3] * B[2] - mulx rax, rdx, QWORD PTR [rbp+16] - adcx r13, rdx - ; A[2] * B[3] - mov rdx, QWORD PTR [rbp+24] - adcx r14, rax - mulx rdx, rax, QWORD PTR [rdi+16] - adcx r15, rsi - adox r13, rax - adox r14, rdx - adox r15, rsi - mov QWORD PTR [rcx], r8 - mov QWORD PTR [rcx+8], r9 - mov QWORD PTR [rcx+16], r10 - mov QWORD PTR [rcx+24], r11 - mov QWORD PTR [rcx+32], r12 - mov QWORD PTR [rcx+40], r13 - mov QWORD PTR [rcx+48], r14 - mov QWORD PTR [rcx+56], r15 - pop rsi - pop rdi - pop rbp - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret -sp_256_mul_avx2_4 ENDP -_text ENDS -ENDIF ; /* Sub b from a into a. (a -= b) ; * ; * a A single precision integer and result. @@ -41244,6 +41333,223 @@ sp_384_mul_6 PROC ret sp_384_mul_6 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Multiply a and b into r. (r = a * b) +; * +; * r Result of multiplication. +; * a First number to multiply. +; * b Second number to multiply. 
+; */ +_text SEGMENT READONLY PARA +sp_384_mul_avx2_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rax, rdx + sub rsp, 40 + xor rbx, rbx + mov rdx, QWORD PTR [rax] + ; A[0] * B[0] + mulx r12, r11, QWORD PTR [r8] + ; A[0] * B[1] + mulx r13, r9, QWORD PTR [r8+8] + adcx r12, r9 + ; A[0] * B[2] + mulx r14, r9, QWORD PTR [r8+16] + adcx r13, r9 + ; A[0] * B[3] + mulx r15, r9, QWORD PTR [r8+24] + adcx r14, r9 + ; A[0] * B[4] + mulx rdi, r9, QWORD PTR [r8+32] + adcx r15, r9 + ; A[0] * B[5] + mulx rsi, r9, QWORD PTR [r8+40] + adcx rdi, r9 + adcx rsi, rbx + mov QWORD PTR [rsp], r11 + mov r11, 0 + adcx r11, rbx + xor rbx, rbx + mov rdx, QWORD PTR [rax+8] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [r8] + adcx r12, r9 + adox r13, r10 + ; A[1] * B[1] + mulx r10, r9, QWORD PTR [r8+8] + adcx r13, r9 + adox r14, r10 + ; A[1] * B[2] + mulx r10, r9, QWORD PTR [r8+16] + adcx r14, r9 + adox r15, r10 + ; A[1] * B[3] + mulx r10, r9, QWORD PTR [r8+24] + adcx r15, r9 + adox rdi, r10 + ; A[1] * B[4] + mulx r10, r9, QWORD PTR [r8+32] + adcx rdi, r9 + adox rsi, r10 + ; A[1] * B[5] + mulx r10, r9, QWORD PTR [r8+40] + adcx rsi, r9 + adox r11, r10 + adcx r11, rbx + mov QWORD PTR [rsp+8], r12 + mov r12, 0 + adcx r12, rbx + adox r12, rbx + xor rbx, rbx + mov rdx, QWORD PTR [rax+16] + ; A[2] * B[0] + mulx r10, r9, QWORD PTR [r8] + adcx r13, r9 + adox r14, r10 + ; A[2] * B[1] + mulx r10, r9, QWORD PTR [r8+8] + adcx r14, r9 + adox r15, r10 + ; A[2] * B[2] + mulx r10, r9, QWORD PTR [r8+16] + adcx r15, r9 + adox rdi, r10 + ; A[2] * B[3] + mulx r10, r9, QWORD PTR [r8+24] + adcx rdi, r9 + adox rsi, r10 + ; A[2] * B[4] + mulx r10, r9, QWORD PTR [r8+32] + adcx rsi, r9 + adox r11, r10 + ; A[2] * B[5] + mulx r10, r9, QWORD PTR [r8+40] + adcx r11, r9 + adox r12, r10 + adcx r12, rbx + mov QWORD PTR [rsp+16], r13 + mov r13, 0 + adcx r13, rbx + adox r13, rbx + xor rbx, rbx + mov rdx, QWORD PTR [rax+24] + ; A[3] * B[0] + mulx r10, r9, QWORD PTR [r8] + adcx r14, r9 + adox r15, r10 + ; A[3] * B[1] + mulx r10, r9, QWORD PTR [r8+8] + adcx r15, r9 + adox rdi, r10 + ; A[3] * B[2] + mulx r10, r9, QWORD PTR [r8+16] + adcx rdi, r9 + adox rsi, r10 + ; A[3] * B[3] + mulx r10, r9, QWORD PTR [r8+24] + adcx rsi, r9 + adox r11, r10 + ; A[3] * B[4] + mulx r10, r9, QWORD PTR [r8+32] + adcx r11, r9 + adox r12, r10 + ; A[3] * B[5] + mulx r10, r9, QWORD PTR [r8+40] + adcx r12, r9 + adox r13, r10 + adcx r13, rbx + mov QWORD PTR [rsp+24], r14 + mov r14, 0 + adcx r14, rbx + adox r14, rbx + xor rbx, rbx + mov rdx, QWORD PTR [rax+32] + ; A[4] * B[0] + mulx r10, r9, QWORD PTR [r8] + adcx r15, r9 + adox rdi, r10 + ; A[4] * B[1] + mulx r10, r9, QWORD PTR [r8+8] + adcx rdi, r9 + adox rsi, r10 + ; A[4] * B[2] + mulx r10, r9, QWORD PTR [r8+16] + adcx rsi, r9 + adox r11, r10 + ; A[4] * B[3] + mulx r10, r9, QWORD PTR [r8+24] + adcx r11, r9 + adox r12, r10 + ; A[4] * B[4] + mulx r10, r9, QWORD PTR [r8+32] + adcx r12, r9 + adox r13, r10 + ; A[4] * B[5] + mulx r10, r9, QWORD PTR [r8+40] + adcx r13, r9 + adox r14, r10 + adcx r14, rbx + mov QWORD PTR [rsp+32], r15 + mov rdx, QWORD PTR [rax+40] + ; A[5] * B[0] + mulx r10, r9, QWORD PTR [r8] + adcx rdi, r9 + adox rsi, r10 + ; A[5] * B[1] + mulx r10, r9, QWORD PTR [r8+8] + adcx rsi, r9 + adox r11, r10 + ; A[5] * B[2] + mulx r10, r9, QWORD PTR [r8+16] + adcx r11, r9 + adox r12, r10 + ; A[5] * B[3] + mulx r10, r9, QWORD PTR [r8+24] + adcx r12, r9 + adox r13, r10 + ; A[5] * B[4] + mulx r10, r9, QWORD PTR [r8+32] + adcx r13, r9 + adox r14, r10 + ; A[5] * B[5] + mulx r15, r9, QWORD PTR 
[r8+40] + adcx r14, r9 + adox r15, rbx + adcx r15, rbx + mov QWORD PTR [rcx+40], rdi + mov QWORD PTR [rcx+48], rsi + mov QWORD PTR [rcx+56], r11 + mov QWORD PTR [rcx+64], r12 + mov QWORD PTR [rcx+72], r13 + mov QWORD PTR [rcx+80], r14 + mov QWORD PTR [rcx+88], r15 + mov r11, QWORD PTR [rsp] + mov r12, QWORD PTR [rsp+8] + mov r13, QWORD PTR [rsp+16] + mov r14, QWORD PTR [rsp+24] + mov r15, QWORD PTR [rsp+32] + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov QWORD PTR [rcx+32], r15 + add rsp, 40 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mul_avx2_6 ENDP +_text ENDS +ENDIF ; /* Square a and put result in r. (r = a * a) ; * ; * r A single precision integer. @@ -41462,6 +41768,164 @@ sp_384_sqr_6 PROC ret sp_384_sqr_6 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Square a and put result in r. (r = a * a) +; * +; * r Result of squaring. +; * a Number to square in Montgomery form. +; */ +_text SEGMENT READONLY PARA +sp_384_sqr_avx2_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + push rcx + xor rcx, rcx + mov rdx, QWORD PTR [rax] + mov rsi, QWORD PTR [rax+8] + mov rbx, QWORD PTR [rax+16] + mov rbp, QWORD PTR [rax+24] + ; Diagonal 0 + ; A[1] * A[0] + mulx r11, r10, QWORD PTR [rax+8] + ; A[2] * A[0] + mulx r12, r8, QWORD PTR [rax+16] + adcx r11, r8 + ; A[3] * A[0] + mulx r13, r8, QWORD PTR [rax+24] + adcx r12, r8 + ; A[4] * A[0] + mulx r14, r8, QWORD PTR [rax+32] + adcx r13, r8 + ; A[5] * A[0] + mulx r15, r8, QWORD PTR [rax+40] + adcx r14, r8 + adcx r15, rcx + ; Diagonal 1 + mov rdx, rsi + ; A[2] * A[1] + mulx r9, r8, QWORD PTR [rax+16] + adcx r12, r8 + adox r13, r9 + ; A[3] * A[1] + mulx r9, r8, QWORD PTR [rax+24] + adcx r13, r8 + adox r14, r9 + ; A[4] * A[1] + mulx r9, r8, QWORD PTR [rax+32] + adcx r14, r8 + adox r15, r9 + ; A[5] * A[1] + mulx rdi, r8, QWORD PTR [rax+40] + adcx r15, r8 + adox rdi, rcx + mov rdx, rbx + ; A[5] * A[2] + mulx rsi, r8, QWORD PTR [rax+40] + adcx rdi, r8 + adox rsi, rcx + adcx rsi, rcx + adcx rbx, rcx + ; Diagonal 2 + ; A[3] * A[2] + mulx r9, r8, QWORD PTR [rax+24] + adcx r14, r8 + adox r15, r9 + ; A[4] * A[2] + mulx r9, r8, QWORD PTR [rax+32] + adcx r15, r8 + adox rdi, r9 + mov rdx, rbp + ; A[4] * A[3] + mulx r9, r8, QWORD PTR [rax+32] + adcx rdi, r8 + adox rsi, r9 + ; A[5] * A[3] + mulx rbx, r8, QWORD PTR [rax+40] + adcx rsi, r8 + adox rbx, rcx + mov rdx, QWORD PTR [rax+32] + ; A[5] * A[4] + mulx rbp, r8, QWORD PTR [rax+40] + adcx rbx, r8 + adox rbp, rcx + adcx rbp, rcx + adcx rcx, rcx + ; Doubling previous result as we add in square words results + ; A[0] * A[0] + mov rdx, QWORD PTR [rax] + mulx r9, r8, rdx + pop rdx + mov QWORD PTR [rdx], r8 + adox r10, r10 + push rdx + adcx r10, r9 + ; A[1] * A[1] + mov rdx, QWORD PTR [rax+8] + mulx r9, r8, rdx + adox r11, r11 + adcx r11, r8 + adox r12, r12 + adcx r12, r9 + ; A[2] * A[2] + mov rdx, QWORD PTR [rax+16] + mulx r9, r8, rdx + adox r13, r13 + adcx r13, r8 + adox r14, r14 + adcx r14, r9 + ; A[3] * A[3] + mov rdx, QWORD PTR [rax+24] + mulx r9, r8, rdx + adox r15, r15 + adcx r15, r8 + adox rdi, rdi + adcx rdi, r9 + ; A[4] * A[4] + mov rdx, QWORD PTR [rax+32] + mulx r9, r8, rdx + adox rsi, rsi + adcx rsi, r8 + adox rbx, rbx + adcx rbx, r9 + ; A[5] * A[5] + mov rdx, QWORD PTR [rax+40] + mulx r9, r8, rdx + adox rbp, rbp + adcx rbp, r8 + adcx r9, rcx + mov r8, 0 + adox r9, r8 + pop rcx + mov QWORD PTR [rcx+8], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD 
PTR [rcx+24], r12 + mov QWORD PTR [rcx+32], r13 + mov QWORD PTR [rcx+40], r14 + mov QWORD PTR [rcx+48], r15 + mov QWORD PTR [rcx+56], rdi + mov QWORD PTR [rcx+64], rsi + mov QWORD PTR [rcx+72], rbx + mov QWORD PTR [rcx+80], rbp + mov QWORD PTR [rcx+88], r9 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_sqr_avx2_6 ENDP +_text ENDS +ENDIF ; /* Add b to a into r. (r = a + b) ; * ; * r A single precision integer. @@ -42285,223 +42749,6 @@ _text ENDS ENDIF ENDIF IFDEF HAVE_INTEL_AVX2 -; /* Multiply a and b into r. (r = a * b) -; * -; * r Result of multiplication. -; * a First number to multiply. -; * b Second number to multiply. -; */ -_text SEGMENT READONLY PARA -sp_384_mul_avx2_6 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - push rbx - mov rax, rdx - sub rsp, 40 - xor rbx, rbx - mov rdx, QWORD PTR [rax] - ; A[0] * B[0] - mulx r12, r11, QWORD PTR [r8] - ; A[0] * B[1] - mulx r13, r9, QWORD PTR [r8+8] - adcx r12, r9 - ; A[0] * B[2] - mulx r14, r9, QWORD PTR [r8+16] - adcx r13, r9 - ; A[0] * B[3] - mulx r15, r9, QWORD PTR [r8+24] - adcx r14, r9 - ; A[0] * B[4] - mulx rdi, r9, QWORD PTR [r8+32] - adcx r15, r9 - ; A[0] * B[5] - mulx rsi, r9, QWORD PTR [r8+40] - adcx rdi, r9 - adcx rsi, rbx - mov QWORD PTR [rsp], r11 - mov r11, 0 - adcx r11, rbx - xor rbx, rbx - mov rdx, QWORD PTR [rax+8] - ; A[1] * B[0] - mulx r10, r9, QWORD PTR [r8] - adcx r12, r9 - adox r13, r10 - ; A[1] * B[1] - mulx r10, r9, QWORD PTR [r8+8] - adcx r13, r9 - adox r14, r10 - ; A[1] * B[2] - mulx r10, r9, QWORD PTR [r8+16] - adcx r14, r9 - adox r15, r10 - ; A[1] * B[3] - mulx r10, r9, QWORD PTR [r8+24] - adcx r15, r9 - adox rdi, r10 - ; A[1] * B[4] - mulx r10, r9, QWORD PTR [r8+32] - adcx rdi, r9 - adox rsi, r10 - ; A[1] * B[5] - mulx r10, r9, QWORD PTR [r8+40] - adcx rsi, r9 - adox r11, r10 - adcx r11, rbx - mov QWORD PTR [rsp+8], r12 - mov r12, 0 - adcx r12, rbx - adox r12, rbx - xor rbx, rbx - mov rdx, QWORD PTR [rax+16] - ; A[2] * B[0] - mulx r10, r9, QWORD PTR [r8] - adcx r13, r9 - adox r14, r10 - ; A[2] * B[1] - mulx r10, r9, QWORD PTR [r8+8] - adcx r14, r9 - adox r15, r10 - ; A[2] * B[2] - mulx r10, r9, QWORD PTR [r8+16] - adcx r15, r9 - adox rdi, r10 - ; A[2] * B[3] - mulx r10, r9, QWORD PTR [r8+24] - adcx rdi, r9 - adox rsi, r10 - ; A[2] * B[4] - mulx r10, r9, QWORD PTR [r8+32] - adcx rsi, r9 - adox r11, r10 - ; A[2] * B[5] - mulx r10, r9, QWORD PTR [r8+40] - adcx r11, r9 - adox r12, r10 - adcx r12, rbx - mov QWORD PTR [rsp+16], r13 - mov r13, 0 - adcx r13, rbx - adox r13, rbx - xor rbx, rbx - mov rdx, QWORD PTR [rax+24] - ; A[3] * B[0] - mulx r10, r9, QWORD PTR [r8] - adcx r14, r9 - adox r15, r10 - ; A[3] * B[1] - mulx r10, r9, QWORD PTR [r8+8] - adcx r15, r9 - adox rdi, r10 - ; A[3] * B[2] - mulx r10, r9, QWORD PTR [r8+16] - adcx rdi, r9 - adox rsi, r10 - ; A[3] * B[3] - mulx r10, r9, QWORD PTR [r8+24] - adcx rsi, r9 - adox r11, r10 - ; A[3] * B[4] - mulx r10, r9, QWORD PTR [r8+32] - adcx r11, r9 - adox r12, r10 - ; A[3] * B[5] - mulx r10, r9, QWORD PTR [r8+40] - adcx r12, r9 - adox r13, r10 - adcx r13, rbx - mov QWORD PTR [rsp+24], r14 - mov r14, 0 - adcx r14, rbx - adox r14, rbx - xor rbx, rbx - mov rdx, QWORD PTR [rax+32] - ; A[4] * B[0] - mulx r10, r9, QWORD PTR [r8] - adcx r15, r9 - adox rdi, r10 - ; A[4] * B[1] - mulx r10, r9, QWORD PTR [r8+8] - adcx rdi, r9 - adox rsi, r10 - ; A[4] * B[2] - mulx r10, r9, QWORD PTR [r8+16] - adcx rsi, r9 - adox r11, r10 - ; A[4] * B[3] - mulx r10, r9, QWORD PTR [r8+24] - adcx r11, r9 - adox r12, r10 - ; 
A[4] * B[4] - mulx r10, r9, QWORD PTR [r8+32] - adcx r12, r9 - adox r13, r10 - ; A[4] * B[5] - mulx r10, r9, QWORD PTR [r8+40] - adcx r13, r9 - adox r14, r10 - adcx r14, rbx - mov QWORD PTR [rsp+32], r15 - mov rdx, QWORD PTR [rax+40] - ; A[5] * B[0] - mulx r10, r9, QWORD PTR [r8] - adcx rdi, r9 - adox rsi, r10 - ; A[5] * B[1] - mulx r10, r9, QWORD PTR [r8+8] - adcx rsi, r9 - adox r11, r10 - ; A[5] * B[2] - mulx r10, r9, QWORD PTR [r8+16] - adcx r11, r9 - adox r12, r10 - ; A[5] * B[3] - mulx r10, r9, QWORD PTR [r8+24] - adcx r12, r9 - adox r13, r10 - ; A[5] * B[4] - mulx r10, r9, QWORD PTR [r8+32] - adcx r13, r9 - adox r14, r10 - ; A[5] * B[5] - mulx r15, r9, QWORD PTR [r8+40] - adcx r14, r9 - adox r15, rbx - adcx r15, rbx - mov QWORD PTR [rcx+40], rdi - mov QWORD PTR [rcx+48], rsi - mov QWORD PTR [rcx+56], r11 - mov QWORD PTR [rcx+64], r12 - mov QWORD PTR [rcx+72], r13 - mov QWORD PTR [rcx+80], r14 - mov QWORD PTR [rcx+88], r15 - mov r11, QWORD PTR [rsp] - mov r12, QWORD PTR [rsp+8] - mov r13, QWORD PTR [rsp+16] - mov r14, QWORD PTR [rsp+24] - mov r15, QWORD PTR [rsp+32] - mov QWORD PTR [rcx], r11 - mov QWORD PTR [rcx+8], r12 - mov QWORD PTR [rcx+16], r13 - mov QWORD PTR [rcx+24], r14 - mov QWORD PTR [rcx+32], r15 - add rsp, 40 - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_384_mul_avx2_6 ENDP -_text ENDS -ENDIF -IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 384 bits using Montgomery reduction. ; * ; * a A single precision number to reduce in place. @@ -42825,164 +43072,6 @@ sp_384_mont_reduce_order_avx2_6 ENDP _text ENDS ENDIF IFDEF HAVE_INTEL_AVX2 -; /* Square a and put result in r. (r = a * a) -; * -; * r Result of squaring. -; * a Number to square in Montgomery form. -; */ -_text SEGMENT READONLY PARA -sp_384_sqr_avx2_6 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - push rbx - push rbp - mov rax, rdx - push rcx - xor rcx, rcx - mov rdx, QWORD PTR [rax] - mov rsi, QWORD PTR [rax+8] - mov rbx, QWORD PTR [rax+16] - mov rbp, QWORD PTR [rax+24] - ; Diagonal 0 - ; A[1] * A[0] - mulx r11, r10, QWORD PTR [rax+8] - ; A[2] * A[0] - mulx r12, r8, QWORD PTR [rax+16] - adcx r11, r8 - ; A[3] * A[0] - mulx r13, r8, QWORD PTR [rax+24] - adcx r12, r8 - ; A[4] * A[0] - mulx r14, r8, QWORD PTR [rax+32] - adcx r13, r8 - ; A[5] * A[0] - mulx r15, r8, QWORD PTR [rax+40] - adcx r14, r8 - adcx r15, rcx - ; Diagonal 1 - mov rdx, rsi - ; A[2] * A[1] - mulx r9, r8, QWORD PTR [rax+16] - adcx r12, r8 - adox r13, r9 - ; A[3] * A[1] - mulx r9, r8, QWORD PTR [rax+24] - adcx r13, r8 - adox r14, r9 - ; A[4] * A[1] - mulx r9, r8, QWORD PTR [rax+32] - adcx r14, r8 - adox r15, r9 - ; A[5] * A[1] - mulx rdi, r8, QWORD PTR [rax+40] - adcx r15, r8 - adox rdi, rcx - mov rdx, rbx - ; A[5] * A[2] - mulx rsi, r8, QWORD PTR [rax+40] - adcx rdi, r8 - adox rsi, rcx - adcx rsi, rcx - adcx rbx, rcx - ; Diagonal 2 - ; A[3] * A[2] - mulx r9, r8, QWORD PTR [rax+24] - adcx r14, r8 - adox r15, r9 - ; A[4] * A[2] - mulx r9, r8, QWORD PTR [rax+32] - adcx r15, r8 - adox rdi, r9 - mov rdx, rbp - ; A[4] * A[3] - mulx r9, r8, QWORD PTR [rax+32] - adcx rdi, r8 - adox rsi, r9 - ; A[5] * A[3] - mulx rbx, r8, QWORD PTR [rax+40] - adcx rsi, r8 - adox rbx, rcx - mov rdx, QWORD PTR [rax+32] - ; A[5] * A[4] - mulx rbp, r8, QWORD PTR [rax+40] - adcx rbx, r8 - adox rbp, rcx - adcx rbp, rcx - adcx rcx, rcx - ; Doubling previous result as we add in square words results - ; A[0] * A[0] - mov rdx, QWORD PTR [rax] - mulx r9, r8, rdx - pop rdx - mov QWORD PTR [rdx], r8 - adox r10, r10 - push rdx 
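The *_mul_avx2_* routines touched here (sp_256_mul_avx2_4, sp_384_mul_avx2_6, sp_521_mul_avx2_9) are unrolled schoolbook multiplies: mulx forms each 64x64->128-bit partial product without disturbing the flags, and adcx/adox maintain two independent carry chains (CF and OF) so neighbouring columns can be accumulated back to back. Below is a rough C sketch of the same schoolbook product for the 6-limb case, assuming a compiler that provides unsigned __int128; the 128-bit accumulator stands in for the explicit carry flags, the helper name is hypothetical, and r must not alias a or b:

    #include <stdint.h>

    /* Illustrative only: r[0..11] = a[0..5] * b[0..5], operand scanning. */
    static void mul_6_words(uint64_t* r, const uint64_t* a, const uint64_t* b)
    {
        int i, j;
        for (i = 0; i < 12; i++)
            r[i] = 0;
        for (i = 0; i < 6; i++) {
            uint64_t carry = 0;
            for (j = 0; j < 6; j++) {
                unsigned __int128 t = (unsigned __int128)a[i] * b[j]
                                    + r[i + j] + carry;
                r[i + j] = (uint64_t)t;         /* low 64 bits stay in this column */
                carry    = (uint64_t)(t >> 64); /* high 64 bits move up one column */
            }
            r[i + 6] = carry;
        }
    }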
- adcx r10, r9 - ; A[1] * A[1] - mov rdx, QWORD PTR [rax+8] - mulx r9, r8, rdx - adox r11, r11 - adcx r11, r8 - adox r12, r12 - adcx r12, r9 - ; A[2] * A[2] - mov rdx, QWORD PTR [rax+16] - mulx r9, r8, rdx - adox r13, r13 - adcx r13, r8 - adox r14, r14 - adcx r14, r9 - ; A[3] * A[3] - mov rdx, QWORD PTR [rax+24] - mulx r9, r8, rdx - adox r15, r15 - adcx r15, r8 - adox rdi, rdi - adcx rdi, r9 - ; A[4] * A[4] - mov rdx, QWORD PTR [rax+32] - mulx r9, r8, rdx - adox rsi, rsi - adcx rsi, r8 - adox rbx, rbx - adcx rbx, r9 - ; A[5] * A[5] - mov rdx, QWORD PTR [rax+40] - mulx r9, r8, rdx - adox rbp, rbp - adcx rbp, r8 - adcx r9, rcx - mov r8, 0 - adox r9, r8 - pop rcx - mov QWORD PTR [rcx+8], r10 - mov QWORD PTR [rcx+16], r11 - mov QWORD PTR [rcx+24], r12 - mov QWORD PTR [rcx+32], r13 - mov QWORD PTR [rcx+40], r14 - mov QWORD PTR [rcx+48], r15 - mov QWORD PTR [rcx+56], rdi - mov QWORD PTR [rcx+64], rsi - mov QWORD PTR [rcx+72], rbx - mov QWORD PTR [rcx+80], rbp - mov QWORD PTR [rcx+88], r9 - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_384_sqr_avx2_6 ENDP -_text ENDS -ENDIF -IFDEF HAVE_INTEL_AVX2 ; /* Conditionally subtract b from a using the mask m. ; * m is -1 to subtract and 0 when not copying. ; * @@ -44413,6 +44502,585 @@ sp_521_mul_9 PROC ret sp_521_mul_9 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Multiply a and b into r. (r = a * b) +; * +; * r Result of multiplication. +; * a First number to multiply. +; * b Second number to multiply. +; */ +_text SEGMENT READONLY PARA +sp_521_mul_avx2_9 PROC + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov rbp, r8 + mov r8, rcx + mov r9, rdx + sub rsp, 72 + cmp r9, r8 + mov rbx, rsp + cmovne rbx, r8 + cmp rbp, r8 + cmove rbx, rsp + add r8, 72 + xor r15, r15 + mov rdx, QWORD PTR [r9] + ; A[0] * B[0] + mulx r11, r10, QWORD PTR [rbp] + ; A[0] * B[1] + mulx r12, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx], r10 + adcx r11, rax + ; A[0] * B[2] + mulx r13, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+8], r11 + adcx r12, rax + mov QWORD PTR [rbx+16], r12 + ; A[0] * B[3] + mulx r10, rax, QWORD PTR [rbp+24] + adcx r13, rax + ; A[0] * B[4] + mulx r11, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+24], r13 + adcx r10, rax + ; A[0] * B[5] + mulx r12, rax, QWORD PTR [rbp+40] + mov QWORD PTR [rbx+32], r10 + adcx r11, rax + mov QWORD PTR [rbx+40], r11 + ; A[0] * B[6] + mulx r13, rax, QWORD PTR [rbp+48] + adcx r12, rax + ; A[0] * B[7] + mulx r10, rax, QWORD PTR [rbp+56] + mov QWORD PTR [rbx+48], r12 + adcx r13, rax + ; A[0] * B[8] + mulx r11, rax, QWORD PTR [rbp+64] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adcx r11, r15 + mov r14, r15 + adcx r14, r15 + mov QWORD PTR [rbx+64], r10 + mov QWORD PTR [r8], r11 + mov rdx, QWORD PTR [r9+8] + mov r11, QWORD PTR [rbx+8] + mov r12, QWORD PTR [rbx+16] + mov r13, QWORD PTR [rbx+24] + mov r10, QWORD PTR [rbx+32] + ; A[1] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r11, rax + adox r12, rcx + ; A[1] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+8], r11 + adcx r12, rax + adox r13, rcx + ; A[1] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+16], r12 + adcx r13, rax + adox r10, rcx + mov QWORD PTR [rbx+24], r13 + mov r11, QWORD PTR [rbx+40] + mov r12, QWORD PTR [rbx+48] + mov r13, QWORD PTR [rbx+56] + ; A[1] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r10, rax + adox r11, rcx + ; A[1] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+32], r10 + adcx r11, rax + adox r12, rcx + ; A[1] * B[5] + mulx rcx, rax, 
QWORD PTR [rbp+40] + mov QWORD PTR [rbx+40], r11 + adcx r12, rax + adox r13, rcx + mov QWORD PTR [rbx+48], r12 + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + ; A[1] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r13, rax + adox r10, rcx + ; A[1] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adox r11, rcx + ; A[1] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [rbx+64], r10 + mov r12, r15 + adcx r11, rax + adox r12, rcx + adcx r12, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8], r11 + mov QWORD PTR [r8+8], r12 + mov rdx, QWORD PTR [r9+16] + mov r12, QWORD PTR [rbx+16] + mov r13, QWORD PTR [rbx+24] + mov r10, QWORD PTR [rbx+32] + mov r11, QWORD PTR [rbx+40] + ; A[2] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r12, rax + adox r13, rcx + ; A[2] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+16], r12 + adcx r13, rax + adox r10, rcx + ; A[2] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+24], r13 + adcx r10, rax + adox r11, rcx + mov QWORD PTR [rbx+32], r10 + mov r12, QWORD PTR [rbx+48] + mov r13, QWORD PTR [rbx+56] + mov r10, QWORD PTR [rbx+64] + ; A[2] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r11, rax + adox r12, rcx + ; A[2] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+40], r11 + adcx r12, rax + adox r13, rcx + ; A[2] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [rbx+48], r12 + adcx r13, rax + adox r10, rcx + mov QWORD PTR [rbx+56], r13 + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + ; A[2] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r10, rax + adox r11, rcx + ; A[2] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [rbx+64], r10 + adcx r11, rax + adox r12, rcx + ; A[2] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8], r11 + mov r13, r15 + adcx r12, rax + adox r13, rcx + adcx r13, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+8], r12 + mov QWORD PTR [r8+16], r13 + mov rdx, QWORD PTR [r9+24] + mov r13, QWORD PTR [rbx+24] + mov r10, QWORD PTR [rbx+32] + mov r11, QWORD PTR [rbx+40] + mov r12, QWORD PTR [rbx+48] + ; A[3] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r13, rax + adox r10, rcx + ; A[3] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+24], r13 + adcx r10, rax + adox r11, rcx + ; A[3] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+32], r10 + adcx r11, rax + adox r12, rcx + mov QWORD PTR [rbx+40], r11 + mov r13, QWORD PTR [rbx+56] + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + ; A[3] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r12, rax + adox r13, rcx + ; A[3] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+48], r12 + adcx r13, rax + adox r10, rcx + ; A[3] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adox r11, rcx + mov QWORD PTR [rbx+64], r10 + mov r12, QWORD PTR [r8+8] + mov r13, QWORD PTR [r8+16] + ; A[3] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r11, rax + adox r12, rcx + ; A[3] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8], r11 + adcx r12, rax + adox r13, rcx + ; A[3] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+8], r12 + mov r10, r15 + adcx r13, rax + adox r10, rcx + adcx r10, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+16], r13 + mov QWORD PTR [r8+24], r10 + mov rdx, QWORD PTR [r9+32] + mov r10, QWORD PTR [rbx+32] + mov r11, QWORD PTR [rbx+40] + mov r12, QWORD PTR 
[rbx+48] + mov r13, QWORD PTR [rbx+56] + ; A[4] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r10, rax + adox r11, rcx + ; A[4] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+32], r10 + adcx r11, rax + adox r12, rcx + ; A[4] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+40], r11 + adcx r12, rax + adox r13, rcx + mov QWORD PTR [rbx+48], r12 + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + ; A[4] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r13, rax + adox r10, rcx + ; A[4] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adox r11, rcx + ; A[4] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [rbx+64], r10 + adcx r11, rax + adox r12, rcx + mov QWORD PTR [r8], r11 + mov r13, QWORD PTR [r8+16] + mov r10, QWORD PTR [r8+24] + ; A[4] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r12, rax + adox r13, rcx + ; A[4] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8+8], r12 + adcx r13, rax + adox r10, rcx + ; A[4] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+16], r13 + mov r11, r15 + adcx r10, rax + adox r11, rcx + adcx r11, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+24], r10 + mov QWORD PTR [r8+32], r11 + mov rdx, QWORD PTR [r9+40] + mov r11, QWORD PTR [rbx+40] + mov r12, QWORD PTR [rbx+48] + mov r13, QWORD PTR [rbx+56] + mov r10, QWORD PTR [rbx+64] + ; A[5] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r11, rax + adox r12, rcx + ; A[5] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+40], r11 + adcx r12, rax + adox r13, rcx + ; A[5] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+48], r12 + adcx r13, rax + adox r10, rcx + mov QWORD PTR [rbx+56], r13 + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + mov r13, QWORD PTR [r8+16] + ; A[5] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r10, rax + adox r11, rcx + ; A[5] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [rbx+64], r10 + adcx r11, rax + adox r12, rcx + ; A[5] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [r8], r11 + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r8+8], r12 + mov r10, QWORD PTR [r8+24] + mov r11, QWORD PTR [r8+32] + ; A[5] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r13, rax + adox r10, rcx + ; A[5] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8+16], r13 + adcx r10, rax + adox r11, rcx + ; A[5] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+24], r10 + mov r12, r15 + adcx r11, rax + adox r12, rcx + adcx r12, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+32], r11 + mov QWORD PTR [r8+40], r12 + mov rdx, QWORD PTR [r9+48] + mov r12, QWORD PTR [rbx+48] + mov r13, QWORD PTR [rbx+56] + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + ; A[6] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r12, rax + adox r13, rcx + ; A[6] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+48], r12 + adcx r13, rax + adox r10, rcx + ; A[6] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adox r11, rcx + mov QWORD PTR [rbx+64], r10 + mov r12, QWORD PTR [r8+8] + mov r13, QWORD PTR [r8+16] + mov r10, QWORD PTR [r8+24] + ; A[6] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r11, rax + adox r12, rcx + ; A[6] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [r8], r11 + adcx r12, rax + adox r13, rcx + ; A[6] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [r8+8], r12 
+ adcx r13, rax + adox r10, rcx + mov QWORD PTR [r8+16], r13 + mov r11, QWORD PTR [r8+32] + mov r12, QWORD PTR [r8+40] + ; A[6] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r10, rax + adox r11, rcx + ; A[6] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8+24], r10 + adcx r11, rax + adox r12, rcx + ; A[6] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+32], r11 + mov r13, r15 + adcx r12, rax + adox r13, rcx + adcx r13, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+40], r12 + mov QWORD PTR [r8+48], r13 + mov rdx, QWORD PTR [r9+56] + mov r13, QWORD PTR [rbx+56] + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + ; A[7] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r13, rax + adox r10, rcx + ; A[7] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+56], r13 + adcx r10, rax + adox r11, rcx + ; A[7] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [rbx+64], r10 + adcx r11, rax + adox r12, rcx + mov QWORD PTR [r8], r11 + mov r13, QWORD PTR [r8+16] + mov r10, QWORD PTR [r8+24] + mov r11, QWORD PTR [r8+32] + ; A[7] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r12, rax + adox r13, rcx + ; A[7] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [r8+8], r12 + adcx r13, rax + adox r10, rcx + ; A[7] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [r8+16], r13 + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+24], r10 + mov r12, QWORD PTR [r8+40] + mov r13, QWORD PTR [r8+48] + ; A[7] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r11, rax + adox r12, rcx + ; A[7] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8+32], r11 + adcx r12, rax + adox r13, rcx + ; A[7] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+40], r12 + mov r10, r15 + adcx r13, rax + adox r10, rcx + adcx r10, r14 + mov r14, r15 + adox r14, r15 + adcx r14, r15 + mov QWORD PTR [r8+48], r13 + mov QWORD PTR [r8+56], r10 + mov rdx, QWORD PTR [r9+64] + mov r10, QWORD PTR [rbx+64] + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + mov r13, QWORD PTR [r8+16] + ; A[8] * B[0] + mulx rcx, rax, QWORD PTR [rbp] + adcx r10, rax + adox r11, rcx + ; A[8] * B[1] + mulx rcx, rax, QWORD PTR [rbp+8] + mov QWORD PTR [rbx+64], r10 + adcx r11, rax + adox r12, rcx + ; A[8] * B[2] + mulx rcx, rax, QWORD PTR [rbp+16] + mov QWORD PTR [r8], r11 + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r8+8], r12 + mov r10, QWORD PTR [r8+24] + mov r11, QWORD PTR [r8+32] + mov r12, QWORD PTR [r8+40] + ; A[8] * B[3] + mulx rcx, rax, QWORD PTR [rbp+24] + adcx r13, rax + adox r10, rcx + ; A[8] * B[4] + mulx rcx, rax, QWORD PTR [rbp+32] + mov QWORD PTR [r8+16], r13 + adcx r10, rax + adox r11, rcx + ; A[8] * B[5] + mulx rcx, rax, QWORD PTR [rbp+40] + mov QWORD PTR [r8+24], r10 + adcx r11, rax + adox r12, rcx + mov QWORD PTR [r8+32], r11 + mov r13, QWORD PTR [r8+48] + mov r10, QWORD PTR [r8+56] + ; A[8] * B[6] + mulx rcx, rax, QWORD PTR [rbp+48] + adcx r12, rax + adox r13, rcx + ; A[8] * B[7] + mulx rcx, rax, QWORD PTR [rbp+56] + mov QWORD PTR [r8+40], r12 + adcx r13, rax + adox r10, rcx + ; A[8] * B[8] + mulx rcx, rax, QWORD PTR [rbp+64] + mov QWORD PTR [r8+48], r13 + mov r11, r15 + adcx r10, rax + adox r11, rcx + adcx r11, r14 + mov QWORD PTR [r8+56], r10 + mov QWORD PTR [r8+64], r11 + sub r8, 72 + cmp r9, r8 + je L_start_521_mul_avx2_9 + cmp rbp, r8 + jne L_end_521_mul_avx2_9 +L_start_521_mul_avx2_9: + vmovdqu xmm0, OWORD PTR [rbx] + vmovups OWORD PTR [r8], xmm0 + vmovdqu xmm0, OWORD PTR [rbx+16] + 
vmovups OWORD PTR [r8+16], xmm0 + vmovdqu xmm0, OWORD PTR [rbx+32] + vmovups OWORD PTR [r8+32], xmm0 + vmovdqu xmm0, OWORD PTR [rbx+48] + vmovups OWORD PTR [r8+48], xmm0 + mov rax, QWORD PTR [rbx+64] + mov QWORD PTR [r8+64], rax +L_end_521_mul_avx2_9: + add rsp, 72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret +sp_521_mul_avx2_9 ENDP +_text ENDS +ENDIF ; /* Square a and put result in r. (r = a * a) ; * ; * r A single precision integer. @@ -44829,6 +45497,405 @@ sp_521_sqr_9 PROC ret sp_521_sqr_9 ENDP _text ENDS +IFDEF HAVE_INTEL_AVX2 +; /* Square a and put result in r. (r = a * a) +; * +; * r A single precision integer. +; * a A single precision integer. +; */ +_text SEGMENT READONLY PARA +sp_521_sqr_avx2_9 PROC + push rbp + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r8, rcx + mov r9, rdx + sub rsp, 72 + cmp r9, r8 + mov rbp, rsp + cmovne rbp, r8 + add r8, 72 + xor r12, r12 + ; Diagonal 1 + ; Zero into %r9 + ; A[1] x A[0] + mov rdx, QWORD PTR [r9] + mulx r11, r10, QWORD PTR [r9+8] + mov QWORD PTR [rbp+8], r10 + ; Zero into %r8 + ; A[2] x A[0] + mulx r10, rax, QWORD PTR [r9+16] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [rbp+16], r11 + ; No load %r12 - %r9 + ; A[3] x A[0] + mulx r14, rax, QWORD PTR [r9+24] + adcx r10, rax + adox r14, r12 + mov QWORD PTR [rbp+24], r10 + ; No load %r13 - %r8 + ; A[4] x A[0] + mulx r15, rax, QWORD PTR [r9+32] + adcx r14, rax + adox r15, r12 + ; No store %r12 - %r9 + ; No load %r14 - %r9 + ; A[5] x A[0] + mulx rdi, rax, QWORD PTR [r9+40] + adcx r15, rax + adox rdi, r12 + ; No store %r13 - %r8 + ; No load %r15 - %r8 + ; A[6] x A[0] + mulx rsi, rax, QWORD PTR [r9+48] + adcx rdi, rax + adox rsi, r12 + ; No store %r14 - %r9 + ; No load %rbx - %r9 + ; A[7] x A[0] + mulx rbx, rax, QWORD PTR [r9+56] + adcx rsi, rax + adox rbx, r12 + ; No store %r15 - %r8 + ; Zero into %r8 + ; A[8] x A[0] + mulx r10, rax, QWORD PTR [r9+64] + adcx rbx, rax + adox r10, r12 + ; No store %rbx - %r9 + ; Zero into %r9 + ; A[8] x A[1] + mov rdx, QWORD PTR [r9+8] + mulx r11, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8], r10 + ; Carry + adcx r11, r12 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+8], r11 + ; Diagonal 2 + mov r11, QWORD PTR [rbp+24] + ; No load %r12 - %r8 + ; A[2] x A[1] + mulx rcx, rax, QWORD PTR [r9+16] + adcx r11, rax + adox r14, rcx + mov QWORD PTR [rbp+24], r11 + ; No load %r13 - %r9 + ; A[3] x A[1] + mulx rcx, rax, QWORD PTR [r9+24] + adcx r14, rax + adox r15, rcx + ; No store %r12 - %r8 + ; No load %r14 - %r8 + ; A[4] x A[1] + mulx rcx, rax, QWORD PTR [r9+32] + adcx r15, rax + adox rdi, rcx + ; No store %r13 - %r9 + ; No load %r15 - %r9 + ; A[5] x A[1] + mulx rcx, rax, QWORD PTR [r9+40] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r8 + ; No load %rbx - %r8 + ; A[6] x A[1] + mulx rcx, rax, QWORD PTR [r9+48] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r9 + mov r11, QWORD PTR [r8] + ; A[7] x A[1] + mulx rcx, rax, QWORD PTR [r9+56] + adcx rbx, rax + adox r11, rcx + ; No store %rbx - %r8 + mov r10, QWORD PTR [r8+8] + ; A[7] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx rcx, rax, QWORD PTR [r9+56] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8], r11 + ; Zero into %r9 + ; A[7] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx r11, rax, QWORD PTR [r9+56] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+8], r10 + ; Zero into %r8 + ; A[7] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx r10, rax, QWORD PTR [r9+56] + adcx r11, rax + adox r10, r12 + mov QWORD PTR 
[r8+16], r11 + ; Carry + adcx r10, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+24], r10 + ; Diagonal 3 + ; No load %r14 - %r9 + ; A[3] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx rcx, rax, QWORD PTR [r9+24] + adcx r15, rax + adox rdi, rcx + ; No store %r13 - %r8 + ; No load %r15 - %r8 + ; A[4] x A[2] + mulx rcx, rax, QWORD PTR [r9+32] + adcx rdi, rax + adox rsi, rcx + ; No store %r14 - %r9 + ; No load %rbx - %r9 + ; A[5] x A[2] + mulx rcx, rax, QWORD PTR [r9+40] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r8 + mov r10, QWORD PTR [r8] + ; A[6] x A[2] + mulx rcx, rax, QWORD PTR [r9+48] + adcx rbx, rax + adox r10, rcx + ; No store %rbx - %r9 + mov r11, QWORD PTR [r8+8] + ; A[6] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+48] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8], r10 + mov r10, QWORD PTR [r8+16] + ; A[6] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, QWORD PTR [r9+48] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+8], r11 + mov r11, QWORD PTR [r8+24] + ; A[6] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+48] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+16], r10 + ; Zero into %r8 + ; A[8] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx r10, rax, QWORD PTR [r9+64] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+24], r11 + ; Zero into %r9 + ; A[8] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx r11, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+32], r10 + ; Carry + adcx r11, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+40], r11 + ; Diagonal 4 + ; No load %rbx - %r8 + ; A[4] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+32] + adcx rsi, rax + adox rbx, rcx + ; No store %r15 - %r9 + mov r11, QWORD PTR [r8] + ; A[5] x A[3] + mulx rcx, rax, QWORD PTR [r9+40] + adcx rbx, rax + adox r11, rcx + ; No store %rbx - %r8 + mov r10, QWORD PTR [r8+8] + ; A[5] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, QWORD PTR [r9+40] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8], r11 + mov r11, QWORD PTR [r8+16] + ; A[8] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+8], r10 + mov r10, QWORD PTR [r8+24] + ; A[8] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, QWORD PTR [r9+64] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+16], r11 + mov r11, QWORD PTR [r8+32] + ; A[7] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, QWORD PTR [r9+56] + adcx r10, rax + adox r11, rcx + mov QWORD PTR [r8+24], r10 + mov r10, QWORD PTR [r8+40] + ; A[7] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, QWORD PTR [r9+56] + adcx r11, rax + adox r10, rcx + mov QWORD PTR [r8+32], r11 + ; Zero into %r9 + ; A[8] x A[6] + mulx r11, rax, QWORD PTR [r9+64] + adcx r10, rax + adox r11, r12 + mov QWORD PTR [r8+40], r10 + ; Zero into %r8 + ; A[8] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx r10, rax, QWORD PTR [r9+64] + adcx r11, rax + adox r10, r12 + mov QWORD PTR [r8+48], r11 + ; Carry + adcx r10, r13 + mov r13, r12 + adcx r13, r12 + adox r13, r12 + mov QWORD PTR [r8+56], r10 + mov QWORD PTR [r8+64], r13 + ; Double and Add in A[i] x A[i] + mov r11, QWORD PTR [rbp+8] + ; A[0] x A[0] + mov rdx, QWORD PTR [r9] + mulx rcx, rax, rdx + mov QWORD PTR [rbp], rax + adox r11, r11 + adcx r11, rcx + mov QWORD PTR [rbp+8], r11 + mov r10, QWORD PTR [rbp+16] + mov r11, QWORD PTR [rbp+24] + ; A[1] x A[1] + mov rdx, QWORD PTR [r9+8] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + 
adcx r10, rax + adcx r11, rcx + mov QWORD PTR [rbp+16], r10 + mov QWORD PTR [rbp+24], r11 + ; A[2] x A[2] + mov rdx, QWORD PTR [r9+16] + mulx rcx, rax, rdx + adox r14, r14 + adox r15, r15 + adcx r14, rax + adcx r15, rcx + ; A[3] x A[3] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rax, rdx + adox rdi, rdi + adox rsi, rsi + adcx rdi, rax + adcx rsi, rcx + mov r11, QWORD PTR [r8] + ; A[4] x A[4] + mov rdx, QWORD PTR [r9+32] + mulx rcx, rax, rdx + adox rbx, rbx + adox r11, r11 + adcx rbx, rax + adcx r11, rcx + mov QWORD PTR [r8], r11 + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + ; A[5] x A[5] + mov rdx, QWORD PTR [r9+40] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov r10, QWORD PTR [r8+24] + mov r11, QWORD PTR [r8+32] + ; A[6] x A[6] + mov rdx, QWORD PTR [r9+48] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+24], r10 + mov QWORD PTR [r8+32], r11 + mov r10, QWORD PTR [r8+40] + mov r11, QWORD PTR [r8+48] + ; A[7] x A[7] + mov rdx, QWORD PTR [r9+56] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+40], r10 + mov QWORD PTR [r8+48], r11 + mov r10, QWORD PTR [r8+56] + mov r11, QWORD PTR [r8+64] + ; A[8] x A[8] + mov rdx, QWORD PTR [r9+64] + mulx rcx, rax, rdx + adox r10, r10 + adox r11, r11 + adcx r10, rax + adcx r11, rcx + mov QWORD PTR [r8+56], r10 + mov QWORD PTR [r8+64], r11 + mov QWORD PTR [r8+-40], r14 + mov QWORD PTR [r8+-32], r15 + mov QWORD PTR [r8+-24], rdi + mov QWORD PTR [r8+-16], rsi + mov QWORD PTR [r8+-8], rbx + sub r8, 72 + cmp r9, r8 + jne L_end_521_sqr_avx2_9 + vmovdqu xmm0, OWORD PTR [rbp] + vmovups OWORD PTR [r8], xmm0 + vmovdqu xmm0, OWORD PTR [rbp+16] + vmovups OWORD PTR [r8+16], xmm0 +L_end_521_sqr_avx2_9: + add rsp, 72 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + ret +sp_521_sqr_avx2_9 ENDP +_text ENDS +ENDIF ; /* Add b to a into r. (r = a + b) ; * ; * r A single precision integer. @@ -48890,585 +49957,47 @@ sp_521_to_bin_movbe_9 PROC sp_521_to_bin_movbe_9 ENDP _text ENDS ENDIF -IFDEF HAVE_INTEL_AVX2 -; /* Multiply a and b into r. (r = a * b) +; /* Shift number right by 1 bit. (r = a >> 1) ; * -; * r Result of multiplication. -; * a First number to multiply. -; * b Second number to multiply. +; * r Result of right shift by 1. +; * a Number to shift. 
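sp_521_sqr_avx2_9 above, like the smaller sqr_avx2 variants, avoids computing each cross product twice: the "Diagonal" passes form every A[i] x A[j] with i < j once, the accumulated result is doubled, and the square terms A[i] x A[i] are then folded in. A rough C sketch of that structure for the 9-limb case follows, again using unsigned __int128 in place of the adcx/adox carry chains; the helper name is hypothetical and r must not alias a:

    #include <stdint.h>

    /* Illustrative only: r[0..17] = a[0..8]^2, cross products formed once. */
    static void sqr_9_words(uint64_t* r, const uint64_t* a)
    {
        unsigned __int128 t;
        uint64_t carry;
        int i, j;

        for (i = 0; i < 18; i++)
            r[i] = 0;

        /* Off-diagonal products a[i]*a[j], i < j, each computed once. */
        for (i = 0; i < 8; i++) {
            carry = 0;
            for (j = i + 1; j < 9; j++) {
                t = (unsigned __int128)a[i] * a[j] + r[i + j] + carry;
                r[i + j] = (uint64_t)t;
                carry    = (uint64_t)(t >> 64);
            }
            r[i + 9] = carry;
        }

        /* Double the off-diagonal part (it cannot overflow the 18 limbs). */
        carry = 0;
        for (i = 0; i < 18; i++) {
            uint64_t hi = r[i] >> 63;
            r[i] = (r[i] << 1) | carry;
            carry = hi;
        }

        /* Add in the diagonal squares a[i]*a[i]. */
        carry = 0;
        for (i = 0; i < 9; i++) {
            t = (unsigned __int128)a[i] * a[i] + r[2 * i] + carry;
            r[2 * i] = (uint64_t)t;
            t = (t >> 64) + r[2 * i + 1];
            r[2 * i + 1] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
    }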
; */ _text SEGMENT READONLY PARA -sp_521_mul_avx2_9 PROC - push rbx - push rbp +sp_521_rshift_9 PROC push r12 - push r13 - push r14 - push r15 - mov rbp, r8 - mov r8, rcx - mov r9, rdx - sub rsp, 72 - cmp r9, r8 - mov rbx, rsp - cmovne rbx, r8 - cmp rbp, r8 - cmove rbx, rsp - add r8, 72 - xor r15, r15 - mov rdx, QWORD PTR [r9] - ; A[0] * B[0] - mulx r11, r10, QWORD PTR [rbp] - ; A[0] * B[1] - mulx r12, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx], r10 - adcx r11, rax - ; A[0] * B[2] - mulx r13, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+8], r11 - adcx r12, rax - mov QWORD PTR [rbx+16], r12 - ; A[0] * B[3] - mulx r10, rax, QWORD PTR [rbp+24] - adcx r13, rax - ; A[0] * B[4] - mulx r11, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+24], r13 - adcx r10, rax - ; A[0] * B[5] - mulx r12, rax, QWORD PTR [rbp+40] - mov QWORD PTR [rbx+32], r10 - adcx r11, rax - mov QWORD PTR [rbx+40], r11 - ; A[0] * B[6] - mulx r13, rax, QWORD PTR [rbp+48] - adcx r12, rax - ; A[0] * B[7] - mulx r10, rax, QWORD PTR [rbp+56] - mov QWORD PTR [rbx+48], r12 - adcx r13, rax - ; A[0] * B[8] - mulx r11, rax, QWORD PTR [rbp+64] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adcx r11, r15 - mov r14, r15 - adcx r14, r15 - mov QWORD PTR [rbx+64], r10 - mov QWORD PTR [r8], r11 - mov rdx, QWORD PTR [r9+8] - mov r11, QWORD PTR [rbx+8] - mov r12, QWORD PTR [rbx+16] - mov r13, QWORD PTR [rbx+24] - mov r10, QWORD PTR [rbx+32] - ; A[1] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r11, rax - adox r12, rcx - ; A[1] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+8], r11 - adcx r12, rax - adox r13, rcx - ; A[1] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+16], r12 - adcx r13, rax - adox r10, rcx - mov QWORD PTR [rbx+24], r13 - mov r11, QWORD PTR [rbx+40] - mov r12, QWORD PTR [rbx+48] - mov r13, QWORD PTR [rbx+56] - ; A[1] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r10, rax - adox r11, rcx - ; A[1] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+32], r10 - adcx r11, rax - adox r12, rcx - ; A[1] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [rbx+40], r11 - adcx r12, rax - adox r13, rcx - mov QWORD PTR [rbx+48], r12 - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - ; A[1] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r13, rax - adox r10, rcx - ; A[1] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adox r11, rcx - ; A[1] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [rbx+64], r10 - mov r12, r15 - adcx r11, rax - adox r12, rcx - adcx r12, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8], r11 - mov QWORD PTR [r8+8], r12 - mov rdx, QWORD PTR [r9+16] - mov r12, QWORD PTR [rbx+16] - mov r13, QWORD PTR [rbx+24] - mov r10, QWORD PTR [rbx+32] - mov r11, QWORD PTR [rbx+40] - ; A[2] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r12, rax - adox r13, rcx - ; A[2] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+16], r12 - adcx r13, rax - adox r10, rcx - ; A[2] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+24], r13 - adcx r10, rax - adox r11, rcx - mov QWORD PTR [rbx+32], r10 - mov r12, QWORD PTR [rbx+48] - mov r13, QWORD PTR [rbx+56] - mov r10, QWORD PTR [rbx+64] - ; A[2] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r11, rax - adox r12, rcx - ; A[2] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+40], r11 - adcx r12, rax - adox r13, rcx - ; A[2] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [rbx+48], r12 - adcx r13, rax - 
adox r10, rcx - mov QWORD PTR [rbx+56], r13 - mov r11, QWORD PTR [r8] - mov r12, QWORD PTR [r8+8] - ; A[2] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r10, rax - adox r11, rcx - ; A[2] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [rbx+64], r10 - adcx r11, rax - adox r12, rcx - ; A[2] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8], r11 - mov r13, r15 - adcx r12, rax - adox r13, rcx - adcx r13, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+8], r12 - mov QWORD PTR [r8+16], r13 - mov rdx, QWORD PTR [r9+24] - mov r13, QWORD PTR [rbx+24] - mov r10, QWORD PTR [rbx+32] - mov r11, QWORD PTR [rbx+40] - mov r12, QWORD PTR [rbx+48] - ; A[3] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r13, rax - adox r10, rcx - ; A[3] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+24], r13 - adcx r10, rax - adox r11, rcx - ; A[3] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+32], r10 - adcx r11, rax - adox r12, rcx - mov QWORD PTR [rbx+40], r11 - mov r13, QWORD PTR [rbx+56] - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - ; A[3] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r12, rax - adox r13, rcx - ; A[3] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+48], r12 - adcx r13, rax - adox r10, rcx - ; A[3] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adox r11, rcx - mov QWORD PTR [rbx+64], r10 - mov r12, QWORD PTR [r8+8] - mov r13, QWORD PTR [r8+16] - ; A[3] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r11, rax - adox r12, rcx - ; A[3] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8], r11 - adcx r12, rax - adox r13, rcx - ; A[3] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+8], r12 - mov r10, r15 - adcx r13, rax - adox r10, rcx - adcx r10, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+16], r13 - mov QWORD PTR [r8+24], r10 - mov rdx, QWORD PTR [r9+32] - mov r10, QWORD PTR [rbx+32] - mov r11, QWORD PTR [rbx+40] - mov r12, QWORD PTR [rbx+48] - mov r13, QWORD PTR [rbx+56] - ; A[4] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r10, rax - adox r11, rcx - ; A[4] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+32], r10 - adcx r11, rax - adox r12, rcx - ; A[4] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+40], r11 - adcx r12, rax - adox r13, rcx - mov QWORD PTR [rbx+48], r12 - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - mov r12, QWORD PTR [r8+8] - ; A[4] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r13, rax - adox r10, rcx - ; A[4] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adox r11, rcx - ; A[4] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [rbx+64], r10 - adcx r11, rax - adox r12, rcx - mov QWORD PTR [r8], r11 - mov r13, QWORD PTR [r8+16] - mov r10, QWORD PTR [r8+24] - ; A[4] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r12, rax - adox r13, rcx - ; A[4] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8+8], r12 - adcx r13, rax - adox r10, rcx - ; A[4] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+16], r13 - mov r11, r15 - adcx r10, rax - adox r11, rcx - adcx r11, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+24], r10 - mov QWORD PTR [r8+32], r11 - mov rdx, QWORD PTR [r9+40] - mov r11, QWORD PTR [rbx+40] - mov r12, QWORD PTR [rbx+48] - mov r13, QWORD PTR [rbx+56] - mov r10, QWORD PTR [rbx+64] - ; A[5] * B[0] - mulx rcx, rax, QWORD 
PTR [rbp] - adcx r11, rax - adox r12, rcx - ; A[5] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+40], r11 - adcx r12, rax - adox r13, rcx - ; A[5] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+48], r12 - adcx r13, rax - adox r10, rcx - mov QWORD PTR [rbx+56], r13 - mov r11, QWORD PTR [r8] - mov r12, QWORD PTR [r8+8] - mov r13, QWORD PTR [r8+16] - ; A[5] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r10, rax - adox r11, rcx - ; A[5] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [rbx+64], r10 - adcx r11, rax - adox r12, rcx - ; A[5] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [r8], r11 - adcx r12, rax - adox r13, rcx - mov QWORD PTR [r8+8], r12 - mov r10, QWORD PTR [r8+24] - mov r11, QWORD PTR [r8+32] - ; A[5] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r13, rax - adox r10, rcx - ; A[5] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8+16], r13 - adcx r10, rax - adox r11, rcx - ; A[5] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+24], r10 - mov r12, r15 - adcx r11, rax - adox r12, rcx - adcx r12, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+32], r11 - mov QWORD PTR [r8+40], r12 - mov rdx, QWORD PTR [r9+48] - mov r12, QWORD PTR [rbx+48] - mov r13, QWORD PTR [rbx+56] - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - ; A[6] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r12, rax - adox r13, rcx - ; A[6] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+48], r12 - adcx r13, rax - adox r10, rcx - ; A[6] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adox r11, rcx - mov QWORD PTR [rbx+64], r10 - mov r12, QWORD PTR [r8+8] - mov r13, QWORD PTR [r8+16] - mov r10, QWORD PTR [r8+24] - ; A[6] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r11, rax - adox r12, rcx - ; A[6] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [r8], r11 - adcx r12, rax - adox r13, rcx - ; A[6] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [r8+8], r12 - adcx r13, rax - adox r10, rcx - mov QWORD PTR [r8+16], r13 - mov r11, QWORD PTR [r8+32] - mov r12, QWORD PTR [r8+40] - ; A[6] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r10, rax - adox r11, rcx - ; A[6] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8+24], r10 - adcx r11, rax - adox r12, rcx - ; A[6] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+32], r11 - mov r13, r15 - adcx r12, rax - adox r13, rcx - adcx r13, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+40], r12 - mov QWORD PTR [r8+48], r13 - mov rdx, QWORD PTR [r9+56] - mov r13, QWORD PTR [rbx+56] - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - mov r12, QWORD PTR [r8+8] - ; A[7] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r13, rax - adox r10, rcx - ; A[7] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+56], r13 - adcx r10, rax - adox r11, rcx - ; A[7] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [rbx+64], r10 - adcx r11, rax - adox r12, rcx - mov QWORD PTR [r8], r11 - mov r13, QWORD PTR [r8+16] - mov r10, QWORD PTR [r8+24] - mov r11, QWORD PTR [r8+32] - ; A[7] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r12, rax - adox r13, rcx - ; A[7] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [r8+8], r12 - adcx r13, rax - adox r10, rcx - ; A[7] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [r8+16], r13 - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+24], r10 - mov r12, QWORD PTR 
[r8+40] - mov r13, QWORD PTR [r8+48] - ; A[7] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r11, rax - adox r12, rcx - ; A[7] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8+32], r11 - adcx r12, rax - adox r13, rcx - ; A[7] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+40], r12 - mov r10, r15 - adcx r13, rax - adox r10, rcx - adcx r10, r14 - mov r14, r15 - adox r14, r15 - adcx r14, r15 - mov QWORD PTR [r8+48], r13 - mov QWORD PTR [r8+56], r10 - mov rdx, QWORD PTR [r9+64] - mov r10, QWORD PTR [rbx+64] - mov r11, QWORD PTR [r8] - mov r12, QWORD PTR [r8+8] - mov r13, QWORD PTR [r8+16] - ; A[8] * B[0] - mulx rcx, rax, QWORD PTR [rbp] - adcx r10, rax - adox r11, rcx - ; A[8] * B[1] - mulx rcx, rax, QWORD PTR [rbp+8] - mov QWORD PTR [rbx+64], r10 - adcx r11, rax - adox r12, rcx - ; A[8] * B[2] - mulx rcx, rax, QWORD PTR [rbp+16] - mov QWORD PTR [r8], r11 - adcx r12, rax - adox r13, rcx - mov QWORD PTR [r8+8], r12 - mov r10, QWORD PTR [r8+24] - mov r11, QWORD PTR [r8+32] - mov r12, QWORD PTR [r8+40] - ; A[8] * B[3] - mulx rcx, rax, QWORD PTR [rbp+24] - adcx r13, rax - adox r10, rcx - ; A[8] * B[4] - mulx rcx, rax, QWORD PTR [rbp+32] - mov QWORD PTR [r8+16], r13 - adcx r10, rax - adox r11, rcx - ; A[8] * B[5] - mulx rcx, rax, QWORD PTR [rbp+40] - mov QWORD PTR [r8+24], r10 - adcx r11, rax - adox r12, rcx - mov QWORD PTR [r8+32], r11 - mov r13, QWORD PTR [r8+48] - mov r10, QWORD PTR [r8+56] - ; A[8] * B[6] - mulx rcx, rax, QWORD PTR [rbp+48] - adcx r12, rax - adox r13, rcx - ; A[8] * B[7] - mulx rcx, rax, QWORD PTR [rbp+56] - mov QWORD PTR [r8+40], r12 - adcx r13, rax - adox r10, rcx - ; A[8] * B[8] - mulx rcx, rax, QWORD PTR [rbp+64] - mov QWORD PTR [r8+48], r13 - mov r11, r15 - adcx r10, rax - adox r11, rcx - adcx r11, r14 - mov QWORD PTR [r8+56], r10 - mov QWORD PTR [r8+64], r11 - sub r8, 72 - cmp r9, r8 - je L_start_521_mul_avx2_9 - cmp rbp, r8 - jne L_end_521_mul_avx2_9 -L_start_521_mul_avx2_9: - vmovdqu xmm0, OWORD PTR [rbx] - vmovups OWORD PTR [r8], xmm0 - vmovdqu xmm0, OWORD PTR [rbx+16] - vmovups OWORD PTR [r8+16], xmm0 - vmovdqu xmm0, OWORD PTR [rbx+32] - vmovups OWORD PTR [r8+32], xmm0 - vmovdqu xmm0, OWORD PTR [rbx+48] - vmovups OWORD PTR [r8+48], xmm0 - mov rax, QWORD PTR [rbx+64] - mov QWORD PTR [r8+64], rax -L_end_521_mul_avx2_9: - add rsp, 72 - pop r15 - pop r14 - pop r13 + mov rcx, r8 + mov rax, rcx + mov r8, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + mov r12, QWORD PTR [rdx+32] + shrd r8, r9, cl + shrd r9, r10, cl + shrd r10, r11, cl + shrd r11, r12, cl + mov QWORD PTR [rax], r8 + mov QWORD PTR [rax+8], r9 + mov QWORD PTR [rax+16], r10 + mov QWORD PTR [rax+24], r11 + mov r9, QWORD PTR [rdx+40] + mov r10, QWORD PTR [rdx+48] + mov r11, QWORD PTR [rdx+56] + mov r8, QWORD PTR [rdx+64] + shrd r12, r9, cl + shrd r9, r10, cl + shrd r10, r11, cl + shrd r11, r8, cl + mov QWORD PTR [rax+32], r12 + mov QWORD PTR [rax+40], r9 + mov QWORD PTR [rax+48], r10 + mov QWORD PTR [rax+56], r11 + shr r8, cl + mov QWORD PTR [rax+64], r8 pop r12 - pop rbp - pop rbx ret -sp_521_mul_avx2_9 ENDP +sp_521_rshift_9 ENDP _text ENDS -ENDIF ; /* Shift number left by n bit. (r = a << n) ; * ; * r Result of left shift by n. @@ -49590,47 +50119,6 @@ sp_521_lshift_18 PROC ret sp_521_lshift_18 ENDP _text ENDS -; /* Shift number right by 1 bit. (r = a >> 1) -; * -; * r Result of right shift by 1. -; * a Number to shift. 
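The hunk above introduces sp_521_rshift_9: a right shift of a 9-limb (64-bit limb) number by a variable bit count, built from shrd so that each result limb pulls its high bits in from the next limb, in two register batches (limbs 0-3, then 4-8). This is the primitive the commit message refers to when it says hashes longer than 521 bits are now reduced at the bit level instead of the old byte-level handling. The C below is an illustrative sketch only, not the wolfSSL implementation; the name rshift_9_sketch is made up here, and it assumes a shift count strictly between 0 and 64.

/* Hedged illustration only -- not the wolfSSL code. Shift a 9-limb
 * little-endian number right by n bits (assumes 0 < n < 64), the same
 * operation the shrd-based sp_521_rshift_9 above performs: each result
 * limb takes its low bits from a[i] and its high bits from a[i+1]. */
#include <stdint.h>

static void rshift_9_sketch(uint64_t r[9], const uint64_t a[9], unsigned n)
{
    int i;
    for (i = 0; i < 8; i++) {
        r[i] = (a[i] >> n) | (a[i + 1] << (64 - n));
    }
    r[8] = a[8] >> n;   /* nothing above the top limb */
}

In the assembly, the same effect is obtained without a temporary array: shrd r8, r9, cl writes (a[i] >> n) | (a[i+1] << (64-n)) directly into the register holding a[i], and the final plain shr handles the top limb.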
-; */ -_text SEGMENT READONLY PARA -sp_521_rshift_9 PROC - push r12 - mov rcx, r8 - mov rax, rcx - mov r8, QWORD PTR [rdx] - mov r9, QWORD PTR [rdx+8] - mov r10, QWORD PTR [rdx+16] - mov r11, QWORD PTR [rdx+24] - mov r12, QWORD PTR [rdx+32] - shrd r8, r9, cl - shrd r9, r10, cl - shrd r10, r11, cl - shrd r11, r12, cl - mov QWORD PTR [rax], r8 - mov QWORD PTR [rax+8], r9 - mov QWORD PTR [rax+16], r10 - mov QWORD PTR [rax+24], r11 - mov r9, QWORD PTR [rdx+40] - mov r10, QWORD PTR [rdx+48] - mov r11, QWORD PTR [rdx+56] - mov r8, QWORD PTR [rdx+64] - shrd r12, r9, cl - shrd r9, r10, cl - shrd r10, r11, cl - shrd r11, r8, cl - mov QWORD PTR [rax+32], r12 - mov QWORD PTR [rax+40], r9 - mov QWORD PTR [rax+48], r10 - mov QWORD PTR [rax+56], r11 - shr r8, cl - mov QWORD PTR [rax+64], r8 - pop r12 - ret -sp_521_rshift_9 ENDP -_text ENDS ; /* Sub b from a into a. (a -= b) ; * ; * a A single precision integer and result. @@ -49845,405 +50333,6 @@ div_521_word_asm_9 PROC div_521_word_asm_9 ENDP _text ENDS ENDIF -IFDEF HAVE_INTEL_AVX2 -; /* Square a and put result in r. (r = a * a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_521_sqr_avx2_9 PROC - push rbp - push r12 - push r13 - push r14 - push r15 - push rdi - push rsi - push rbx - mov r8, rcx - mov r9, rdx - sub rsp, 72 - cmp r9, r8 - mov rbp, rsp - cmovne rbp, r8 - add r8, 72 - xor r12, r12 - ; Diagonal 1 - ; Zero into %r9 - ; A[1] x A[0] - mov rdx, QWORD PTR [r9] - mulx r11, r10, QWORD PTR [r9+8] - mov QWORD PTR [rbp+8], r10 - ; Zero into %r8 - ; A[2] x A[0] - mulx r10, rax, QWORD PTR [r9+16] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [rbp+16], r11 - ; No load %r12 - %r9 - ; A[3] x A[0] - mulx r14, rax, QWORD PTR [r9+24] - adcx r10, rax - adox r14, r12 - mov QWORD PTR [rbp+24], r10 - ; No load %r13 - %r8 - ; A[4] x A[0] - mulx r15, rax, QWORD PTR [r9+32] - adcx r14, rax - adox r15, r12 - ; No store %r12 - %r9 - ; No load %r14 - %r9 - ; A[5] x A[0] - mulx rdi, rax, QWORD PTR [r9+40] - adcx r15, rax - adox rdi, r12 - ; No store %r13 - %r8 - ; No load %r15 - %r8 - ; A[6] x A[0] - mulx rsi, rax, QWORD PTR [r9+48] - adcx rdi, rax - adox rsi, r12 - ; No store %r14 - %r9 - ; No load %rbx - %r9 - ; A[7] x A[0] - mulx rbx, rax, QWORD PTR [r9+56] - adcx rsi, rax - adox rbx, r12 - ; No store %r15 - %r8 - ; Zero into %r8 - ; A[8] x A[0] - mulx r10, rax, QWORD PTR [r9+64] - adcx rbx, rax - adox r10, r12 - ; No store %rbx - %r9 - ; Zero into %r9 - ; A[8] x A[1] - mov rdx, QWORD PTR [r9+8] - mulx r11, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8], r10 - ; Carry - adcx r11, r12 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+8], r11 - ; Diagonal 2 - mov r11, QWORD PTR [rbp+24] - ; No load %r12 - %r8 - ; A[2] x A[1] - mulx rcx, rax, QWORD PTR [r9+16] - adcx r11, rax - adox r14, rcx - mov QWORD PTR [rbp+24], r11 - ; No load %r13 - %r9 - ; A[3] x A[1] - mulx rcx, rax, QWORD PTR [r9+24] - adcx r14, rax - adox r15, rcx - ; No store %r12 - %r8 - ; No load %r14 - %r8 - ; A[4] x A[1] - mulx rcx, rax, QWORD PTR [r9+32] - adcx r15, rax - adox rdi, rcx - ; No store %r13 - %r9 - ; No load %r15 - %r9 - ; A[5] x A[1] - mulx rcx, rax, QWORD PTR [r9+40] - adcx rdi, rax - adox rsi, rcx - ; No store %r14 - %r8 - ; No load %rbx - %r8 - ; A[6] x A[1] - mulx rcx, rax, QWORD PTR [r9+48] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r9 - mov r11, QWORD PTR [r8] - ; A[7] x A[1] - mulx rcx, rax, QWORD PTR [r9+56] - adcx rbx, rax - adox r11, rcx - ; No store %rbx 
- %r8 - mov r10, QWORD PTR [r8+8] - ; A[7] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8], r11 - ; Zero into %r9 - ; A[7] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx r11, rax, QWORD PTR [r9+56] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+8], r10 - ; Zero into %r8 - ; A[7] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx r10, rax, QWORD PTR [r9+56] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+16], r11 - ; Carry - adcx r10, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+24], r10 - ; Diagonal 3 - ; No load %r14 - %r9 - ; A[3] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx rcx, rax, QWORD PTR [r9+24] - adcx r15, rax - adox rdi, rcx - ; No store %r13 - %r8 - ; No load %r15 - %r8 - ; A[4] x A[2] - mulx rcx, rax, QWORD PTR [r9+32] - adcx rdi, rax - adox rsi, rcx - ; No store %r14 - %r9 - ; No load %rbx - %r9 - ; A[5] x A[2] - mulx rcx, rax, QWORD PTR [r9+40] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r8 - mov r10, QWORD PTR [r8] - ; A[6] x A[2] - mulx rcx, rax, QWORD PTR [r9+48] - adcx rbx, rax - adox r10, rcx - ; No store %rbx - %r9 - mov r11, QWORD PTR [r8+8] - ; A[6] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+48] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8], r10 - mov r10, QWORD PTR [r8+16] - ; A[6] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, QWORD PTR [r9+48] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+8], r11 - mov r11, QWORD PTR [r8+24] - ; A[6] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+48] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+16], r10 - ; Zero into %r8 - ; A[8] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx r10, rax, QWORD PTR [r9+64] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+24], r11 - ; Zero into %r9 - ; A[8] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx r11, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+32], r10 - ; Carry - adcx r11, r13 - mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+40], r11 - ; Diagonal 4 - ; No load %rbx - %r8 - ; A[4] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+32] - adcx rsi, rax - adox rbx, rcx - ; No store %r15 - %r9 - mov r11, QWORD PTR [r8] - ; A[5] x A[3] - mulx rcx, rax, QWORD PTR [r9+40] - adcx rbx, rax - adox r11, rcx - ; No store %rbx - %r8 - mov r10, QWORD PTR [r8+8] - ; A[5] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, QWORD PTR [r9+40] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8], r11 - mov r11, QWORD PTR [r8+16] - ; A[8] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+8], r10 - mov r10, QWORD PTR [r8+24] - ; A[8] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, QWORD PTR [r9+64] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+16], r11 - mov r11, QWORD PTR [r8+32] - ; A[7] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r10, rax - adox r11, rcx - mov QWORD PTR [r8+24], r10 - mov r10, QWORD PTR [r8+40] - ; A[7] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, QWORD PTR [r9+56] - adcx r11, rax - adox r10, rcx - mov QWORD PTR [r8+32], r11 - ; Zero into %r9 - ; A[8] x A[6] - mulx r11, rax, QWORD PTR [r9+64] - adcx r10, rax - adox r11, r12 - mov QWORD PTR [r8+40], r10 - ; Zero into %r8 - ; A[8] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx r10, rax, QWORD PTR [r9+64] - adcx r11, rax - adox r10, r12 - mov QWORD PTR [r8+48], r11 - ; Carry - adcx r10, r13 - 
mov r13, r12 - adcx r13, r12 - adox r13, r12 - mov QWORD PTR [r8+56], r10 - mov QWORD PTR [r8+64], r13 - ; Double and Add in A[i] x A[i] - mov r11, QWORD PTR [rbp+8] - ; A[0] x A[0] - mov rdx, QWORD PTR [r9] - mulx rcx, rax, rdx - mov QWORD PTR [rbp], rax - adox r11, r11 - adcx r11, rcx - mov QWORD PTR [rbp+8], r11 - mov r10, QWORD PTR [rbp+16] - mov r11, QWORD PTR [rbp+24] - ; A[1] x A[1] - mov rdx, QWORD PTR [r9+8] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [rbp+16], r10 - mov QWORD PTR [rbp+24], r11 - ; A[2] x A[2] - mov rdx, QWORD PTR [r9+16] - mulx rcx, rax, rdx - adox r14, r14 - adox r15, r15 - adcx r14, rax - adcx r15, rcx - ; A[3] x A[3] - mov rdx, QWORD PTR [r9+24] - mulx rcx, rax, rdx - adox rdi, rdi - adox rsi, rsi - adcx rdi, rax - adcx rsi, rcx - mov r11, QWORD PTR [r8] - ; A[4] x A[4] - mov rdx, QWORD PTR [r9+32] - mulx rcx, rax, rdx - adox rbx, rbx - adox r11, r11 - adcx rbx, rax - adcx r11, rcx - mov QWORD PTR [r8], r11 - mov r10, QWORD PTR [r8+8] - mov r11, QWORD PTR [r8+16] - ; A[5] x A[5] - mov rdx, QWORD PTR [r9+40] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+8], r10 - mov QWORD PTR [r8+16], r11 - mov r10, QWORD PTR [r8+24] - mov r11, QWORD PTR [r8+32] - ; A[6] x A[6] - mov rdx, QWORD PTR [r9+48] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+24], r10 - mov QWORD PTR [r8+32], r11 - mov r10, QWORD PTR [r8+40] - mov r11, QWORD PTR [r8+48] - ; A[7] x A[7] - mov rdx, QWORD PTR [r9+56] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+40], r10 - mov QWORD PTR [r8+48], r11 - mov r10, QWORD PTR [r8+56] - mov r11, QWORD PTR [r8+64] - ; A[8] x A[8] - mov rdx, QWORD PTR [r9+64] - mulx rcx, rax, rdx - adox r10, r10 - adox r11, r11 - adcx r10, rax - adcx r11, rcx - mov QWORD PTR [r8+56], r10 - mov QWORD PTR [r8+64], r11 - mov QWORD PTR [r8+-40], r14 - mov QWORD PTR [r8+-32], r15 - mov QWORD PTR [r8+-24], rdi - mov QWORD PTR [r8+-16], rsi - mov QWORD PTR [r8+-8], rbx - sub r8, 72 - cmp r9, r8 - jne L_end_521_sqr_avx2_9 - vmovdqu xmm0, OWORD PTR [rbp] - vmovups OWORD PTR [r8], xmm0 - vmovdqu xmm0, OWORD PTR [rbp+16] - vmovups OWORD PTR [r8+16], xmm0 -L_end_521_sqr_avx2_9: - add rsp, 72 - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - ret -sp_521_sqr_avx2_9 ENDP -_text ENDS -ENDIF ; /* Shift number right by 1 bit. (r = a >> 1) ; * ; * r Result of right shift by 1.
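The sp_521_mul_avx2_9 and sp_521_sqr_avx2_9 routines being relocated in these hunks both follow a product-scanning (column-wise) pattern built on mulx with the adcx/adox dual carry chains; the square variant computes each off-diagonal product a[i]*a[j] once, then doubles the accumulated sum while folding in the diagonal squares (the "Double and Add in A[i] x A[i]" sweep visible above). The C below is an illustrative sketch of that structure only, not the generated code; the names sqr_9_sketch and acc_add are made up here, and it assumes a compiler providing unsigned __int128 (e.g. GCC/Clang on x86_64).

/* Hedged illustration only -- not the generated AVX2 code. Product-scanning
 * square of a 9-limb number: off-diagonal products count twice, diagonal
 * squares once, accumulated one output column at a time. */
#include <stdint.h>

typedef unsigned __int128 u128;

/* add one 128-bit product into a 192-bit column accumulator (c0, c1, c2) */
static void acc_add(uint64_t *c0, uint64_t *c1, uint64_t *c2, u128 p)
{
    u128 t = (u128)*c0 + (uint64_t)p;
    *c0 = (uint64_t)t;
    t = (u128)*c1 + (uint64_t)(p >> 64) + (uint64_t)(t >> 64);
    *c1 = (uint64_t)t;
    *c2 += (uint64_t)(t >> 64);
}

static void sqr_9_sketch(uint64_t r[18], const uint64_t a[9])
{
    uint64_t c0 = 0, c1 = 0, c2 = 0;
    int i, k;

    for (k = 0; k <= 16; k++) {               /* one output column at a time */
        int lo = (k <= 8) ? 0 : (k - 8);
        for (i = lo; i <= k - i; i++) {
            u128 p = (u128)a[i] * a[k - i];
            acc_add(&c0, &c1, &c2, p);
            if (i != k - i)                   /* off-diagonal terms appear twice */
                acc_add(&c0, &c1, &c2, p);
        }
        r[k] = c0;                            /* emit this column, shift the rest down */
        c0 = c1;
        c1 = c2;
        c2 = 0;
    }
    r[17] = c0;                               /* final carry limb */
}

The sketch doubles each off-diagonal product as it is added; the assembly instead sums the off-diagonal half first and performs the doubling in one pass at the end, which keeps both carry chains (adcx and adox) busy on independent data throughout the diagonals.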