From 8fb88a502d5de361ff8b400a7cf46e48b9c57701 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 26 May 2026 16:30:40 +1000 Subject: [PATCH] SP x86_64: save and restore vector registers Add save and restore vector registers calls to SP code for x86_64. --- wolfcrypt/src/sp_x86_64.c | 467 ++++++++++++++++++++++++++------------ 1 file changed, 323 insertions(+), 144 deletions(-) diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 1f7cd9e396..37a4bac232 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -1968,7 +1968,8 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, /* r = a ^ 0x10000 => r = a squared 16 times */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i = 15; i >= 0; i--) { sp_2048_mont_sqr_avx2_32(r, r, m, mp); } @@ -1976,6 +1977,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m */ sp_2048_mont_mul_avx2_32(r, r, ah, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2000,7 +2002,8 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { if (err == MP_OKAY) { sp_2048_sqr_avx2_32(r, ah); err = sp_2048_mod_32_cond(r, r, m); @@ -2009,6 +2012,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, sp_2048_mul_avx2_32(r, ah, r); err = sp_2048_mod_32_cond(r, r, m); } + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2043,7 +2047,8 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, XMEMCPY(r, a, sizeof(sp_digit) * 32); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i--; i>=0; i--) { sp_2048_mont_sqr_avx2_32(r, r, m, mp); if (((e >> i) & 1) == 1) { @@ -2052,6 +2057,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } XMEMSET(&r[32], 0, sizeof(sp_digit) * 32); sp_2048_mont_reduce_avx2_32(r, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2253,8 +2259,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_16(tmpa, a, dp, 1024, p, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2264,8 +2271,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, sp_2048_from_mp(dq, 16, dqm); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_16(tmpb, a, dq, 1024, q, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2276,9 +2284,10 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, c = sp_2048_sub_in_place_16(tmpa, tmpb); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { c += sp_2048_cond_add_avx2_16(tmpa, tmpa, p, c); sp_2048_cond_add_avx2_16(tmpa, tmpa, p, c); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2290,8 +2299,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, sp_2048_from_mp(qi, 16, qim); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_2048_mul_avx2_16(tmpa, tmpa, qi); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2304,8 +2314,9 @@ int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_2048_mul_avx2_16(tmpa, q, tmpa); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2445,8 +2456,9 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_32(r, b, e, expBits, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2767,8 +2779,10 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, if (base->used == 1 && base->dp[0] == 2 && m[31] == (sp_digit)-1) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_2_avx2_32(r, e, (int)expLen * 8, m); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2779,8 +2793,10 @@ int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_32(r, b, e, (int)expLen * 8, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -2854,8 +2870,9 @@ int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_16(r, b, e, expBits, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4656,7 +4673,8 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, /* r = a ^ 0x10000 => r = a squared 16 times */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i = 15; i >= 0; i--) { sp_3072_mont_sqr_avx2_48(r, r, m, mp); } @@ -4664,6 +4682,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m */ sp_3072_mont_mul_avx2_48(r, r, ah, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4688,7 +4707,8 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { if (err == MP_OKAY) { sp_3072_sqr_avx2_48(r, ah); err = sp_3072_mod_48_cond(r, r, m); @@ -4697,6 +4717,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, sp_3072_mul_avx2_48(r, ah, r); err = sp_3072_mod_48_cond(r, r, m); } + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4731,7 +4752,8 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, XMEMCPY(r, a, sizeof(sp_digit) * 48); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i--; i>=0; i--) { sp_3072_mont_sqr_avx2_48(r, r, m, mp); if (((e >> i) & 1) == 1) { @@ -4740,6 +4762,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } XMEMSET(&r[48], 0, sizeof(sp_digit) * 48); sp_3072_mont_reduce_avx2_48(r, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4941,8 +4964,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_avx2_24(tmpa, a, dp, 1536, p, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4952,8 +4976,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, sp_3072_from_mp(dq, 24, dqm); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_avx2_24(tmpb, a, dq, 1536, q, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4964,9 +4989,10 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, c = sp_3072_sub_in_place_24(tmpa, tmpb); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { c += sp_3072_cond_add_avx2_24(tmpa, tmpa, p, c); sp_3072_cond_add_avx2_24(tmpa, tmpa, p, c); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4978,8 +5004,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, sp_3072_from_mp(qi, 24, qim); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_3072_mul_avx2_24(tmpa, tmpa, qi); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -4992,8 +5019,9 @@ int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_3072_mul_avx2_24(tmpa, q, tmpa); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -5133,8 +5161,9 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_avx2_48(r, b, e, expBits, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -5455,8 +5484,10 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, if (base->used == 1 && base->dp[0] == 2 && m[47] == (sp_digit)-1) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_2_avx2_48(r, e, (int)expLen * 8, m); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -5467,8 +5498,10 @@ int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_avx2_48(r, b, e, (int)expLen * 8, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -5542,8 +5575,9 @@ int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_3072_mod_exp_avx2_24(r, b, e, expBits, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6600,7 +6634,8 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, /* r = a ^ 0x10000 => r = a squared 16 times */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i = 15; i >= 0; i--) { sp_4096_mont_sqr_avx2_64(r, r, m, mp); } @@ -6608,6 +6643,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m */ sp_4096_mont_mul_avx2_64(r, r, ah, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6632,7 +6668,8 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, else if (e == 0x3) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { if (err == MP_OKAY) { sp_4096_sqr_avx2_64(r, ah); err = sp_4096_mod_64_cond(r, r, m); @@ -6641,6 +6678,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, sp_4096_mul_avx2_64(r, ah, r); err = sp_4096_mod_64_cond(r, r, m); } + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6675,7 +6713,8 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, XMEMCPY(r, a, sizeof(sp_digit) * 64); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { for (i--; i>=0; i--) { sp_4096_mont_sqr_avx2_64(r, r, m, mp); if (((e >> i) & 1) == 1) { @@ -6684,6 +6723,7 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } XMEMSET(&r[64], 0, sizeof(sp_digit) * 64); sp_4096_mont_reduce_avx2_64(r, m, mp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6885,8 +6925,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_32(tmpa, a, dp, 2048, p, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6896,8 +6937,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, sp_4096_from_mp(dq, 32, dqm); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_2048_mod_exp_avx2_32(tmpb, a, dq, 2048, q, 1); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6908,9 +6950,10 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, c = sp_2048_sub_in_place_32(tmpa, tmpb); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { c += sp_4096_cond_add_avx2_32(tmpa, tmpa, p, c); sp_4096_cond_add_avx2_32(tmpa, tmpa, p, c); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6922,8 +6965,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, sp_2048_from_mp(qi, 32, qim); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_2048_mul_avx2_32(tmpa, tmpa, qi); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -6936,8 +6980,9 @@ int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_2048_mul_avx2_32(tmpa, q, tmpa); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -7077,8 +7122,9 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_4096_mod_exp_avx2_64(r, b, e, expBits, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -7399,8 +7445,10 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, if (base->used == 1 && base->dp[0] == 2 && m[63] == (sp_digit)-1) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_4096_mod_exp_2_avx2_64(r, e, (int)expLen * 8, m); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -7411,8 +7459,10 @@ int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_4096_mod_exp_avx2_64(r, b, e, (int)expLen * 8, m, 0); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -10872,8 +10922,9 @@ int sp_ecc_mulmod_256(const mp_int* km, const ecc_point* gm, ecc_point* r, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(point, point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -10935,8 +10986,9 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(point, point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -10945,8 +10997,9 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_add_avx2_4(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -10955,8 +11008,10 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_map_avx2_4(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23624,8 +23679,9 @@ int sp_ecc_mulmod_base_256(const mp_int* km, ecc_point* r, int map, void* heap) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23685,8 +23741,9 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23695,8 +23752,9 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_add_avx2_4(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23705,8 +23763,10 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_map_avx2_4(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23847,8 +23907,9 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(point, k, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -23859,9 +23920,10 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(infinity, point, p256_order, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24035,8 +24097,9 @@ int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out, sp_256_point_from_ecc_point_4(point, pub); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(point, point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24781,8 +24844,9 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, /* Conv k to Montgomery form (mod order) */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mul_avx2_4(k, k, p256_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24794,8 +24858,9 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, /* kInv = 1/k mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_inv_order_avx2_4(kInv, k, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24805,8 +24870,9 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = r * x + e */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mul_avx2_4(x, x, r); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24826,8 +24892,9 @@ static int sp_256_calc_s_4(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = s * k^-1 mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_mul_order_avx2_4(s, s, kInv); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -24899,8 +24966,10 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25148,8 +25217,9 @@ static void sp_256_add_points_4(sp_point_256* p1, const sp_point_256* p2, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_add_avx2_4(p1, p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25158,8 +25228,10 @@ static void sp_256_add_points_4(sp_point_256* p1, const sp_point_256* p2, if (sp_256_iszero_4(p1->x) && sp_256_iszero_4(p1->y)) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_dbl_avx2_4(p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25197,8 +25269,9 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, #ifndef WOLFSSL_SP_SMALL #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mod_inv_avx2_4(s, s, p256_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25209,8 +25282,9 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mul_avx2_4(s, s, p256_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25224,10 +25298,11 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, #ifdef WOLFSSL_SP_SMALL #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_inv_order_avx2_4(s, s, tmp); sp_256_mont_mul_order_avx2_4(u1, u1, s); sp_256_mont_mul_order_avx2_4(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25239,9 +25314,10 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, #else #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_mul_order_avx2_4(u1, u1, s); sp_256_mont_mul_order_avx2_4(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25252,8 +25328,9 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, #endif /* WOLFSSL_SP_SMALL */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(p1, u1, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25267,8 +25344,9 @@ static int sp_256_calc_vfy_point_4(sp_point_256* p1, sp_point_256* p2, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(p2, p2, u2, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25354,16 +25432,18 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = r.z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_sqr_avx2_4(p1->z, p1->z, p256_mod, p256_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif sp_256_mont_sqr_4(p1->z, p1->z, p256_mod, p256_mp_mod); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_mul_avx2_4(u1, u2, p1->z, p256_mod, p256_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25388,9 +25468,11 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = (r + 1*order).z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_mul_avx2_4(u1, u2, p1->z, p256_mod, p256_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25698,8 +25780,9 @@ int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY, /* Point * order = infinity */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_avx2_4(p, pub, p256_order, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25716,8 +25799,10 @@ int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY, /* Base * private = point */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_256_ecc_mulmod_base_avx2_4(p, priv, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25782,8 +25867,9 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_add_avx2_4(p, p, q, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25838,8 +25924,9 @@ int sp_ecc_proj_dbl_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_proj_point_dbl_avx2_4(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25891,8 +25978,9 @@ int sp_ecc_map_256(mp_int* pX, mp_int* pY, mp_int* pZ) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_map_avx2_4(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -25936,7 +26024,7 @@ static int sp_256_mont_sqrt_4(sp_digit* y) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { /* t2 = y ^ 0x2 */ sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod); /* t1 = y ^ 0x3 */ @@ -25966,6 +26054,7 @@ static int sp_256_mont_sqrt_4(sp_digit* y) /* t1 = y ^ 0xffffffff00000001000000000000000000000001 */ sp_256_mont_mul_avx2_4(t1, t1, y, p256_mod, p256_mp_mod); sp_256_mont_sqr_n_avx2_4(y, t1, 94, p256_mod, p256_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -26035,9 +26124,10 @@ int sp_ecc_uncompress_256(mp_int* xm, int odd, mp_int* ym) /* y = x^3 */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_256_mont_sqr_avx2_4(y, x, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(y, y, x, p256_mod, p256_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -29625,8 +29715,9 @@ int sp_ecc_mulmod_384(const mp_int* km, const ecc_point* gm, ecc_point* r, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(point, point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -29688,8 +29779,9 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(point, point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -29698,8 +29790,9 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_add_avx2_6(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -29708,8 +29801,10 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_map_avx2_6(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48191,8 +48286,9 @@ int sp_ecc_mulmod_base_384(const mp_int* km, ecc_point* r, int map, void* heap) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48252,8 +48348,9 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48262,8 +48359,9 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_add_avx2_6(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48272,8 +48370,10 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_map_avx2_6(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48414,8 +48514,9 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(point, k, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48426,9 +48527,10 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(infinity, point, p384_order, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -48602,8 +48704,9 @@ int sp_ecc_secret_gen_384(const mp_int* priv, const ecc_point* pub, byte* out, sp_384_point_from_ecc_point_6(point, pub); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(point, point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49206,8 +49309,9 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, /* Conv k to Montgomery form (mod order) */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mul_avx2_6(k, k, p384_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49219,8 +49323,9 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, /* kInv = 1/k mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_inv_order_avx2_6(kInv, k, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49230,8 +49335,9 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = r * x + e */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mul_avx2_6(x, x, r); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49251,8 +49357,9 @@ static int sp_384_calc_s_6(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = s * k^-1 mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_mul_order_avx2_6(s, s, kInv); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49324,8 +49431,10 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49662,8 +49771,9 @@ static void sp_384_add_points_6(sp_point_384* p1, const sp_point_384* p2, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_add_avx2_6(p1, p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49672,8 +49782,10 @@ static void sp_384_add_points_6(sp_point_384* p1, const sp_point_384* p2, if (sp_384_iszero_6(p1->x) && sp_384_iszero_6(p1->y)) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_dbl_avx2_6(p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49717,8 +49829,9 @@ static int sp_384_calc_vfy_point_6(sp_point_384* p1, sp_point_384* p2, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mul_avx2_6(s, s, p384_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49732,10 +49845,11 @@ static int sp_384_calc_vfy_point_6(sp_point_384* p1, sp_point_384* p2, #ifdef WOLFSSL_SP_SMALL #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_inv_order_avx2_6(s, s, tmp); sp_384_mont_mul_order_avx2_6(u1, u1, s); sp_384_mont_mul_order_avx2_6(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49747,9 +49861,10 @@ static int sp_384_calc_vfy_point_6(sp_point_384* p1, sp_point_384* p2, #else #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_mul_order_avx2_6(u1, u1, s); sp_384_mont_mul_order_avx2_6(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49760,8 +49875,9 @@ static int sp_384_calc_vfy_point_6(sp_point_384* p1, sp_point_384* p2, #endif /* WOLFSSL_SP_SMALL */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(p1, u1, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49775,8 +49891,9 @@ static int sp_384_calc_vfy_point_6(sp_point_384* p1, sp_point_384* p2, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(p2, p2, u2, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49862,16 +49979,18 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = r.z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_sqr_avx2_6(p1->z, p1->z, p384_mod, p384_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif sp_384_mont_sqr_6(p1->z, p1->z, p384_mod, p384_mp_mod); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_mul_avx2_6(u1, u2, p1->z, p384_mod, p384_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -49896,9 +50015,11 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = (r + 1*order).z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_mul_avx2_6(u1, u2, p1->z, p384_mod, p384_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50206,8 +50327,9 @@ int sp_ecc_check_key_384(const mp_int* pX, const mp_int* pY, /* Point * order = infinity */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_avx2_6(p, pub, p384_order, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50224,8 +50346,10 @@ int sp_ecc_check_key_384(const mp_int* pX, const mp_int* pY, /* Base * private = point */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_384_ecc_mulmod_base_avx2_6(p, priv, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50290,8 +50414,9 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_add_avx2_6(p, p, q, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50346,8 +50471,9 @@ int sp_ecc_proj_dbl_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_proj_point_dbl_avx2_6(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50399,8 +50525,9 @@ int sp_ecc_map_384(mp_int* pX, mp_int* pY, mp_int* pZ) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_map_avx2_6(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50450,7 +50577,7 @@ static int sp_384_mont_sqrt_6(sp_digit* y) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { /* t2 = y ^ 0x2 */ sp_384_mont_sqr_avx2_6(t2, y, p384_mod, p384_mp_mod); /* t1 = y ^ 0x3 */ @@ -50505,6 +50632,7 @@ static int sp_384_mont_sqrt_6(sp_digit* y) sp_384_mont_mul_avx2_6(t1, y, t2, p384_mod, p384_mp_mod); /* t2 = y ^ 0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffc00000000000000040000000 */ sp_384_mont_sqr_n_avx2_6(y, t1, 30, p384_mod, p384_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -50599,9 +50727,10 @@ int sp_ecc_uncompress_384(mp_int* xm, int odd, mp_int* ym) /* y = x^3 */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_384_mont_sqr_avx2_6(y, x, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(y, y, x, p384_mod, p384_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -54077,8 +54206,9 @@ int sp_ecc_mulmod_521(const mp_int* km, const ecc_point* gm, ecc_point* r, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(point, point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -54140,8 +54270,9 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(point, point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -54150,8 +54281,9 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_add_avx2_9(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -54160,8 +54292,10 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_map_avx2_9(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -88829,8 +88963,9 @@ int sp_ecc_mulmod_base_521(const mp_int* km, ecc_point* r, int map, void* heap) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -88890,8 +89025,9 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -88900,8 +89036,9 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_add_avx2_9(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -88910,8 +89047,10 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_map_avx2_9(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89053,8 +89192,9 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(point, k, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89065,9 +89205,10 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(infinity, point, p521_order, 1, 1, NULL); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89241,8 +89382,9 @@ int sp_ecc_secret_gen_521(const mp_int* priv, const ecc_point* pub, byte* out, sp_521_point_from_ecc_point_9(point, pub); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(point, point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89900,8 +90042,9 @@ static int sp_521_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k, /* Conv k to Montgomery form (mod order) */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mul_avx2_9(k, k, p521_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89913,8 +90056,9 @@ static int sp_521_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k, /* kInv = 1/k mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_inv_order_avx2_9(kInv, k, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89924,8 +90068,9 @@ static int sp_521_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = r * x + e */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mul_avx2_9(x, x, r); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -89945,8 +90090,9 @@ static int sp_521_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k, /* s = s * k^-1 mod order */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_mul_order_avx2_9(s, s, kInv); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90018,8 +90164,10 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(point, k, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90364,8 +90512,9 @@ static void sp_521_add_points_9(sp_point_521* p1, const sp_point_521* p2, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_add_avx2_9(p1, p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90374,8 +90523,10 @@ static void sp_521_add_points_9(sp_point_521* p1, const sp_point_521* p2, if (sp_521_iszero_9(p1->x) && sp_521_iszero_9(p1->y)) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_dbl_avx2_9(p1, p2, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90422,8 +90573,9 @@ static int sp_521_calc_vfy_point_9(sp_point_521* p1, sp_point_521* p2, { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mul_avx2_9(s, s, p521_norm_order); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90437,10 +90589,11 @@ static int sp_521_calc_vfy_point_9(sp_point_521* p1, sp_point_521* p2, #ifdef WOLFSSL_SP_SMALL #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_inv_order_avx2_9(s, s, tmp); sp_521_mont_mul_order_avx2_9(u1, u1, s); sp_521_mont_mul_order_avx2_9(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90452,9 +90605,10 @@ static int sp_521_calc_vfy_point_9(sp_point_521* p1, sp_point_521* p2, #else #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_mul_order_avx2_9(u1, u1, s); sp_521_mont_mul_order_avx2_9(u2, u2, s); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90465,8 +90619,9 @@ static int sp_521_calc_vfy_point_9(sp_point_521* p1, sp_point_521* p2, #endif /* WOLFSSL_SP_SMALL */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(p1, u1, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90480,8 +90635,9 @@ static int sp_521_calc_vfy_point_9(sp_point_521* p1, sp_point_521* p2, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(p2, p2, u2, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90571,16 +90727,18 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = r.z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_sqr_avx2_9(p1->z, p1->z, p521_mod, p521_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif sp_521_mont_sqr_9(p1->z, p1->z, p521_mod, p521_mp_mod); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_mul_avx2_9(u1, u2, p1->z, p521_mod, p521_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90605,9 +90763,11 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, /* u1 = (r + 1*order).z'.z' mod prime */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_mul_avx2_9(u1, u2, p1->z, p521_mod, p521_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90918,8 +91078,9 @@ int sp_ecc_check_key_521(const mp_int* pX, const mp_int* pY, /* Point * order = infinity */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_avx2_9(p, pub, p521_order, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -90936,8 +91097,10 @@ int sp_ecc_check_key_521(const mp_int* pX, const mp_int* pY, /* Base * private = point */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_521_ecc_mulmod_base_avx2_9(p, priv, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -91002,8 +91165,9 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_add_avx2_9(p, p, q, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -91058,8 +91222,9 @@ int sp_ecc_proj_dbl_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_proj_point_dbl_avx2_9(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -91111,8 +91276,9 @@ int sp_ecc_map_521(mp_int* pX, mp_int* pY, mp_int* pZ) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_map_avx2_9(p, p, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -91161,7 +91327,7 @@ static int sp_521_mont_sqrt_9(sp_digit* y) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { int i; XMEMCPY(t, y, sizeof(sp_digit) * 9); @@ -91171,6 +91337,7 @@ static int sp_521_mont_sqrt_9(sp_digit* y) sp_521_mont_mul_avx2_9(t, t, y, p521_mod, p521_mp_mod); } XMEMCPY(y, t, sizeof(sp_digit) * 9); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -91220,9 +91387,10 @@ int sp_ecc_uncompress_521(mp_int* xm, int odd, mp_int* ym) /* y = x^3 */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_521_mont_sqr_avx2_9(y, x, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(y, y, x, p521_mod, p521_mp_mod); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -94831,8 +94999,9 @@ int sp_ecc_mulmod_1024(const mp_int* km, const ecc_point* gm, ecc_point* r, #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_avx2_16(point, point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98246,8 +98415,9 @@ int sp_ecc_mulmod_base_1024(const mp_int* km, ecc_point* r, int map, void* heap) #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_base_avx2_16(point, k, map, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98307,8 +98477,9 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_base_avx2_16(point, k, 0, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98317,8 +98488,9 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, if (err == MP_OKAY) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { sp_1024_proj_point_add_avx2_16(point, point, addP, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98327,8 +98499,10 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, if (map) { #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { sp_1024_map_avx2_16(point, point, tmp); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98382,9 +98556,10 @@ int sp_ecc_gen_table_1024(const ecc_point* gm, byte* table, word32* len, sp_1024_point_from_ecc_point_16(point, gm); #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_gen_stripe_table_avx2_16(point, (sp_table_entry_1024*)table, t, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -98465,9 +98640,10 @@ int sp_ecc_mulmod_table_1024(const mp_int* km, const ecc_point* gm, byte* table, #ifndef WOLFSSL_SP_SMALL #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_stripe_avx2_16(point, point, (const sp_table_entry_1024*)table, k, map, 0, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -103494,8 +103670,9 @@ int sp_ecc_check_key_1024(const mp_int* pX, const mp_int* pY, /* Point * order = infinity */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_avx2_16(p, pub, p1024_order, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif @@ -103512,8 +103689,10 @@ int sp_ecc_check_key_1024(const mp_int* pX, const mp_int* pY, /* Base * private = point */ #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags) && - IS_INTEL_AVX2(cpuid_flags)) { + IS_INTEL_AVX2(cpuid_flags) && + (SAVE_VECTOR_REGISTERS2() == 0)) { err = sp_1024_ecc_mulmod_base_avx2_16(p, priv, 1, 1, heap); + RESTORE_VECTOR_REGISTERS(); } else #endif