From ff3325fcaf69f250892087ec3864d579ff86bde8 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Mon, 8 Mar 2021 11:15:10 +1000
Subject: [PATCH] SP ECC: Fix P-256 modinv for AVX2

modinv AVX2: do quick norm on result twice and convert 32-bit signed to
64-bit signed before adding
P-256 mont_add, mont_dbl, mont_tpl, mont_sub x64: handle mul/sqr result
being greater than modulus but not greater than 1<<256.
---
 wolfcrypt/src/ecc.c           |  6 ++--
 wolfcrypt/src/sp_x86_64_asm.S | 54 +++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c
index 56acf3e15..f5d57eb50 100644
--- a/wolfcrypt/src/ecc.c
+++ b/wolfcrypt/src/ecc.c
@@ -5261,8 +5261,7 @@ int wc_ecc_sign_hash_ex(const byte* in, word32 inlen, WC_RNG* rng,
     }
 #endif
 
-#if (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \
-    defined(WOLFSSL_HAVE_SP_ECC)
+#if defined(WOLFSSL_HAVE_SP_ECC)
     if (key->idx != ECC_CUSTOM_IDX
     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_ECC)
         && key->asyncDev.marker != WOLFSSL_ASYNC_MARKER_ECC
@@ -6537,8 +6536,7 @@ int wc_ecc_verify_hash_ex(mp_int *r, mp_int *s, const byte* hash,
     }
 #endif
 
-#if (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \
-    defined(WOLFSSL_HAVE_SP_ECC)
+#if defined(WOLFSSL_HAVE_SP_ECC)
     if (key->idx != ECC_CUSTOM_IDX
     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_ECC)
         && key->asyncDev.marker != WOLFSSL_ASYNC_MARKER_ECC
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index ca8c76d6e..feb766fbf 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -39021,6 +39021,13 @@ _sp_256_mont_add_4:
         andq    %rsi, %r11
         subq    %rsi, %rax
         sbbq    %r10, %rcx
+        sbbq    $0x00, %r8
+        sbbq    %r11, %r9
+        adcq    $0x00, %rsi
+        andq    %rsi, %r10
+        andq    %rsi, %r11
+        subq    %rsi, %rax
+        sbbq    %r10, %rcx
         movq    %rax, (%rdi)
         sbbq    $0x00, %r8
         movq    %rcx, 8(%rdi)
@@ -39065,6 +39072,13 @@ _sp_256_mont_dbl_4:
         andq    %r11, %r10
         subq    %r11, %rdx
         sbbq    %r9, %rax
+        sbbq    $0x00, %rcx
+        sbbq    %r10, %r8
+        adcq    $0x00, %r11
+        andq    %r11, %r9
+        andq    %r11, %r10
+        subq    %r11, %rdx
+        sbbq    %r9, %rax
         movq    %rdx, (%rdi)
         sbbq    $0x00, %rcx
         movq    %rax, 8(%rdi)
@@ -39111,6 +39125,13 @@ _sp_256_mont_tpl_4:
         sbbq    %r9, %rax
         sbbq    $0x00, %rcx
         sbbq    %r10, %r8
+        adcq    $0x00, %r11
+        andq    %r11, %r9
+        andq    %r11, %r10
+        subq    %r11, %rdx
+        sbbq    %r9, %rax
+        sbbq    $0x00, %rcx
+        sbbq    %r10, %r8
         movq    $0xffffffff, %r9
         movq    $0xffffffff00000001, %r10
         addq    (%rsi), %rdx
@@ -39123,6 +39144,13 @@ _sp_256_mont_tpl_4:
         andq    %r11, %r10
         subq    %r11, %rdx
         sbbq    %r9, %rax
+        sbbq    $0x00, %rcx
+        sbbq    %r10, %r8
+        adcq    $0x00, %r11
+        andq    %r11, %r9
+        andq    %r11, %r10
+        subq    %r11, %rdx
+        sbbq    %r9, %rax
         movq    %rdx, (%rdi)
         sbbq    $0x00, %rcx
         movq    %rax, 8(%rdi)
@@ -39168,6 +39196,13 @@ _sp_256_mont_sub_4:
         andq    %rsi, %r11
         addq    %rsi, %rax
         adcq    %r10, %rcx
+        adcq    $0x00, %r8
+        adcq    %r11, %r9
+        adcq    $0x00, %rsi
+        andq    %rsi, %r10
+        andq    %rsi, %r11
+        addq    %rsi, %rax
+        adcq    %r10, %rcx
         movq    %rax, (%rdi)
         adcq    $0x00, %r8
         movq    %rcx, 8(%rdi)
@@ -41630,6 +41665,13 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
         vpand   %ymm14, %ymm1, %ymm1
         vpaddd  %ymm5, %ymm0, %ymm0
         vpaddd  %ymm4, %ymm1, %ymm1
+        vpsrad  $26, %ymm1, %ymm5
+        vpsrad  $26, %ymm0, %ymm4
+        vpermd  %ymm5, %ymm13, %ymm5
+        vpand   %ymm14, %ymm0, %ymm0
+        vpand   %ymm14, %ymm1, %ymm1
+        vpaddd  %ymm5, %ymm0, %ymm0
+        vpaddd  %ymm4, %ymm1, %ymm1
         vpextrd $0x00, %xmm0, %eax
         vpextrd $0x01, %xmm0, %r8d
         vpextrd $2, %xmm0, %r10d
@@ -41690,6 +41732,13 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpand   %ymm14, %ymm3, %ymm3
         vpaddd  %ymm5, %ymm2, %ymm2
         vpaddd  %ymm4, %ymm3, %ymm3
+        vpsrad  $26, %ymm3, %ymm5
+        vpsrad  $26, %ymm2, %ymm4
+        vpermd  %ymm5, %ymm13, %ymm5
+        vpand   %ymm14, %ymm2, %ymm2
+        vpand   %ymm14, %ymm3, %ymm3
+        vpaddd  %ymm5, %ymm2, %ymm2
+        vpaddd  %ymm4, %ymm3, %ymm3
         vpextrd $0x00, %xmm2, %eax
         vpextrd $0x01, %xmm2, %r8d
         vpextrd $2, %xmm2, %r10d
@@ -41703,14 +41752,19 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpextrd $0x00, %xmm2, %r14d
         vpextrd $0x00, %xmm3, %r15d
 L_256_mod_inv_avx2_4_store_done:
+        movslq  %eax, %rax
         shlq    $26, %rcx
         addq    %rcx, %rax
+        movslq  %r8d, %r8
         shlq    $26, %r9
         addq    %r9, %r8
+        movslq  %r10d, %r10
         shlq    $26, %r11
         addq    %r11, %r10
+        movslq  %r12d, %r12
         shlq    $26, %r13
         addq    %r13, %r12
+        movslq  %r14d, %r14
         shlq    $26, %r15
         addq    %r15, %r14
         movq    %r8, %rcx
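
A short plain-C sketch of the two ideas named in the commit message may help
when reviewing the assembly above. It is illustrative only: the helper names
(p256_reduce_once, p256_mont_add_sketch, p256_combine_limbs_sketch), the
4x64-bit limb layout and the use of the compiler extension unsigned __int128
are assumptions of the sketch, not the SP implementation.

#include <stdint.h>

typedef uint64_t sp_digit;

/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs. */
static const sp_digit p256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL
};

/* One constant-time reduction step on a 5-limb value:
 * keep r - p when r >= p, otherwise keep r unchanged. */
static void p256_reduce_once(sp_digit r[5])
{
    sp_digit t[5];
    sp_digit borrow = 0;
    sp_digit mask;
    int i;

    for (i = 0; i < 5; i++) {
        unsigned __int128 d = (unsigned __int128)r[i] -
                              (i < 4 ? p256[i] : 0) - borrow;
        t[i] = (sp_digit)d;
        borrow = (sp_digit)((d >> 64) & 1); /* 1 iff the subtraction borrowed */
    }
    mask = (sp_digit)0 - (sp_digit)(1 - borrow); /* all-ones iff r >= p */
    for (i = 0; i < 5; i++)
        r[i] = (t[i] & mask) | (r[i] & ~mask);
}

/* r = (a + b) mod p where a and b are < 2^256 but not necessarily < p.
 * The sum is < 2^257 < 3*p, so conditionally subtracting p twice always
 * brings it below p; a single conditional subtraction does not. */
void p256_mont_add_sketch(sp_digit r[4], const sp_digit a[4],
                          const sp_digit b[4])
{
    sp_digit s[5];
    unsigned __int128 acc = 0;
    int i;

    for (i = 0; i < 4; i++) {
        acc += (unsigned __int128)a[i] + b[i];
        s[i] = (sp_digit)acc;
        acc >>= 64;
    }
    s[4] = (sp_digit)acc;  /* carry into bit 256 */

    p256_reduce_once(s);   /* first conditional subtraction of p */
    p256_reduce_once(s);   /* second one, for inputs that were >= p */

    for (i = 0; i < 4; i++)
        r[i] = s[i];
}

/* The movslq change: the low half of a limb pair is a signed value held in a
 * 32-bit register, so it must be sign-extended to 64 bits before the upper
 * half, shifted left by 26, is added to it. How the limbs are paired here is
 * an assumption of the sketch. */
int64_t p256_combine_limbs_sketch(int32_t lo, uint32_t hi)
{
    return (int64_t)lo + ((int64_t)hi << 26);
}

The "quick norm on result twice" part of the message corresponds to the AVX2
hunks: the vpsrad/vpermd/vpand/vpaddd carry-propagation sequence is applied a
second time, presumably so the limbs are fully normalized before they are
extracted and recombined.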