diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index 5e19e2760..9de68792b 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -39668,6 +39668,13 @@ _sp_256_mont_add_4:
         andq %rsi, %r11
         subq %rsi, %rax
         sbbq %r10, %rcx
+        sbbq $0x00, %r8
+        sbbq %r11, %r9
+        adcq $0x00, %rsi
+        andq %rsi, %r10
+        andq %rsi, %r11
+        subq %rsi, %rax
+        sbbq %r10, %rcx
         movq %rax, (%rdi)
         sbbq $0x00, %r8
         movq %rcx, 8(%rdi)
@@ -39712,6 +39719,13 @@ _sp_256_mont_dbl_4:
         andq %r11, %r10
         subq %r11, %rdx
         sbbq %r9, %rax
+        sbbq $0x00, %rcx
+        sbbq %r10, %r8
+        adcq $0x00, %r11
+        andq %r11, %r9
+        andq %r11, %r10
+        subq %r11, %rdx
+        sbbq %r9, %rax
         movq %rdx, (%rdi)
         sbbq $0x00, %rcx
         movq %rax, 8(%rdi)
@@ -39758,6 +39772,13 @@ _sp_256_mont_tpl_4:
         sbbq %r9, %rax
         sbbq $0x00, %rcx
         sbbq %r10, %r8
+        adcq $0x00, %r11
+        andq %r11, %r9
+        andq %r11, %r10
+        subq %r11, %rdx
+        sbbq %r9, %rax
+        sbbq $0x00, %rcx
+        sbbq %r10, %r8
         movq $0xffffffff, %r9
         movq $0xffffffff00000001, %r10
         addq (%rsi), %rdx
@@ -39770,6 +39791,13 @@ _sp_256_mont_tpl_4:
         andq %r11, %r10
         subq %r11, %rdx
         sbbq %r9, %rax
+        sbbq $0x00, %rcx
+        sbbq %r10, %r8
+        adcq $0x00, %r11
+        andq %r11, %r9
+        andq %r11, %r10
+        subq %r11, %rdx
+        sbbq %r9, %rax
         movq %rdx, (%rdi)
         sbbq $0x00, %rcx
         movq %rax, 8(%rdi)
@@ -39815,6 +39843,13 @@ _sp_256_mont_sub_4:
         andq %rsi, %r11
         addq %rsi, %rax
         adcq %r10, %rcx
+        adcq $0x00, %r8
+        adcq %r11, %r9
+        adcq $0x00, %rsi
+        andq %rsi, %r10
+        andq %rsi, %r11
+        addq %rsi, %rax
+        adcq %r10, %rcx
         movq %rax, (%rdi)
         adcq $0x00, %r8
         movq %rcx, 8(%rdi)
@@ -42251,6 +42286,13 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
         vpand %ymm14, %ymm1, %ymm1
         vpaddd %ymm5, %ymm0, %ymm0
         vpaddd %ymm4, %ymm1, %ymm1
+        vpsrad $26, %ymm1, %ymm5
+        vpsrad $26, %ymm0, %ymm4
+        vpermd %ymm5, %ymm13, %ymm5
+        vpand %ymm14, %ymm0, %ymm0
+        vpand %ymm14, %ymm1, %ymm1
+        vpaddd %ymm5, %ymm0, %ymm0
+        vpaddd %ymm4, %ymm1, %ymm1
         vpextrd $0x00, %xmm0, %eax
         vpextrd $0x01, %xmm0, %r8d
         vpextrd $2, %xmm0, %r10d
@@ -42311,6 +42353,13 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpand %ymm14, %ymm3, %ymm3
         vpaddd %ymm5, %ymm2, %ymm2
         vpaddd %ymm4, %ymm3, %ymm3
+        vpsrad $26, %ymm3, %ymm5
+        vpsrad $26, %ymm2, %ymm4
+        vpermd %ymm5, %ymm13, %ymm5
+        vpand %ymm14, %ymm2, %ymm2
+        vpand %ymm14, %ymm3, %ymm3
+        vpaddd %ymm5, %ymm2, %ymm2
+        vpaddd %ymm4, %ymm3, %ymm3
         vpextrd $0x00, %xmm2, %eax
         vpextrd $0x01, %xmm2, %r8d
         vpextrd $2, %xmm2, %r10d
@@ -42324,14 +42373,19 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpextrd $0x00, %xmm2, %r14d
         vpextrd $0x00, %xmm3, %r15d
 L_256_mod_inv_avx2_4_store_done:
+        movslq %eax, %rax
         shlq $26, %rcx
         addq %rcx, %rax
+        movslq %r8d, %r8
         shlq $26, %r9
         addq %r9, %r8
+        movslq %r10d, %r10
         shlq $26, %r11
         addq %r11, %r10
+        movslq %r12d, %r12
         shlq $26, %r13
         addq %r13, %r12
+        movslq %r14d, %r14
         shlq $26, %r15
         addq %r15, %r14
         movq %r8, %rcx
@@ -44817,12 +44871,12 @@ L_384_get_entry_65_6_start:
         movdqa %xmm14, %xmm12
         paddd %xmm15, %xmm14
         pcmpeqd %xmm13, %xmm12
-        movdqa (%rsi), %xmm6
-        movdqa 16(%rsi), %xmm7
-        movdqa 32(%rsi), %xmm8
-        movdqa 48(%rsi), %xmm9
-        movdqa 64(%rsi), %xmm10
-        movdqa 80(%rsi), %xmm11
+        movdqu (%rsi), %xmm6
+        movdqu 16(%rsi), %xmm7
+        movdqu 32(%rsi), %xmm8
+        movdqu 48(%rsi), %xmm9
+        movdqu 64(%rsi), %xmm10
+        movdqu 80(%rsi), %xmm11
         addq $0x60, %rsi
         pand %xmm12, %xmm6
         pand %xmm12, %xmm7
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index 7a2aa12cb..f4ed57470 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -38455,6 +38455,13 @@ sp_256_mont_add_4 PROC
         and r13, rdx
         sub rax, rdx
         sbb r9, r12
+        sbb r10, 0
+        sbb r11, r13
+        adc rdx, 0
+        and r12, rdx
+        and r13, rdx
+        sub rax, rdx
+        sbb r9, r12
         mov QWORD PTR [rcx], rax
         sbb r10, 0
         mov QWORD PTR [rcx+8], r9
@@ -38492,6 +38499,13 @@ sp_256_mont_dbl_4 PROC
         and r12, r13
         sub rax, r13
         sbb r8, r11
+        sbb r9, 0
+        sbb r10, r12
+        adc r13, 0
+        and r11, r13
+        and r12, r13
+        sub rax, r13
+        sbb r8, r11
         mov QWORD PTR [rcx], rax
         sbb r9, 0
         mov QWORD PTR [rcx+8], r8
@@ -38531,6 +38545,13 @@ sp_256_mont_tpl_4 PROC
         sbb r8, r11
         sbb r9, 0
         sbb r10, r12
+        adc r13, 0
+        and r11, r13
+        and r12, r13
+        sub rax, r13
+        sbb r8, r11
+        sbb r9, 0
+        sbb r10, r12
         mov r11, 4294967295
         mov r12, 18446744069414584321
         add rax, QWORD PTR [rdx]
@@ -38543,6 +38564,13 @@ sp_256_mont_tpl_4 PROC
         and r12, r13
         sub rax, r13
         sbb r8, r11
+        sbb r9, 0
+        sbb r10, r12
+        adc r13, 0
+        and r11, r13
+        and r12, r13
+        sub rax, r13
+        sbb r8, r11
         mov QWORD PTR [rcx], rax
         sbb r9, 0
         mov QWORD PTR [rcx+8], r8
@@ -38581,6 +38609,13 @@ sp_256_mont_sub_4 PROC
         and r13, rdx
         add rax, rdx
         adc r9, r12
+        adc r10, 0
+        adc r11, r13
+        adc rdx, 0
+        and r12, rdx
+        and r13, rdx
+        add rax, rdx
+        adc r9, r12
         mov QWORD PTR [rcx], rax
         adc r10, 0
         mov QWORD PTR [rcx+8], r9
@@ -39345,10 +39380,10 @@ L_256_get_entry_64_4_start:
         movdqa xmm8, xmm10
         paddd xmm10, xmm11
         pcmpeqd xmm8, xmm9
-        movdqa xmm4, [rdx]
-        movdqa xmm5, [rdx+16]
-        movdqa xmm6, [rdx+32]
-        movdqa xmm7, [rdx+48]
+        movdqu xmm4, [rdx]
+        movdqu xmm5, [rdx+16]
+        movdqu xmm6, [rdx+32]
+        movdqu xmm7, [rdx+48]
         add rdx, 64
         pand xmm4, xmm8
         pand xmm5, xmm8
@@ -39432,10 +39467,10 @@ L_256_get_entry_65_4_start:
         movdqa xmm8, xmm10
         paddd xmm10, xmm11
         pcmpeqd xmm8, xmm9
-        movdqa xmm4, [rdx]
-        movdqa xmm5, [rdx+16]
-        movdqa xmm6, [rdx+32]
-        movdqa xmm7, [rdx+48]
+        movdqu xmm4, [rdx]
+        movdqu xmm5, [rdx+16]
+        movdqu xmm6, [rdx+32]
+        movdqu xmm7, [rdx+48]
         add rdx, 64
         pand xmm4, xmm8
         pand xmm5, xmm8
@@ -40774,6 +40809,13 @@ L_256_mod_inv_avx2_4_usubv_sub_shr1:
         vpand ymm1, ymm1, ymm14
         vpaddd ymm0, ymm0, ymm5
         vpaddd ymm1, ymm1, ymm4
+        vpsrad ymm5, ymm1, 26
+        vpsrad ymm4, ymm0, 26
+        vpermd ymm5, ymm13, ymm5
+        vpand ymm0, ymm0, ymm14
+        vpand ymm1, ymm1, ymm14
+        vpaddd ymm0, ymm0, ymm5
+        vpaddd ymm1, ymm1, ymm4
         vpextrd eax, xmm0, 0
         vpextrd r10d, xmm0, 1
         vpextrd r12d, xmm0, 2
@@ -40834,6 +40876,13 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpand ymm3, ymm3, ymm14
         vpaddd ymm2, ymm2, ymm5
         vpaddd ymm3, ymm3, ymm4
+        vpsrad ymm5, ymm3, 26
+        vpsrad ymm4, ymm2, 26
+        vpermd ymm5, ymm13, ymm5
+        vpand ymm2, ymm2, ymm14
+        vpand ymm3, ymm3, ymm14
+        vpaddd ymm2, ymm2, ymm5
+        vpaddd ymm3, ymm3, ymm4
         vpextrd eax, xmm2, 0
         vpextrd r10d, xmm2, 1
         vpextrd r12d, xmm2, 2
@@ -40847,14 +40896,19 @@ L_256_mod_inv_avx2_4_vsubu_sub_shr1:
         vpextrd edi, xmm2, 0
         vpextrd esi, xmm3, 0
 L_256_mod_inv_avx2_4_store_done:
+        movslq rax, eax
         shl r9, 26
         add rax, r9
+        movslq r10, r10d
         shl r11, 26
         add r10, r11
+        movslq r12, r12d
         shl r13, 26
         add r12, r13
+        movslq r14, r14d
         shl r15, 26
         add r14, r15
+        movslq rdi, edi
         shl rsi, 26
         add rdi, rsi
         mov r9, r10
@@ -43037,12 +43091,12 @@ L_384_get_entry_64_6_start:
         movdqa xmm12, xmm14
         paddd xmm14, xmm15
         pcmpeqd xmm12, xmm13
-        movdqa xmm6, [rdx]
-        movdqa xmm7, [rdx+16]
-        movdqa xmm8, [rdx+32]
-        movdqa xmm9, [rdx+48]
-        movdqa xmm10, [rdx+64]
-        movdqa xmm11, [rdx+80]
+        movdqu xmm6, [rdx]
+        movdqu xmm7, [rdx+16]
+        movdqu xmm8, [rdx+32]
+        movdqu xmm9, [rdx+48]
+        movdqu xmm10, [rdx+64]
+        movdqu xmm11, [rdx+80]
         add rdx, 96
         pand xmm6, xmm12
         pand xmm7, xmm12
@@ -43144,12 +43198,12 @@ L_384_get_entry_65_6_start:
         movdqa xmm12, xmm14
         paddd xmm14, xmm15
         pcmpeqd xmm12, xmm13
-        movdqa xmm6, [rdx]
-        movdqa xmm7, [rdx+16]
-        movdqa xmm8, [rdx+32]
-        movdqa xmm9, [rdx+48]
-        movdqa xmm10, [rdx+64]
-        movdqa xmm11, [rdx+80]
+        movdqu xmm6, [rdx]
+        movdqu xmm7, [rdx+16]
+        movdqu xmm8, [rdx+32]
+        movdqu xmm9, [rdx+48]
+        movdqu xmm10, [rdx+64]
+        movdqu xmm11, [rdx+80]
         add rdx, 96
         pand xmm6, xmm12
         pand xmm7, xmm12
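For context, below is a rough C model of the double conditional reduction that the sp_256_mont_add_4/_dbl_4/_tpl_4/_sub_4 hunks introduce: after the arithmetic, the result is reduced by the P-256 modulus a second time so the stored value is fully below p even when the operands were not fully reduced. This sketch is illustrative only; the function names, the comparison-based select, and the assumption that operands are merely below 2^256 are mine, and the actual assembly derives its masks directly from the carry/borrow flags rather than comparing limbs.

/*
 * Illustrative C model (not the wolfSSL code) of the pattern the
 * sp_256_mont_add_4/_dbl_4/_tpl_4/_sub_4 hunks implement: perform the
 * modular arithmetic, then apply the conditional reduction by p256 twice.
 * Requires a compiler with unsigned __int128 (GCC/Clang on x86-64).
 */
#include <stdint.h>

/* P-256 modulus, four 64-bit limbs, least significant first. */
static const uint64_t p256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL
};

/* One conditional reduction of the 257-bit value {*carry, r}: trial-subtract
 * p256 and keep the subtracted limbs only when the value was >= p256, using
 * a mask so the same work is done either way. */
static void p256_cond_reduce(uint64_t r[4], uint64_t *carry)
{
    unsigned __int128 d;
    uint64_t t[4], borrow = 0, take, mask;
    int i;

    for (i = 0; i < 4; i++) {
        d = (unsigned __int128)r[i] - p256[i] - borrow;
        t[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;   /* 1 when the subtraction underflowed */
    }
    take = *carry | (borrow ^ 1);           /* {carry:r} >= p256 ?            */
    mask = (uint64_t)0 - take;              /* all ones when keeping t[]      */
    for (i = 0; i < 4; i++)
        r[i] = (t[i] & mask) | (r[i] & ~mask);
    *carry -= take & borrow;                /* borrow absorbed by the carry limb */
}

/* r = (a + b) mod p256 for a, b < 2^256: the sum is below 3*p256, so two
 * conditional reductions suffice - the point of the patch above. */
void p256_mont_add_model(uint64_t r[4], const uint64_t a[4],
                         const uint64_t b[4])
{
    unsigned __int128 acc = 0;
    uint64_t carry;
    int i;

    for (i = 0; i < 4; i++) {
        acc += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    carry = (uint64_t)acc;

    p256_cond_reduce(r, &carry);            /* reduction already present      */
    p256_cond_reduce(r, &carry);            /* second reduction, as in the patch */
}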