From 2995c96f11d69bc360950765ffd9c935f4966c13 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Mon, 11 Apr 2022 15:09:40 +1000
Subject: [PATCH] Wycheproof fixes

Curve25519: reduce the result to less than the modulus at the end of
curve25519; also use sarq/andq in place of shrq/imulq, since masking
with 19 gives the same value without the multiply.

x86_64 Karatsuba mul: don't capture the addition's carry into a zeroed
register with adcq and then continue the chain by adding zero -- adcq
consumes the carry flag, so the captured overflow was lost.
---
 wolfcrypt/src/fe_x25519_asm.S   | 497 +++++++++++++++++---------------
 wolfcrypt/src/sp_x86_64_asm.S   |   8 -
 wolfcrypt/src/sp_x86_64_asm.asm |   8 -
 3 files changed, 264 insertions(+), 249 deletions(-)
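Notes (review aid, not part of the commit message):

The new tail of curve25519 can be read as plain C. Below is a minimal
sketch of the added constant-time final reduction, assuming a field
element held as four 64-bit limbs already weakly reduced below 2^255
and GCC/Clang's unsigned __int128; the function name and layout are
illustrative, not wolfSSL's API:

    #include <stdint.h>

    /* Conditionally subtract p = 2^255 - 19 so the result is canonical.
     * Mirrors the added movq/adcq/sarq/andq sequence: trial-add 19 and,
     * if the sum reaches bit 255, then x >= p, so add 19 for real and
     * clear bit 255 (i.e. subtract 2^255) -- no branches. */
    static void fe_reduce_final(uint64_t x[4])
    {
        unsigned __int128 t;
        uint64_t c;

        /* Trial add of 19: only the carry chain matters. */
        t = (unsigned __int128)x[0] + 19;
        t = (unsigned __int128)x[1] + (uint64_t)(t >> 64);
        t = (unsigned __int128)x[2] + (uint64_t)(t >> 64);
        c = x[3] + (uint64_t)(t >> 64); /* bit 63 = bit 255 of x + 19 */

        /* sarq $63 / andq $19: 19 when x >= p, else 0. */
        c = (0ULL - (c >> 63)) & 19;

        t = (unsigned __int128)x[0] + c;
        x[0] = (uint64_t)t;
        t = (unsigned __int128)x[1] + (uint64_t)(t >> 64);
        x[1] = (uint64_t)t;
        t = (unsigned __int128)x[2] + (uint64_t)(t >> 64);
        x[2] = (uint64_t)t;
        x[3] += (uint64_t)(t >> 64);
        x[3] &= 0x7fffffffffffffffULL; /* andq %rbp, %r11 / %rbx, %r12 */
    }

The same sarq/andq idiom replaces shrq/imulq in each per-multiply weak
reduction: shifting the top word right by 63 and multiplying by 19
yields 0 or 19, but an arithmetic shift by 63 gives 0 or all-ones,
which andq $19 turns into the same 0 or 19 without the imulq.

The sp_x86_64 change fixes the opposite kind of bug in the Karatsuba
multipliers: "adcq $0x00, %r9" parked the final carry in a zeroed
register and, since adcq also consumes CF, the following "Add to zero"
chain then propagated a carry of zero -- the overflow captured in %r9
was never added back. Deleting the capture (and the now-unneeded xorq)
lets CF flow straight into that chain.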
diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S
index 48b1a17f5..71fa4360e 100644
--- a/wolfcrypt/src/fe_x25519_asm.S
+++ b/wolfcrypt/src/fe_x25519_asm.S
@@ -1493,10 +1493,10 @@ _fe_mul_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -1644,10 +1644,10 @@ _fe_sq_x64:
         adcq    $0x00, %r10
         # Reduce if top bit set
         movq    %r10, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %r15, %r10
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
@@ -1797,10 +1797,10 @@ L_fe_sq_n_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -2013,10 +2013,10 @@ _fe_sq2_x64:
         adcq    $0x00, %r10
         # Reduce if top bit set
         movq    %r10, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r10
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
@@ -2631,10 +2631,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -2784,10 +2784,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -2910,10 +2910,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3036,10 +3036,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3239,10 +3239,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3390,10 +3390,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3549,10 +3549,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3727,10 +3727,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -3880,10 +3880,10 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
-        addq    %rax, %rcx
+        addq    %rdx, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -4276,13 +4276,28 @@ L_curve25519_x64_bits:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbp, %r11
+        addq    %rdx, %rcx
+        adcq    $0x00, %r9
+        adcq    $0x00, %r10
+        adcq    $0x00, %r11
+        movq    %rcx, %rax
+        addq    $19, %rax
+        movq    %r9, %rax
+        adcq    $0x00, %rax
+        movq    %r10, %rax
+        adcq    $0x00, %rax
+        movq    %r11, %rax
+        adcq    $0x00, %rax
+        sarq    $63, %rax
+        andq    $19, %rax
         addq    %rax, %rcx
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
+        andq    %rbp, %r11
         # Store
         movq    %rcx, (%rdi)
         movq    %r9, 8(%rdi)
@@ -4721,10 +4736,10 @@ _fe_ge_to_p2_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -4877,10 +4892,10 @@ _fe_ge_to_p2_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5033,10 +5048,10 @@ _fe_ge_to_p2_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5221,10 +5236,10 @@ _fe_ge_to_p3_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5377,10 +5392,10 @@ _fe_ge_to_p3_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5533,10 +5548,10 @@ _fe_ge_to_p3_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5689,10 +5704,10 @@ _fe_ge_to_p3_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5851,10 +5866,10 @@ _fe_ge_dbl_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -5979,10 +5994,10 @@ _fe_ge_dbl_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -6118,10 +6133,10 @@ _fe_ge_dbl_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -6274,10 +6289,10 @@ _fe_ge_dbl_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -6632,10 +6647,10 @@ _fe_ge_madd_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -6788,10 +6803,10 @@ _fe_ge_madd_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -6944,10 +6959,10 @@ _fe_ge_madd_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -7330,10 +7345,10 @@ _fe_ge_msub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -7486,10 +7501,10 @@ _fe_ge_msub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -7642,10 +7657,10 @@ _fe_ge_msub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -8028,10 +8043,10 @@ _fe_ge_add_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -8184,10 +8199,10 @@ _fe_ge_add_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -8340,10 +8355,10 @@ _fe_ge_add_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -8496,10 +8511,10 @@ _fe_ge_add_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -8882,10 +8897,10 @@ _fe_ge_sub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9038,10 +9053,10 @@ _fe_ge_sub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9194,10 +9209,10 @@ _fe_ge_sub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9350,10 +9365,10 @@ _fe_ge_sub_x64:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9644,10 +9659,10 @@ _fe_mul_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9771,10 +9786,10 @@ _fe_sq_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -9901,10 +9916,10 @@ L_fe_sq_n_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -10083,10 +10098,10 @@ _fe_sq2_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r11
-        addq    %rax, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -10674,10 +10689,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -10799,10 +10814,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -10900,10 +10915,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r12
-        addq    %rax, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11001,10 +11016,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r12
-        addq    %rax, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11176,10 +11191,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11302,10 +11317,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r12
-        addq    %rax, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11424,10 +11439,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rax
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r12
-        addq    %rax, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11574,10 +11589,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -11699,10 +11714,10 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
@@ -12067,13 +12082,29 @@ L_curve25519_avx2_bits:
         adcq    $0x00, %r12
         # Reduce if top bit set
         movq    %r12, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rbx, %r12
-        addq    %rcx, %r9
+        addq    %rdx, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
         adcq    $0x00, %r12
+        movq    $0x7fffffffffffffff, %rbx
+        movq    %r9, %rdx
+        addq    $19, %rdx
+        movq    %r10, %rdx
+        adcq    $0x00, %rdx
+        movq    %r11, %rdx
+        adcq    $0x00, %rdx
+        movq    %r12, %rdx
+        adcq    $0x00, %rdx
+        sarq    $63, %rdx
+        andq    $19, %rdx
+        addq    %rdx, %r9
+        adcq    $0x00, %r10
+        adcq    $0x00, %r11
+        adcq    $0x00, %r12
+        andq    %rbx, %r12
         # Store
         movq    %r9, (%rdi)
         movq    %r10, 8(%rdi)
@@ -12484,10 +12515,10 @@ _fe_ge_to_p2_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -12612,10 +12643,10 @@ _fe_ge_to_p2_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -12739,10 +12770,10 @@ _fe_ge_to_p2_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -12899,10 +12930,10 @@ _fe_ge_to_p3_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13027,10 +13058,10 @@ _fe_ge_to_p3_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13154,10 +13185,10 @@ _fe_ge_to_p3_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13282,10 +13313,10 @@ _fe_ge_to_p3_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13419,10 +13450,10 @@ _fe_ge_dbl_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rbp
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rbp, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13522,10 +13553,10 @@ _fe_ge_dbl_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rbp
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rbp, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13650,10 +13681,10 @@ _fe_ge_dbl_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rbp
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rcx, %r11
-        addq    %rbp, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -13841,10 +13872,10 @@ _fe_ge_dbl_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rbp
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rbp, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14083,10 +14114,10 @@ _fe_ge_madd_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14209,10 +14240,10 @@ _fe_ge_madd_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14337,10 +14368,10 @@ _fe_ge_madd_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14682,10 +14713,10 @@ _fe_ge_msub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14808,10 +14839,10 @@ _fe_ge_msub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -14936,10 +14967,10 @@ _fe_ge_msub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -15280,10 +15311,10 @@ _fe_ge_add_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -15406,10 +15437,10 @@ _fe_ge_add_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -15534,10 +15565,10 @@ _fe_ge_add_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -15661,10 +15692,10 @@ _fe_ge_add_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -16006,10 +16037,10 @@ _fe_ge_sub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -16132,10 +16163,10 @@ _fe_ge_sub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -16260,10 +16291,10 @@ _fe_ge_sub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
@@ -16387,10 +16418,10 @@ _fe_ge_sub_avx2:
         adcq    $0x00, %r11
         # Reduce if top bit set
         movq    %r11, %rdx
-        shrq    $63, %rdx
-        imulq   $19, %rdx, %rcx
+        sarq    $63, %rdx
+        andq    $19, %rdx
         andq    %rax, %r11
-        addq    %rcx, %r8
+        addq    %rdx, %r8
         adcq    $0x00, %r9
         adcq    $0x00, %r10
         adcq    $0x00, %r11
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index d71201af8..4076e52d0 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -4677,7 +4677,6 @@ _sp_2048_mul_32:
         addq    $0x80, %r15
         # Add
         movq    (%r15), %rax
-        xorq    %r9, %r9
         addq    (%r11), %rax
         movq    8(%r15), %rcx
         movq    %rax, (%r15)
@@ -4728,7 +4727,6 @@ _sp_2048_mul_32:
         movq    %rax, 120(%r15)
         adcq    128(%r11), %rcx
         movq    %rcx, 128(%r15)
-        adcq    $0x00, %r9
         # Add to zero
         movq    136(%r11), %rax
         adcq    $0x00, %rax
@@ -5356,7 +5354,6 @@ _sp_2048_mul_avx2_32:
         addq    $0x80, %r15
         # Add
         movq    (%r15), %rax
-        xorq    %r9, %r9
         addq    (%r11), %rax
         movq    8(%r15), %rcx
         movq    %rax, (%r15)
@@ -5407,7 +5404,6 @@ _sp_2048_mul_avx2_32:
         movq    %rax, 120(%r15)
         adcq    128(%r11), %rcx
         movq    %rcx, 128(%r15)
-        adcq    $0x00, %r9
         # Add to zero
         movq    136(%r11), %rax
         adcq    $0x00, %rax
@@ -43508,7 +43504,6 @@ _sp_4096_mul_64:
         addq    $0x100, %r15
         # Add
         movq    (%r15), %rax
-        xorq    %r9, %r9
         addq    (%r11), %rax
         movq    8(%r15), %rcx
         movq    %rax, (%r15)
@@ -43607,7 +43602,6 @@ _sp_4096_mul_64:
         movq    %rcx, 248(%r15)
         adcq    256(%r11), %r8
         movq    %r8, 256(%r15)
-        adcq    $0x00, %r9
         # Add to zero
         movq    264(%r11), %rax
         adcq    $0x00, %rax
@@ -44763,7 +44757,6 @@ _sp_4096_mul_avx2_64:
         addq    $0x100, %r15
         # Add
         movq    (%r15), %rax
-        xorq    %r9, %r9
         addq    (%r11), %rax
         movq    8(%r15), %rcx
         movq    %rax, (%r15)
@@ -44862,7 +44855,6 @@ _sp_4096_mul_avx2_64:
         movq    %rcx, 248(%r15)
         adcq    256(%r11), %r8
         movq    %r8, 256(%r15)
-        adcq    $0x00, %r9
         # Add to zero
         movq    264(%r11), %rax
         adcq    $0x00, %rax
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index 971034f56..c6a7f12db 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -4585,7 +4585,6 @@ ENDIF
         add     rsi, 128
         ; Add
         mov     rax, QWORD PTR [rsi]
-        xor     r11, r11
         add     rax, QWORD PTR [r13]
         mov     r9, QWORD PTR [rsi+8]
         mov     QWORD PTR [rsi], rax
@@ -4636,7 +4635,6 @@ ENDIF
         mov     QWORD PTR [rsi+120], rax
         adc     r9, QWORD PTR [r13+128]
         mov     QWORD PTR [rsi+128], r9
-        adc     r11, 0
         ; Add to zero
         mov     rax, QWORD PTR [r13+136]
         adc     rax, 0
@@ -5245,7 +5243,6 @@ ENDIF
         add     rsi, 128
         ; Add
         mov     rax, QWORD PTR [rsi]
-        xor     r11, r11
         add     rax, QWORD PTR [r13]
         mov     r9, QWORD PTR [rsi+8]
         mov     QWORD PTR [rsi], rax
@@ -5296,7 +5293,6 @@ ENDIF
         mov     QWORD PTR [rsi+120], rax
         adc     r9, QWORD PTR [r13+128]
         mov     QWORD PTR [rsi+128], r9
-        adc     r11, 0
         ; Add to zero
         mov     rax, QWORD PTR [r13+136]
         adc     rax, 0
@@ -42510,7 +42506,6 @@ ENDIF
         add     rsi, 256
         ; Add
         mov     rax, QWORD PTR [rsi]
-        xor     r11, r11
         add     rax, QWORD PTR [r13]
         mov     r9, QWORD PTR [rsi+8]
         mov     QWORD PTR [rsi], rax
@@ -42609,7 +42604,6 @@ ENDIF
         mov     QWORD PTR [rsi+248], r9
         adc     r10, QWORD PTR [r13+256]
         mov     QWORD PTR [rsi+256], r10
-        adc     r11, 0
         ; Add to zero
         mov     rax, QWORD PTR [r13+264]
         adc     rax, 0
@@ -43746,7 +43740,6 @@ ENDIF
         add     rsi, 256
         ; Add
         mov     rax, QWORD PTR [rsi]
-        xor     r11, r11
         add     rax, QWORD PTR [r13]
         mov     r9, QWORD PTR [rsi+8]
         mov     QWORD PTR [rsi], rax
@@ -43845,7 +43838,6 @@ ENDIF
         mov     QWORD PTR [rsi+248], r9
         adc     r10, QWORD PTR [r13+256]
         mov     QWORD PTR [rsi+256], r10
-        adc     r11, 0
         ; Add to zero
         mov     rax, QWORD PTR [r13+264]
         adc     rax, 0