From e9f1489997f5b2cecf4dea2b08566bc739dd7106 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Fri, 15 Sep 2023 09:23:43 +1000
Subject: [PATCH] SP ECC: x64 minor speed improvement

ARM32/Thumb2: for safer code, do two reductions in mont triple after the
doubling part.
---
 wolfcrypt/src/sp_arm32.c        |  12 +-
 wolfcrypt/src/sp_cortexm.c      |  12 +-
 wolfcrypt/src/sp_x86_64.c       |  74 +++++------
 wolfcrypt/src/sp_x86_64_asm.S   | 212 +++++++++++++-------------------
 wolfcrypt/src/sp_x86_64_asm.asm | 197 ++++++++++++-----------------
 5 files changed, 210 insertions(+), 297 deletions(-)

diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index 89eed12ac..876fa887a 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs r9, r9, #0\n\t"
         "sbcs r10, r10, r12, LSR #31\n\t"
         "sbcs r11, r11, r12\n\t"
-        "rsb r12, r12, #0\n\t"
-        "sbc r12, r12, #0\n\t"
+        "sbc r2, r2, r2\n\t"
+        "sub r12, r12, r2\n\t"
+        "subs r4, r4, r12\n\t"
+        "sbcs r5, r5, r12\n\t"
+        "sbcs r6, r6, r12\n\t"
+        "sbcs r7, r7, #0\n\t"
+        "sbcs r8, r8, #0\n\t"
+        "sbcs r9, r9, #0\n\t"
+        "sbcs r10, r10, r12, LSR #31\n\t"
+        "sbc r11, r11, r12\n\t"
         "ldm %[a]!, {r2, r3}\n\t"
         "adds r4, r4, r2\n\t"
         "adcs r5, r5, r3\n\t"
diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c
index a4bfec2b3..f2103078f 100644
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "SBCS r9, r9, #0x0\n\t"
         "SBCS r10, r10, r12, LSR #31\n\t"
         "SBCS r11, r11, r12\n\t"
-        "RSB r12, r12, #0x0\n\t"
-        "SBC r12, r12, #0x0\n\t"
+        "SBC r2, r2, r2\n\t"
+        "SUB r12, r12, r2\n\t"
+        "SUBS r4, r4, r12\n\t"
+        "SBCS r5, r5, r12\n\t"
+        "SBCS r6, r6, r12\n\t"
+        "SBCS r7, r7, #0x0\n\t"
+        "SBCS r8, r8, #0x0\n\t"
+        "SBCS r9, r9, #0x0\n\t"
+        "SBCS r10, r10, r12, LSR #31\n\t"
+        "SBC r11, r11, r12\n\t"
         "LDM %[a]!, {r2, r3}\n\t"
         "ADDS r4, r4, r2\n\t"
         "ADCS r5, r5, r3\n\t"
diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c
index c53dab1ff..6b7bba78e 100644
--- a/wolfcrypt/src/sp_x86_64.c
+++ b/wolfcrypt/src/sp_x86_64.c
@@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
+extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
     /* X = T1 * T1 */
     sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
     /* X = X - 2*Y */
-    sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
     /* Y = Y - X */
-    sp_256_mont_sub_4(y, y, x, p256_mod);
+    sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
     /* Y = Y * T1 */
     sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
     /* Y = Y - T2 */
@@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 14:
         /* X = X - 2*Y */
-        sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
+        /* Y = Y - X */
+        sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 15;
         break;
     case 15:
         ctx->state = 16;
         break;
     case 16:
-        /* Y = Y - X */
-        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 17;
         break;
     case 17:
@@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256*
r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#ifdef __cplusplus -extern "C" { -#endif -extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); -#ifdef __cplusplus -} -#endif /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, x, t5, p256_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(y, y, x, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod); sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t5, p256_mod); { @@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 20; break; case 20: - sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 21; break; case 21: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r, x = r[j].x; /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; @@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m #ifdef __cplusplus } #endif -#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4 +#define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4 /* Double the Montgomery form projective point p. * * r Result of doubling point. 
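The sp_x86_64.c hunks above fold the old sp_256_mont_sub_dbl_4 / sp_256_mont_sub_4 pair into one helper. From the call sites and the new prototype (b is no longer const), sp_256_mont_rsb_sub_dbl_4 is expected to compute r = a - 2*b mod m and then reverse-subtract the result back into b. A reference sketch of that assumed semantics, written with helpers already declared in sp_x86_64.c (illustration only, not the shipped assembly routine):

/* Reference semantics only: the real sp_256_mont_rsb_sub_dbl_4 is a single
 * assembly routine; this sketch spells out what the fused operation is
 * assumed to compute, using helpers that already exist in sp_x86_64.c. */
static void sp_256_mont_rsb_sub_dbl_4_ref(sp_digit* r, const sp_digit* a,
    sp_digit* b, const sp_digit* m)
{
    sp_digit t[4];

    sp_256_mont_dbl_4(t, b, m);     /* t = 2*b mod m                   */
    sp_256_mont_sub_4(r, a, t, m);  /* r = a - 2*b mod m (X = X - 2*Y) */
    sp_256_mont_sub_4(b, b, r, m);  /* b = b - r mod m   (Y = Y - X)   */
}

Fusing the two steps removes a call and lets the assembly keep the intermediate values in registers across both subtractions.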
@@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p, /* X = T1 * T1 */ sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod); /* X = X - 2*Y */ - sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_avx2_4(y, y, x, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 14: /* X = X - 2*Y */ - sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* Y = Y - X */ + sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 15; break; case 15: ctx->state = 16; break; case 16: - /* Y = Y - X */ - sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i, sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); + sp_256_mont_dbl_avx2_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i, sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); + sp_256_mont_dbl_avx2_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r, sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(x, x, t5, p256_mod); sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_avx2_4(y, y, x, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod); sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, y, t5, p256_mod); { @@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 20; break; case 20: - sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 21; break; case 21: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r, x = r[j].x; /* X = A^2 - 2B */ sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod); /* B = 2.(B 
- X) */ - sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); + sp_256_mont_dbl_avx2_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; @@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t2, t2, t1, p256_mod); - sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_4(t3, t3, x, p256_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, t3, t1, p256_mod); @@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod); - sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod); + sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod); sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod); diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index 0fbacd68a..310f4611d 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -55378,11 +55378,12 @@ _sp_256_mul_avx2_4: pushq %rbx movq %rdx, %rbp movq (%rsi), %rdx + movq 8(%rbp), %r14 # A[0] * B[0] mulxq (%rbp), %r8, %r9 xorq %rbx, %rbx # A[0] * B[1] - mulxq 8(%rbp), %rax, %r10 + mulxq %r14, %rax, %r10 adcxq %rax, %r9 # A[0] * B[2] mulxq 16(%rbp), %rax, %r11 @@ -55397,7 +55398,7 @@ _sp_256_mul_avx2_4: xorq %rbx, %rbx adcxq %rax, %r9 # A[1] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r10 adcxq %rax, %r10 # A[1] * B[2] @@ -55416,7 +55417,7 @@ _sp_256_mul_avx2_4: xorq %rbx, %rbx adcxq %rax, %r10 # A[2] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r11 adcxq %rax, %r11 # A[2] * B[2] @@ -55981,11 +55982,10 @@ _sp_256_mont_mul_4: adcq $0x00, %rbx sbbq $0x00, %r9 movq $0xffffffff00000001, %rsi - movq %r9, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - shrq $32, %rax # m[2] = 0 & mask = 0 + movl %r9d, %eax andq %r9, %rsi subq %r9, %r13 sbbq %rax, %r14 @@ -56163,11 +56163,10 @@ _sp_256_mont_sqr_4: adcq $0x00, %r15 sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi - movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - shrq $32, %rax # m[2] = 0 & mask = 0 + movl %r8d, %eax andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 @@ -56388,11 +56387,10 @@ _sp_256_mont_reduce_4: adcq $0x00, %r15 sbbq $0x00, %r8 movq $0xffffffff00000001, %rbx - movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - shrq $32, %rax # m[2] = 0 & mask = 0 + movl %r8d, %eax andq %r8, %rbx subq %r8, %r12 sbbq %rax, %r13 @@ -56543,13 +56541,12 @@ _sp_256_mont_add_4: movq 16(%rsi), %r8 movq 24(%rsi), %r9 addq (%rdx), %rax - movq $0xffffffff, %r10 adcq 8(%rdx), %rcx movq $0xffffffff00000001, %r11 adcq 16(%rdx), %r8 adcq 24(%rdx), %r9 sbbq %rsi, %rsi - andq %rsi, %r10 + movl %esi, %r10d andq %rsi, %r11 subq %rsi, %rax sbbq %r10, %rcx @@ -56593,13 +56590,13 @@ _sp_256_mont_dbl_4: movq 16(%rsi), %rcx movq 24(%rsi), %r8 addq %rdx, %rdx - movq $0xffffffff, %r9 adcq %rax, %rax movq 
$0xffffffff00000001, %r10 adcq %rcx, %rcx + movq %r8, %r11 adcq %r8, %r8 - sbbq %r11, %r11 - andq %r11, %r9 + sarq $63, %r11 + movl %r11d, %r9d andq %r11, %r10 subq %r11, %rdx sbbq %r9, %rax @@ -56643,13 +56640,12 @@ _sp_256_mont_tpl_4: movq 16(%rsi), %rcx movq 24(%rsi), %r8 addq %rdx, %rdx - movq $0xffffffff, %r9 adcq %rax, %rax movq $0xffffffff00000001, %r10 adcq %rcx, %rcx adcq %r8, %r8 sbbq %r11, %r11 - andq %r11, %r9 + movl %r11d, %r9d andq %r11, %r10 subq %r11, %rdx sbbq %r9, %rax @@ -56663,13 +56659,12 @@ _sp_256_mont_tpl_4: sbbq $0x00, %rcx sbbq %r10, %r8 addq (%rsi), %rdx - movq $0xffffffff, %r9 adcq 8(%rsi), %rax movq $0xffffffff00000001, %r10 adcq 16(%rsi), %rcx adcq 24(%rsi), %r8 - sbbq %r11, %r11 - andq %r11, %r9 + sbbq $0x00, %r11 + movl %r11d, %r9d andq %r11, %r10 subq %r11, %rdx sbbq %r9, %rax @@ -56714,13 +56709,12 @@ _sp_256_mont_sub_4: movq 16(%rsi), %r8 movq 24(%rsi), %r9 subq (%rdx), %rax - movq $0xffffffff, %r10 sbbq 8(%rdx), %rcx movq $0xffffffff00000001, %r11 sbbq 16(%rdx), %r8 sbbq 24(%rdx), %r9 sbbq %rsi, %rsi - andq %rsi, %r10 + movl %esi, %r10d andq %rsi, %r11 addq %rsi, %rax adcq %r10, %rcx @@ -56797,15 +56791,15 @@ _sp_256_div2_4: */ #ifndef __APPLE__ .text -.globl sp_256_mont_sub_dbl_4 -.type sp_256_mont_sub_dbl_4,@function +.globl sp_256_mont_rsb_sub_dbl_4 +.type sp_256_mont_rsb_sub_dbl_4,@function .align 16 -sp_256_mont_sub_dbl_4: +sp_256_mont_rsb_sub_dbl_4: #else .section __TEXT,__text -.globl _sp_256_mont_sub_dbl_4 +.globl _sp_256_mont_rsb_sub_dbl_4 .p2align 4 -_sp_256_mont_sub_dbl_4: +_sp_256_mont_rsb_sub_dbl_4: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -56820,42 +56814,40 @@ _sp_256_mont_sub_dbl_4: movq 16(%rdx), %r12 movq 24(%rdx), %r13 addq %r10, %r10 - movq $0xffffffff, %r14 adcq %r11, %r11 movq $0xffffffff00000001, %r15 adcq %r12, %r12 adcq %r13, %r13 - sbbq %rdx, %rdx - andq %rdx, %r14 - andq %rdx, %r15 - subq %rdx, %r10 + sbbq %rsi, %rsi + movl %esi, %r14d + andq %rsi, %r15 + subq %rsi, %r10 sbbq %r14, %r11 sbbq $0x00, %r12 sbbq %r15, %r13 - adcq $0x00, %rdx - andq %rdx, %r14 - andq %rdx, %r15 - subq %rdx, %r10 + adcq $0x00, %rsi + andq %rsi, %r14 + andq %rsi, %r15 + subq %rsi, %r10 sbbq %r14, %r11 sbbq $0x00, %r12 sbbq %r15, %r13 subq %r10, %rax - movq $0xffffffff, %r14 sbbq %r11, %rcx movq $0xffffffff00000001, %r15 sbbq %r12, %r8 sbbq %r13, %r9 - sbbq %rdx, %rdx - andq %rdx, %r14 - andq %rdx, %r15 - addq %rdx, %rax + sbbq $0x00, %rsi + movl %esi, %r14d + andq %rsi, %r15 + addq %rsi, %rax adcq %r14, %rcx adcq $0x00, %r8 adcq %r15, %r9 - adcq $0x00, %rdx - andq %rdx, %r14 - andq %rdx, %r15 - addq %rdx, %rax + adcq $0x00, %rsi + andq %rsi, %r14 + andq %rsi, %r15 + addq %rsi, %rax adcq %r14, %rcx movq %rax, (%rdi) adcq $0x00, %r8 @@ -56863,73 +56855,40 @@ _sp_256_mont_sub_dbl_4: adcq %r15, %r9 movq %r8, 16(%rdi) movq %r9, 24(%rdi) + movq (%rdx), %r10 + movq 8(%rdx), %r11 + movq 16(%rdx), %r12 + movq 24(%rdx), %r13 + subq %rax, %r10 + sbbq %rcx, %r11 + movq $0xffffffff00000001, %r15 + sbbq %r8, %r12 + sbbq %r9, %r13 + sbbq %rsi, %rsi + movl %esi, %r14d + andq %rsi, %r15 + addq %rsi, %r10 + adcq %r14, %r11 + adcq $0x00, %r12 + adcq %r15, %r13 + adcq $0x00, %rsi + andq %rsi, %r14 + andq %rsi, %r15 + addq %rsi, %r10 + adcq %r14, %r11 + movq %r10, (%rdx) + adcq $0x00, %r12 + movq %r11, 8(%rdx) + adcq %r15, %r13 + movq %r12, 16(%rdx) + movq %r13, 24(%rdx) popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ -.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4 -#endif /* __APPLE__ */ -/* Two Montgomery numbers, subtract second from 
first and double. - * (r = 2.(a - b) % m). - * - * b must have came from a mont_sub operation. - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -#ifndef __APPLE__ -.text -.globl sp_256_mont_dbl_sub_4 -.type sp_256_mont_dbl_sub_4,@function -.align 16 -sp_256_mont_dbl_sub_4: -#else -.section __TEXT,__text -.globl _sp_256_mont_dbl_sub_4 -.p2align 4 -_sp_256_mont_dbl_sub_4: -#endif /* __APPLE__ */ - movq (%rsi), %rax - movq 8(%rsi), %rcx - movq 16(%rsi), %r8 - movq 24(%rsi), %r9 - subq (%rdx), %rax - movq $0xffffffff, %r10 - sbbq 8(%rdx), %rcx - movq $0xffffffff00000001, %r11 - sbbq 16(%rdx), %r8 - sbbq 24(%rdx), %r9 - sbbq %rdx, %rdx - andq %rdx, %r10 - andq %rdx, %r11 - addq %rdx, %rax - adcq %r10, %rcx - adcq $0x00, %r8 - adcq %r11, %r9 - addq %rax, %rax - movq $0xffffffff, %r10 - adcq %rcx, %rcx - movq $0xffffffff00000001, %r11 - adcq %r8, %r8 - adcq %r9, %r9 - sbbq %rdx, %rdx - andq %rdx, %r10 - andq %rdx, %r11 - subq %rdx, %rax - sbbq %r10, %rcx - movq %rax, (%rdi) - sbbq $0x00, %r8 - movq %rcx, 8(%rdi) - sbbq %r11, %r9 - movq %r8, 16(%rdi) - movq %r9, 24(%rdi) - repz retq -#ifndef __APPLE__ -.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4 +.size sp_256_mont_rsb_sub_dbl_4,.-sp_256_mont_rsb_sub_dbl_4 #endif /* __APPLE__ */ #ifndef WC_NO_CACHE_RESISTANT /* Touch each possible point that could be being copied. @@ -57085,11 +57044,12 @@ _sp_256_mont_mul_avx2_4: pushq %rbx movq %rdx, %rbp movq (%rsi), %rdx + movq 8(%rbp), %r14 # A[0] * B[0] mulxq (%rbp), %r8, %r9 xorq %rbx, %rbx # A[0] * B[1] - mulxq 8(%rbp), %rax, %r10 + mulxq %r14, %rax, %r10 adcxq %rax, %r9 # A[0] * B[2] mulxq 16(%rbp), %rax, %r11 @@ -57104,7 +57064,7 @@ _sp_256_mont_mul_avx2_4: xorq %rbx, %rbx adcxq %rax, %r9 # A[1] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r10 adcxq %rax, %r10 # A[1] * B[2] @@ -57123,7 +57083,7 @@ _sp_256_mont_mul_avx2_4: xorq %rbx, %rbx adcxq %rax, %r10 # A[2] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r11 adcxq %rax, %r11 # A[2] * B[2] @@ -57213,11 +57173,10 @@ _sp_256_mont_mul_avx2_4: adcq $0x00, %r15 sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi - movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - shrq $32, %rax # m[2] = 0 & mask = 0 + movl %r8d, %eax andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 @@ -57378,11 +57337,10 @@ _sp_256_mont_sqr_avx2_4: adcq $0x00, %r15 sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi - movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - shrq $32, %rax # m[2] = 0 & mask = 0 + movl %r8d, %eax andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 @@ -58352,11 +58310,12 @@ _sp_256_mont_mul_order_avx2_4: pushq %rbx movq %rdx, %rbp movq (%rsi), %rdx + movq 8(%rbp), %r14 # A[0] * B[0] mulxq (%rbp), %r8, %r9 xorq %rbx, %rbx # A[0] * B[1] - mulxq 8(%rbp), %rax, %r10 + mulxq %r14, %rax, %r10 adcxq %rax, %r9 # A[0] * B[2] mulxq 16(%rbp), %rax, %r11 @@ -58371,7 +58330,7 @@ _sp_256_mont_mul_order_avx2_4: xorq %rbx, %rbx adcxq %rax, %r9 # A[1] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r10 adcxq %rax, %r10 # A[1] * B[2] @@ -58390,7 +58349,7 @@ _sp_256_mont_mul_order_avx2_4: xorq %rbx, %rbx adcxq %rax, %r10 # A[2] * B[1] - mulxq 8(%rbp), %rax, %r15 + mulxq %r14, %rax, %r15 adoxq %rcx, %r11 adcxq %rax, %r11 # A[2] * B[2] @@ -60601,11 +60560,10 @@ _sp_384_mont_reduce_6: # Subtract mod if carry negq %r10 movq $0xfffffffffffffffe, %r9 
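A recurring x86_64 change in this patch replaces the pair `movq $0xffffffff, reg` + `andq mask, reg` (and likewise `movq mask, reg` + `shrq $32, reg`) with a single 32-bit `movl` from the mask register. This works because the mask produced by `sbbq reg, reg` is either 0 or all ones, so its zero-extended low half already equals both of the values the longer sequences computed. A stand-alone check of that identity (illustrative C, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int borrow;

    for (borrow = 0; borrow <= 1; borrow++) {
        /* sbbq %r, %r after a carry/borrow leaves 0 or all ones. */
        uint64_t mask = borrow ? UINT64_MAX : 0;

        /* Old sequences. */
        uint64_t and_word = 0xffffffffULL & mask;  /* movq $0xffffffff; andq */
        uint64_t shr_word = mask >> 32;            /* movq mask; shrq $32    */

        /* New sequence: a 32-bit mov zero-extends the low half of the mask. */
        uint64_t mov_word = (uint32_t)mask;

        assert(and_word == mov_word);
        assert(shr_word == mov_word);
    }
    return 0;
}

The shorter form avoids materialising the constant and drops a dependent shift, which appears to be where the "minor speed improvement" in the subject line comes from.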
- movq %r10, %rcx + movl %r10d, %ecx movq %r10, %r8 - shrq $32, %rcx - shlq $32, %r8 andq %r10, %r9 + shlq $32, %r8 subq %rcx, %rbx sbbq %r8, %rbp sbbq %r9, %r11 @@ -60851,7 +60809,6 @@ _sp_384_mont_add_6: movq 32(%rsi), %r10 movq 40(%rsi), %r11 addq (%rdx), %rax - movq $0xffffffff, %r12 adcq 8(%rdx), %rcx movq $0xffffffff00000000, %r13 adcq 16(%rdx), %r8 @@ -60860,7 +60817,7 @@ _sp_384_mont_add_6: adcq 32(%rdx), %r10 adcq 40(%rdx), %r11 sbbq %rsi, %rsi - andq %rsi, %r12 + movl %esi, %r12d andq %rsi, %r13 andq %rsi, %r14 subq %r12, %rax @@ -60920,16 +60877,16 @@ _sp_384_mont_dbl_6: movq 32(%rsi), %r9 movq 40(%rsi), %r10 addq %rdx, %rdx - movq $0xffffffff, %r11 adcq %rax, %rax movq $0xffffffff00000000, %r12 adcq %rcx, %rcx movq $0xfffffffffffffffe, %r13 adcq %r8, %r8 adcq %r9, %r9 + movq %r10, %r14 adcq %r10, %r10 - sbbq %r14, %r14 - andq %r14, %r11 + sarq $63, %r14 + movl %r14d, %r11d andq %r14, %r12 andq %r14, %r13 subq %r11, %rdx @@ -60989,7 +60946,6 @@ _sp_384_mont_tpl_6: movq 32(%rsi), %r9 movq 40(%rsi), %r10 addq %rdx, %rdx - movq $0xffffffff, %r11 adcq %rax, %rax movq $0xffffffff00000000, %r12 adcq %rcx, %rcx @@ -60998,7 +60954,7 @@ _sp_384_mont_tpl_6: adcq %r9, %r9 adcq %r10, %r10 sbbq %r14, %r14 - andq %r14, %r11 + movl %r14d, %r11d andq %r14, %r12 andq %r14, %r13 subq %r11, %rdx @@ -61019,7 +60975,6 @@ _sp_384_mont_tpl_6: sbbq %r14, %r9 sbbq %r14, %r10 addq (%rsi), %rdx - movq $0xffffffff, %r11 adcq 8(%rsi), %rax movq $0xffffffff00000000, %r12 adcq 16(%rsi), %rcx @@ -61028,7 +60983,7 @@ _sp_384_mont_tpl_6: adcq 32(%rsi), %r9 adcq 40(%rsi), %r10 sbbq %r14, %r14 - andq %r14, %r11 + movl %r14d, %r11d andq %r14, %r12 andq %r14, %r13 subq %r11, %rdx @@ -61089,7 +61044,6 @@ _sp_384_mont_sub_6: movq 32(%rsi), %r10 movq 40(%rsi), %r11 subq (%rdx), %rax - movq $0xffffffff, %r12 sbbq 8(%rdx), %rcx movq $0xffffffff00000000, %r13 sbbq 16(%rdx), %r8 @@ -61098,7 +61052,7 @@ _sp_384_mont_sub_6: sbbq 32(%rdx), %r10 sbbq 40(%rdx), %r11 sbbq %rsi, %rsi - andq %rsi, %r12 + movl %esi, %r12d andq %rsi, %r13 andq %rsi, %r14 addq %r12, %rax diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index 1f42b2755..5bb8faa21 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -54329,11 +54329,12 @@ sp_256_mul_avx2_4 PROC mov rbp, r8 mov rax, rdx mov rdx, QWORD PTR [rax] + mov r14, QWORD PTR [rbp+8] ; A[0] * B[0] mulx r9, r8, QWORD PTR [rbp] xor rbx, rbx ; A[0] * B[1] - mulx r10, rdi, QWORD PTR [rbp+8] + mulx r10, rdi, r14 adcx r9, rdi ; A[0] * B[2] mulx r11, rdi, QWORD PTR [rbp+16] @@ -54348,7 +54349,7 @@ sp_256_mul_avx2_4 PROC xor rbx, rbx adcx r9, rdi ; A[1] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r10, rsi adcx r10, rdi ; A[1] * B[2] @@ -54367,7 +54368,7 @@ sp_256_mul_avx2_4 PROC xor rbx, rbx adcx r10, rdi ; A[2] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r11, rsi adcx r11, rdi ; A[2] * B[2] @@ -54884,11 +54885,10 @@ sp_256_mont_mul_4 PROC adc rbx, 0 sbb r11, 0 mov r10, 18446744069414584321 - mov rax, r11 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - shr rax, 32 ; m[2] = 0 & mask = 0 + mov eax, r11d and r10, r11 sub r15, r11 sbb rdi, rax @@ -55060,11 +55060,10 @@ sp_256_mont_sqr_4 PROC adc rsi, 0 sbb r10, 0 mov r8, 18446744069414584321 - mov rax, r10 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - shr rax, 32 ; m[2] = 0 & mask = 0 + mov eax, r10d and r8, r10 sub r14, r10 sbb r15, rax @@ -55263,11 +55262,10 @@ sp_256_mont_reduce_4 PROC adc rdi, 0 sbb r9, 
0 mov rbx, 18446744069414584321 - mov rax, r9 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - shr rax, 32 ; m[2] = 0 & mask = 0 + mov eax, r9d and rbx, r9 sub r13, r9 sbb r14, rax @@ -55404,13 +55402,12 @@ sp_256_mont_add_4 PROC mov r10, QWORD PTR [rdx+16] mov r11, QWORD PTR [rdx+24] add rax, QWORD PTR [r8] - mov r12, 4294967295 adc r9, QWORD PTR [r8+8] mov r13, 18446744069414584321 adc r10, QWORD PTR [r8+16] adc r11, QWORD PTR [r8+24] sbb rdx, rdx - and r12, rdx + mov r12d, edx and r13, rdx sub rax, rdx sbb r9, r12 @@ -55447,13 +55444,13 @@ sp_256_mont_dbl_4 PROC mov r9, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+24] add rax, rax - mov r11, 4294967295 adc r8, r8 mov r12, 18446744069414584321 adc r9, r9 + mov r13, r10 adc r10, r10 - sbb r13, r13 - and r11, r13 + sar r13, 63 + mov r11d, r13d and r12, r13 sub rax, r13 sbb r8, r11 @@ -55490,13 +55487,12 @@ sp_256_mont_tpl_4 PROC mov r9, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+24] add rax, rax - mov r11, 4294967295 adc r8, r8 mov r12, 18446744069414584321 adc r9, r9 adc r10, r10 sbb r13, r13 - and r11, r13 + mov r11d, r13d and r12, r13 sub rax, r13 sbb r8, r11 @@ -55510,13 +55506,12 @@ sp_256_mont_tpl_4 PROC sbb r9, 0 sbb r10, r12 add rax, QWORD PTR [rdx] - mov r11, 4294967295 adc r8, QWORD PTR [rdx+8] mov r12, 18446744069414584321 adc r9, QWORD PTR [rdx+16] adc r10, QWORD PTR [rdx+24] - sbb r13, r13 - and r11, r13 + sbb r13, 0 + mov r11d, r13d and r12, r13 sub rax, r13 sbb r8, r11 @@ -55554,13 +55549,12 @@ sp_256_mont_sub_4 PROC mov r10, QWORD PTR [rdx+16] mov r11, QWORD PTR [rdx+24] sub rax, QWORD PTR [r8] - mov r12, 4294967295 sbb r9, QWORD PTR [r8+8] mov r13, 18446744069414584321 sbb r10, QWORD PTR [r8+16] sbb r11, QWORD PTR [r8+24] sbb rdx, rdx - and r12, rdx + mov r12d, edx and r13, rdx add rax, rdx adc r9, r12 @@ -55630,7 +55624,7 @@ _text ENDS ; * m Modulus (prime). 
; */ _text SEGMENT READONLY PARA -sp_256_mont_sub_dbl_4 PROC +sp_256_mont_rsb_sub_dbl_4 PROC push r12 push r13 push r14 @@ -55646,42 +55640,40 @@ sp_256_mont_sub_dbl_4 PROC mov r14, QWORD PTR [r8+16] mov r15, QWORD PTR [r8+24] add r12, r12 - mov rdi, 4294967295 adc r13, r13 mov rsi, 18446744069414584321 adc r14, r14 adc r15, r15 - sbb r8, r8 - and rdi, r8 - and rsi, r8 - sub r12, r8 + sbb rdx, rdx + mov edi, edx + and rsi, rdx + sub r12, rdx sbb r13, rdi sbb r14, 0 sbb r15, rsi - adc r8, 0 - and rdi, r8 - and rsi, r8 - sub r12, r8 + adc rdx, 0 + and rdi, rdx + and rsi, rdx + sub r12, rdx sbb r13, rdi sbb r14, 0 sbb r15, rsi sub rax, r12 - mov rdi, 4294967295 sbb r9, r13 mov rsi, 18446744069414584321 sbb r10, r14 sbb r11, r15 - sbb r8, r8 - and rdi, r8 - and rsi, r8 - add rax, r8 + sbb rdx, 0 + mov edi, edx + and rsi, rdx + add rax, rdx adc r9, rdi adc r10, 0 adc r11, rsi - adc r8, 0 - and rdi, r8 - and rsi, r8 - add rax, r8 + adc rdx, 0 + and rdi, rdx + and rsi, rdx + add rax, rdx adc r9, rdi mov QWORD PTR [rcx], rax adc r10, 0 @@ -55689,6 +55681,33 @@ sp_256_mont_sub_dbl_4 PROC adc r11, rsi mov QWORD PTR [rcx+16], r10 mov QWORD PTR [rcx+24], r11 + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + sub r12, rax + sbb r13, r9 + mov rsi, 18446744069414584321 + sbb r14, r10 + sbb r15, r11 + sbb rdx, rdx + mov edi, edx + and rsi, rdx + add r12, rdx + adc r13, rdi + adc r14, 0 + adc r15, rsi + adc rdx, 0 + and rdi, rdx + and rsi, rdx + add r12, rdx + adc r13, rdi + mov QWORD PTR [r8], r12 + adc r14, 0 + mov QWORD PTR [r8+8], r13 + adc r15, rsi + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 pop rsi pop rdi pop r15 @@ -55696,60 +55715,7 @@ sp_256_mont_sub_dbl_4 PROC pop r13 pop r12 ret -sp_256_mont_sub_dbl_4 ENDP -_text ENDS -; /* Two Montgomery numbers, subtract second from first and double. -; * (r = 2.(a - b) % m). -; * -; * b must have came from a mont_sub operation. -; * -; * r Result of subtration. -; * a Number to subtract from in Montgomery form. -; * b Number to subtract with in Montgomery form. -; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_256_mont_dbl_sub_4 PROC - push r12 - push r13 - mov rax, QWORD PTR [rdx] - mov r9, QWORD PTR [rdx+8] - mov r10, QWORD PTR [rdx+16] - mov r11, QWORD PTR [rdx+24] - sub rax, QWORD PTR [r8] - mov r12, 4294967295 - sbb r9, QWORD PTR [r8+8] - mov r13, 18446744069414584321 - sbb r10, QWORD PTR [r8+16] - sbb r11, QWORD PTR [r8+24] - sbb r8, r8 - and r12, r8 - and r13, r8 - add rax, r8 - adc r9, r12 - adc r10, 0 - adc r11, r13 - add rax, rax - mov r12, 4294967295 - adc r9, r9 - mov r13, 18446744069414584321 - adc r10, r10 - adc r11, r11 - sbb r8, r8 - and r12, r8 - and r13, r8 - sub rax, r8 - sbb r9, r12 - mov QWORD PTR [rcx], rax - sbb r10, 0 - mov QWORD PTR [rcx+8], r9 - sbb r11, r13 - mov QWORD PTR [rcx+16], r10 - mov QWORD PTR [rcx+24], r11 - pop r13 - pop r12 - ret -sp_256_mont_dbl_sub_4 ENDP +sp_256_mont_rsb_sub_dbl_4 ENDP _text ENDS IFNDEF WC_NO_CACHE_RESISTANT ; /* Touch each possible point that could be being copied. 
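The routines touched here all use the same branch-free correction: the final carry or borrow is smeared into a 0/all-ones mask, and the P-256 modulus ANDed with that mask is added back (or subtracted) in one pass. A hedged C sketch of the subtraction case, assuming four little-endian 64-bit limbs and unsigned __int128 support (illustration of the pattern only, not the wolfSSL implementation):

#include <stdint.h>

/* r = a - b, with p256 added back once, under a mask, when the raw
 * subtraction borrows - the same single correction the assembly performs. */
static void p256_sub_mod_sketch(uint64_t r[4], const uint64_t a[4],
    const uint64_t b[4])
{
    unsigned __int128 t;
    unsigned __int128 carry = 0;
    uint64_t borrow = 0;
    uint64_t mask;
    uint64_t m[4];
    int i;

    for (i = 0; i < 4; i++) {
        t = (unsigned __int128)a[i] - b[i] - borrow;
        r[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;   /* 1 when this limb borrowed */
    }

    mask = 0 - borrow;                      /* 0 or all ones             */
    m[0] = mask;                            /* p[0] = 2^64 - 1           */
    m[1] = (uint32_t)mask;                  /* p[1] = 2^32 - 1           */
    m[2] = 0;                               /* p[2] = 0                  */
    m[3] = 0xffffffff00000001ULL & mask;    /* p[3] = 2^64 - 2^32 + 1    */

    for (i = 0; i < 4; i++) {
        carry += (unsigned __int128)r[i] + m[i];
        r[i] = (uint64_t)carry;
        carry >>= 64;
    }
}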
@@ -55908,11 +55874,12 @@ sp_256_mont_mul_avx2_4 PROC mov rbp, r8 mov rax, rdx mov rdx, QWORD PTR [rax] + mov r14, QWORD PTR [rbp+8] ; A[0] * B[0] mulx r9, r8, QWORD PTR [rbp] xor rbx, rbx ; A[0] * B[1] - mulx r10, rdi, QWORD PTR [rbp+8] + mulx r10, rdi, r14 adcx r9, rdi ; A[0] * B[2] mulx r11, rdi, QWORD PTR [rbp+16] @@ -55927,7 +55894,7 @@ sp_256_mont_mul_avx2_4 PROC xor rbx, rbx adcx r9, rdi ; A[1] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r10, rsi adcx r10, rdi ; A[1] * B[2] @@ -55946,7 +55913,7 @@ sp_256_mont_mul_avx2_4 PROC xor rbx, rbx adcx r10, rdi ; A[2] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r11, rsi adcx r11, rdi ; A[2] * B[2] @@ -56036,11 +56003,10 @@ sp_256_mont_mul_avx2_4 PROC adc r15, 0 sbb r8, 0 mov rax, 18446744069414584321 - mov rdi, r8 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - shr rdi, 32 ; m[2] = 0 & mask = 0 + mov edi, r8d and rax, r8 sub r12, r8 sbb r13, rdi @@ -56195,11 +56161,10 @@ sp_256_mont_sqr_avx2_4 PROC adc r15, 0 sbb r8, 0 mov rax, 18446744069414584321 - mov rdi, r8 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - shr rdi, 32 ; m[2] = 0 & mask = 0 + mov edi, r8d and rax, r8 sub r12, r8 sbb r13, rdi @@ -57053,11 +57018,12 @@ sp_256_mont_mul_order_avx2_4 PROC mov rbp, r8 mov rax, rdx mov rdx, QWORD PTR [rax] + mov r14, QWORD PTR [rbp+8] ; A[0] * B[0] mulx r9, r8, QWORD PTR [rbp] xor rbx, rbx ; A[0] * B[1] - mulx r10, rdi, QWORD PTR [rbp+8] + mulx r10, rdi, r14 adcx r9, rdi ; A[0] * B[2] mulx r11, rdi, QWORD PTR [rbp+16] @@ -57072,7 +57038,7 @@ sp_256_mont_mul_order_avx2_4 PROC xor rbx, rbx adcx r9, rdi ; A[1] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r10, rsi adcx r10, rdi ; A[1] * B[2] @@ -57091,7 +57057,7 @@ sp_256_mont_mul_order_avx2_4 PROC xor rbx, rbx adcx r10, rdi ; A[2] * B[1] - mulx r15, rdi, QWORD PTR [rbp+8] + mulx r15, rdi, r14 adox r11, rsi adcx r11, rdi ; A[2] * B[2] @@ -59213,11 +59179,10 @@ sp_384_mont_reduce_6 PROC ; Subtract mod if carry neg r11 mov r10, 18446744073709551614 - mov r8, r11 + mov r8d, r11d mov r9, r11 - shr r8, 32 - shl r9, 32 and r10, r11 + shl r9, 32 sub rbx, r8 sbb rbp, r9 sbb r12, r10 @@ -59436,7 +59401,6 @@ sp_384_mont_add_6 PROC mov r12, QWORD PTR [rdx+32] mov r13, QWORD PTR [rdx+40] add rax, QWORD PTR [r8] - mov r14, 4294967295 adc r9, QWORD PTR [r8+8] mov r15, 18446744069414584320 adc r10, QWORD PTR [r8+16] @@ -59445,7 +59409,7 @@ sp_384_mont_add_6 PROC adc r12, QWORD PTR [r8+32] adc r13, QWORD PTR [r8+40] sbb rdx, rdx - and r14, rdx + mov r14d, edx and r15, rdx and rdi, rdx sub rax, r14 @@ -59498,16 +59462,16 @@ sp_384_mont_dbl_6 PROC mov r11, QWORD PTR [rdx+32] mov r12, QWORD PTR [rdx+40] add rax, rax - mov r13, 4294967295 adc r8, r8 mov r14, 18446744069414584320 adc r9, r9 mov r15, 18446744073709551614 adc r10, r10 adc r11, r11 + mov rdi, r12 adc r12, r12 - sbb rdi, rdi - and r13, rdi + sar rdi, 63 + mov r13d, edi and r14, rdi and r15, rdi sub rax, r13 @@ -59560,7 +59524,6 @@ sp_384_mont_tpl_6 PROC mov r11, QWORD PTR [rdx+32] mov r12, QWORD PTR [rdx+40] add rax, rax - mov r13, 4294967295 adc r8, r8 mov r14, 18446744069414584320 adc r9, r9 @@ -59569,7 +59532,7 @@ sp_384_mont_tpl_6 PROC adc r11, r11 adc r12, r12 sbb rdi, rdi - and r13, rdi + mov r13d, edi and r14, rdi and r15, rdi sub rax, r13 @@ -59590,7 +59553,6 @@ sp_384_mont_tpl_6 PROC sbb r11, rdi sbb r12, rdi add rax, QWORD PTR [rdx] - mov r13, 4294967295 adc r8, QWORD PTR [rdx+8] mov r14, 18446744069414584320 adc r9, QWORD PTR [rdx+16] @@ 
-59599,7 +59561,7 @@ sp_384_mont_tpl_6 PROC adc r11, QWORD PTR [rdx+32] adc r12, QWORD PTR [rdx+40] sbb rdi, rdi - and r13, rdi + mov r13d, edi and r14, rdi and r15, rdi sub rax, r13 @@ -59653,7 +59615,6 @@ sp_384_mont_sub_6 PROC mov r12, QWORD PTR [rdx+32] mov r13, QWORD PTR [rdx+40] sub rax, QWORD PTR [r8] - mov r14, 4294967295 sbb r9, QWORD PTR [r8+8] mov r15, 18446744069414584320 sbb r10, QWORD PTR [r8+16] @@ -59662,7 +59623,7 @@ sp_384_mont_sub_6 PROC sbb r12, QWORD PTR [r8+32] sbb r13, QWORD PTR [r8+40] sbb rdx, rdx - and r14, rdx + mov r14d, edx and r15, rdx and rdi, rdx add rax, r14
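The commit message's ARM32/Thumb2 note concerns the modular triple: it is a double followed by an add of the original value, and the ARM32/Thumb2 versions now run a second conditional reduction after the doubling part before that add. Expressed with the P-256 helpers declared in sp_x86_64.c (reference semantics only; the shipped sp_256_mont_tpl_4 and sp_384_mont_tpl_6 fuse the steps and perform their reductions inline):

/* Reference semantics only: the modular triple is a modular double
 * followed by a modular add of the original value. */
static void sp_256_mont_tpl_4_ref(sp_digit* r, const sp_digit* a,
    const sp_digit* m)
{
    sp_256_mont_dbl_4(r, a, m);     /* r = 2*a, conditionally reduced       */
    sp_256_mont_add_4(r, r, a, m);  /* r = 2*a + a, conditionally reduced   */
}

Keeping the doubled value within range before the add stage appears to be what the "safer code" remark in the subject refers to.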