Merge pull request #6776 from SparkiDev/sp_ecc_x64

SP ECC: x64 minor speed improvement
JacobBarthelmeh
2023-09-15 08:23:51 -06:00
committed by GitHub
5 changed files with 210 additions and 297 deletions
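
For reference, the new fused helper replaces the back-to-back sp_256_mont_sub_dbl_4 / sp_256_mont_sub_4 calls in the projective point routines. Below is a minimal portable sketch of its semantics only (not the assembly in this PR), assuming stand-in helpers mont_sub_4 and mont_dbl_4 that mirror sp_256_mont_sub_4 and sp_256_mont_dbl_4:

#include <stdint.h>

typedef uint64_t sp_digit;

/* Assumed stand-ins for sp_256_mont_sub_4 / sp_256_mont_dbl_4. */
extern void mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b,
                       const sp_digit* m);
extern void mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m);

/* Sketch of the fused operation: r = a - 2*b (mod m), then b = b - r (mod m). */
static void mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b,
                               const sp_digit* m)
{
    sp_digit t[4];               /* 4 x 64-bit words for P-256 */

    mont_dbl_4(t, b, m);         /* t = 2*b                                  */
    mont_sub_4(r, a, t, m);      /* r = a - 2*b                              */
    mont_sub_4(b, b, r, m);      /* b = b - r (reverse subtract, kept in b)  */
}

Because the fused routine already leaves b - r in b, callers that previously followed sp_256_mont_sub_dbl_4 with sp_256_mont_dbl_sub_4(b, b, x, m) now only need the plain sp_256_mont_dbl_4(b, b, m), which is why the b parameter is no longer const.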

View File

@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t"
"sbcs r11, r11, r12\n\t"
"rsb r12, r12, #0\n\t"
"sbc r12, r12, #0\n\t"
"sbc r2, r2, r2\n\t"
"sub r12, r12, r2\n\t"
"subs r4, r4, r12\n\t"
"sbcs r5, r5, r12\n\t"
"sbcs r6, r6, r12\n\t"
"sbcs r7, r7, #0\n\t"
"sbcs r8, r8, #0\n\t"
"sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t"
"sbc r11, r11, r12\n\t"
"ldm %[a]!, {r2, r3}\n\t"
"adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t"

View File

@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t"
"SBCS r11, r11, r12\n\t"
"RSB r12, r12, #0x0\n\t"
"SBC r12, r12, #0x0\n\t"
"SBC r2, r2, r2\n\t"
"SUB r12, r12, r2\n\t"
"SUBS r4, r4, r12\n\t"
"SBCS r5, r5, r12\n\t"
"SBCS r6, r6, r12\n\t"
"SBCS r7, r7, #0x0\n\t"
"SBCS r8, r8, #0x0\n\t"
"SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t"
"SBC r11, r11, r12\n\t"
"LDM %[a]!, {r2, r3}\n\t"
"ADDS r4, r4, r2\n\t"
"ADCS r5, r5, r3\n\t"

View File

@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
#ifdef __cplusplus
extern "C" {
#endif
extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
#ifdef __cplusplus
}
#endif
@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */
sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y - X */
sp_256_mont_sub_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y * T1 */
sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */
@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
break;
case 14:
/* X = X - 2*Y */
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y = Y - X */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15;
break;
case 15:
ctx->state = 16;
break;
case 16:
/* Y = Y - X */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17;
break;
case 17:
@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#ifdef __cplusplus
extern "C" {
#endif
extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
#ifdef __cplusplus
}
#endif
/* Double the Montgomery form projective point p a number of times.
*
* r Result of repeated doubling of point.
@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r,
sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(x, x, t5, p256_mod);
sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, y, t5, p256_mod);
{
@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
ctx->state = 20;
break;
case 20:
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21;
break;
case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22;
break;
case 22:
@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
x = r[j].x;
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z;
@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
#ifdef __cplusplus
}
#endif
#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4
#define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4
/* Double the Montgomery form projective point p.
*
* r Result of doubling point.
@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */
sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y - X */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y * T1 */
sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */
@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
break;
case 14:
/* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y = Y - X */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15;
break;
case 15:
ctx->state = 16;
break;
case 16:
/* Y = Y - X */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17;
break;
case 17:
@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
/* Double the Montgomery form projective point p a number of times.
*
* r Result of repeated doubling of point.
@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
{
@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
ctx->state = 20;
break;
case 20:
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21;
break;
case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22;
break;
case 22:
@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
x = r[j].x;
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z;
@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_4(t3, t3, x, p256_mod);
sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, t3, t1, p256_mod);
@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod);
sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod);

View File
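
Most of the x86_64 changes below replace the two-instruction mask setup (load the constant 0xffffffff, then AND it with the borrow mask) with a single movl from the mask register, and drop the movq+shrq pair in the Montgomery-reduce tails, since for an all-zero/all-one mask the low 32 bits already equal 0xffffffff AND mask. A rough C illustration of the branch-free conditional reduction these routines perform (illustrative only; word values follow the P-256 prime 2^256 - 2^224 + 2^192 + 2^96 - 1):

#include <stdint.h>

/* Fold the P-256 modulus back in when the preceding add/sub wrapped.
 * carry is 0 or 1, so mask is all zeros or all ones, and the masked
 * p[1] word is simply the low 32 bits of the mask (what movl yields). */
static void cond_sub_p256(uint64_t r[4], uint64_t carry)
{
    uint64_t mask = 0 - carry;                  /* 0 or 0xFFFF...FFFF */
    uint64_t m[4] = {
        mask,                                   /* p[0] = 2^64 - 1    */
        (uint32_t)mask,                         /* p[1] = 0xffffffff  */
        0,                                      /* p[2] = 0           */
        mask & 0xffffffff00000001ULL            /* p[3]               */
    };
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < 4; i++) {
        unsigned __int128 d = (unsigned __int128)r[i] - m[i] - borrow;
        r[i]   = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;       /* 1 if it wrapped */
    }
}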

@ -55378,11 +55378,12 @@ _sp_256_mul_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@ -55397,7 +55398,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@ -55416,7 +55417,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
@ -55981,11 +55982,10 @@ _sp_256_mont_mul_4:
adcq $0x00, %rbx
sbbq $0x00, %r9
movq $0xffffffff00000001, %rsi
movq %r9, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r9d, %eax
andq %r9, %rsi
subq %r9, %r13
sbbq %rax, %r14
@ -56163,11 +56163,10 @@ _sp_256_mont_sqr_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@ -56388,11 +56387,10 @@ _sp_256_mont_reduce_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rbx
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rbx
subq %r8, %r12
sbbq %rax, %r13
@ -56543,13 +56541,12 @@ _sp_256_mont_add_4:
movq 16(%rsi), %r8
movq 24(%rsi), %r9
addq (%rdx), %rax
movq $0xffffffff, %r10
adcq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
adcq 16(%rdx), %r8
adcq 24(%rdx), %r9
sbbq %rsi, %rsi
andq %rsi, %r10
movl %esi, %r10d
andq %rsi, %r11
subq %rsi, %rax
sbbq %r10, %rcx
@ -56593,13 +56590,13 @@ _sp_256_mont_dbl_4:
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax
movq $0xffffffff00000001, %r10
adcq %rcx, %rcx
movq %r8, %r11
adcq %r8, %r8
sbbq %r11, %r11
andq %r11, %r9
sarq $63, %r11
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@ -56643,13 +56640,12 @@ _sp_256_mont_tpl_4:
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax
movq $0xffffffff00000001, %r10
adcq %rcx, %rcx
adcq %r8, %r8
sbbq %r11, %r11
andq %r11, %r9
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@ -56663,13 +56659,12 @@ _sp_256_mont_tpl_4:
sbbq $0x00, %rcx
sbbq %r10, %r8
addq (%rsi), %rdx
movq $0xffffffff, %r9
adcq 8(%rsi), %rax
movq $0xffffffff00000001, %r10
adcq 16(%rsi), %rcx
adcq 24(%rsi), %r8
sbbq %r11, %r11
andq %r11, %r9
sbbq $0x00, %r11
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@ -56714,13 +56709,12 @@ _sp_256_mont_sub_4:
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9
sbbq %rsi, %rsi
andq %rsi, %r10
movl %esi, %r10d
andq %rsi, %r11
addq %rsi, %rax
adcq %r10, %rcx
@ -56797,15 +56791,15 @@ _sp_256_div2_4:
*/
#ifndef __APPLE__
.text
.globl sp_256_mont_sub_dbl_4
.type sp_256_mont_sub_dbl_4,@function
.globl sp_256_mont_rsb_sub_dbl_4
.type sp_256_mont_rsb_sub_dbl_4,@function
.align 16
sp_256_mont_sub_dbl_4:
sp_256_mont_rsb_sub_dbl_4:
#else
.section __TEXT,__text
.globl _sp_256_mont_sub_dbl_4
.globl _sp_256_mont_rsb_sub_dbl_4
.p2align 4
_sp_256_mont_sub_dbl_4:
_sp_256_mont_rsb_sub_dbl_4:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
@ -56820,42 +56814,40 @@ _sp_256_mont_sub_dbl_4:
movq 16(%rdx), %r12
movq 24(%rdx), %r13
addq %r10, %r10
movq $0xffffffff, %r14
adcq %r11, %r11
movq $0xffffffff00000001, %r15
adcq %r12, %r12
adcq %r13, %r13
sbbq %rdx, %rdx
andq %rdx, %r14
andq %rdx, %r15
subq %rdx, %r10
sbbq %rsi, %rsi
movl %esi, %r14d
andq %rsi, %r15
subq %rsi, %r10
sbbq %r14, %r11
sbbq $0x00, %r12
sbbq %r15, %r13
adcq $0x00, %rdx
andq %rdx, %r14
andq %rdx, %r15
subq %rdx, %r10
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
subq %rsi, %r10
sbbq %r14, %r11
sbbq $0x00, %r12
sbbq %r15, %r13
subq %r10, %rax
movq $0xffffffff, %r14
sbbq %r11, %rcx
movq $0xffffffff00000001, %r15
sbbq %r12, %r8
sbbq %r13, %r9
sbbq %rdx, %rdx
andq %rdx, %r14
andq %rdx, %r15
addq %rdx, %rax
sbbq $0x00, %rsi
movl %esi, %r14d
andq %rsi, %r15
addq %rsi, %rax
adcq %r14, %rcx
adcq $0x00, %r8
adcq %r15, %r9
adcq $0x00, %rdx
andq %rdx, %r14
andq %rdx, %r15
addq %rdx, %rax
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
addq %rsi, %rax
adcq %r14, %rcx
movq %rax, (%rdi)
adcq $0x00, %r8
@ -56863,73 +56855,40 @@ _sp_256_mont_sub_dbl_4:
adcq %r15, %r9
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
movq (%rdx), %r10
movq 8(%rdx), %r11
movq 16(%rdx), %r12
movq 24(%rdx), %r13
subq %rax, %r10
sbbq %rcx, %r11
movq $0xffffffff00000001, %r15
sbbq %r8, %r12
sbbq %r9, %r13
sbbq %rsi, %rsi
movl %esi, %r14d
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
adcq $0x00, %r12
adcq %r15, %r13
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
movq %r10, (%rdx)
adcq $0x00, %r12
movq %r11, 8(%rdx)
adcq %r15, %r13
movq %r12, 16(%rdx)
movq %r13, 24(%rdx)
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4
#endif /* __APPLE__ */
/* Two Montgomery numbers, subtract second from first and double.
* (r = 2.(a - b) % m).
*
* b must have come from a mont_sub operation.
*
* r Result of subtraction.
* a Number to subtract from in Montgomery form.
* b Number to subtract with in Montgomery form.
* m Modulus (prime).
*/
#ifndef __APPLE__
.text
.globl sp_256_mont_dbl_sub_4
.type sp_256_mont_dbl_sub_4,@function
.align 16
sp_256_mont_dbl_sub_4:
#else
.section __TEXT,__text
.globl _sp_256_mont_dbl_sub_4
.p2align 4
_sp_256_mont_dbl_sub_4:
#endif /* __APPLE__ */
movq (%rsi), %rax
movq 8(%rsi), %rcx
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
addq %rdx, %rax
adcq %r10, %rcx
adcq $0x00, %r8
adcq %r11, %r9
addq %rax, %rax
movq $0xffffffff, %r10
adcq %rcx, %rcx
movq $0xffffffff00000001, %r11
adcq %r8, %r8
adcq %r9, %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
subq %rdx, %rax
sbbq %r10, %rcx
movq %rax, (%rdi)
sbbq $0x00, %r8
movq %rcx, 8(%rdi)
sbbq %r11, %r9
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
repz retq
#ifndef __APPLE__
.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4
.size sp_256_mont_rsb_sub_dbl_4,.-sp_256_mont_rsb_sub_dbl_4
#endif /* __APPLE__ */
#ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible point that could be being copied.
@ -57085,11 +57044,12 @@ _sp_256_mont_mul_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@ -57104,7 +57064,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@ -57123,7 +57083,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
@ -57213,11 +57173,10 @@ _sp_256_mont_mul_avx2_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@ -57378,11 +57337,10 @@ _sp_256_mont_sqr_avx2_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@ -58352,11 +58310,12 @@ _sp_256_mont_mul_order_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@ -58371,7 +58330,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@ -58390,7 +58349,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
@ -60601,11 +60560,10 @@ _sp_384_mont_reduce_6:
# Subtract mod if carry
negq %r10
movq $0xfffffffffffffffe, %r9
movq %r10, %rcx
movl %r10d, %ecx
movq %r10, %r8
shrq $32, %rcx
shlq $32, %r8
andq %r10, %r9
shlq $32, %r8
subq %rcx, %rbx
sbbq %r8, %rbp
sbbq %r9, %r11
@ -60851,7 +60809,6 @@ _sp_384_mont_add_6:
movq 32(%rsi), %r10
movq 40(%rsi), %r11
addq (%rdx), %rax
movq $0xffffffff, %r12
adcq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13
adcq 16(%rdx), %r8
@ -60860,7 +60817,7 @@ _sp_384_mont_add_6:
adcq 32(%rdx), %r10
adcq 40(%rdx), %r11
sbbq %rsi, %rsi
andq %rsi, %r12
movl %esi, %r12d
andq %rsi, %r13
andq %rsi, %r14
subq %r12, %rax
@ -60920,16 +60877,16 @@ _sp_384_mont_dbl_6:
movq 32(%rsi), %r9
movq 40(%rsi), %r10
addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax
movq $0xffffffff00000000, %r12
adcq %rcx, %rcx
movq $0xfffffffffffffffe, %r13
adcq %r8, %r8
adcq %r9, %r9
movq %r10, %r14
adcq %r10, %r10
sbbq %r14, %r14
andq %r14, %r11
sarq $63, %r14
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@ -60989,7 +60946,6 @@ _sp_384_mont_tpl_6:
movq 32(%rsi), %r9
movq 40(%rsi), %r10
addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax
movq $0xffffffff00000000, %r12
adcq %rcx, %rcx
@ -60998,7 +60954,7 @@ _sp_384_mont_tpl_6:
adcq %r9, %r9
adcq %r10, %r10
sbbq %r14, %r14
andq %r14, %r11
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@ -61019,7 +60975,6 @@ _sp_384_mont_tpl_6:
sbbq %r14, %r9
sbbq %r14, %r10
addq (%rsi), %rdx
movq $0xffffffff, %r11
adcq 8(%rsi), %rax
movq $0xffffffff00000000, %r12
adcq 16(%rsi), %rcx
@ -61028,7 +60983,7 @@ _sp_384_mont_tpl_6:
adcq 32(%rsi), %r9
adcq 40(%rsi), %r10
sbbq %r14, %r14
andq %r14, %r11
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@ -61089,7 +61044,6 @@ _sp_384_mont_sub_6:
movq 32(%rsi), %r10
movq 40(%rsi), %r11
subq (%rdx), %rax
movq $0xffffffff, %r12
sbbq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13
sbbq 16(%rdx), %r8
@ -61098,7 +61052,7 @@ _sp_384_mont_sub_6:
sbbq 32(%rdx), %r10
sbbq 40(%rdx), %r11
sbbq %rsi, %rsi
andq %rsi, %r12
movl %esi, %r12d
andq %rsi, %r13
andq %rsi, %r14
addq %r12, %rax

View File
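
In the avx2 multipliers in the MASM file below (as in their GAS counterparts above), B[1] is now loaded once into r14 and reused as the mulx operand for the A[0]*B[1], A[1]*B[1] and A[2]*B[1] partial products, instead of being re-read from memory for each multiply. A rough C analogue of that hoisting (illustrative only; uses the GCC/Clang unsigned __int128 extension):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Compute the three A[i]*B[1] partial products with the b[1] load hoisted
 * into a local (the role the new r14 plays), rather than re-fetching b[1]
 * for every multiply. */
static void partial_products_b1(u128 p[3], const uint64_t a[4],
                                const uint64_t b[4])
{
    const uint64_t b1 = b[1];    /* single load, kept live across the multiplies */

    p[0] = (u128)a[0] * b1;      /* A[0] * B[1] */
    p[1] = (u128)a[1] * b1;      /* A[1] * B[1] */
    p[2] = (u128)a[2] * b1;      /* A[2] * B[1] */
}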

@ -54329,11 +54329,12 @@ sp_256_mul_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@ -54348,7 +54349,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@ -54367,7 +54368,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@ -54884,11 +54885,10 @@ sp_256_mont_mul_4 PROC
adc rbx, 0
sbb r11, 0
mov r10, 18446744069414584321
mov rax, r11
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r11d
and r10, r11
sub r15, r11
sbb rdi, rax
@ -55060,11 +55060,10 @@ sp_256_mont_sqr_4 PROC
adc rsi, 0
sbb r10, 0
mov r8, 18446744069414584321
mov rax, r10
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r10d
and r8, r10
sub r14, r10
sbb r15, rax
@ -55263,11 +55262,10 @@ sp_256_mont_reduce_4 PROC
adc rdi, 0
sbb r9, 0
mov rbx, 18446744069414584321
mov rax, r9
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r9d
and rbx, r9
sub r13, r9
sbb r14, rax
@ -55404,13 +55402,12 @@ sp_256_mont_add_4 PROC
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
add rax, QWORD PTR [r8]
mov r12, 4294967295
adc r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
adc r10, QWORD PTR [r8+16]
adc r11, QWORD PTR [r8+24]
sbb rdx, rdx
and r12, rdx
mov r12d, edx
and r13, rdx
sub rax, rdx
sbb r9, r12
@ -55447,13 +55444,13 @@ sp_256_mont_dbl_4 PROC
mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24]
add rax, rax
mov r11, 4294967295
adc r8, r8
mov r12, 18446744069414584321
adc r9, r9
mov r13, r10
adc r10, r10
sbb r13, r13
and r11, r13
sar r13, 63
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@ -55490,13 +55487,12 @@ sp_256_mont_tpl_4 PROC
mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24]
add rax, rax
mov r11, 4294967295
adc r8, r8
mov r12, 18446744069414584321
adc r9, r9
adc r10, r10
sbb r13, r13
and r11, r13
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@ -55510,13 +55506,12 @@ sp_256_mont_tpl_4 PROC
sbb r9, 0
sbb r10, r12
add rax, QWORD PTR [rdx]
mov r11, 4294967295
adc r8, QWORD PTR [rdx+8]
mov r12, 18446744069414584321
adc r9, QWORD PTR [rdx+16]
adc r10, QWORD PTR [rdx+24]
sbb r13, r13
and r11, r13
sbb r13, 0
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@ -55554,13 +55549,12 @@ sp_256_mont_sub_4 PROC
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24]
sbb rdx, rdx
and r12, rdx
mov r12d, edx
and r13, rdx
add rax, rdx
adc r9, r12
@ -55630,7 +55624,7 @@ _text ENDS
; * m Modulus (prime).
; */
_text SEGMENT READONLY PARA
sp_256_mont_sub_dbl_4 PROC
sp_256_mont_rsb_sub_dbl_4 PROC
push r12
push r13
push r14
@ -55646,42 +55640,40 @@ sp_256_mont_sub_dbl_4 PROC
mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24]
add r12, r12
mov rdi, 4294967295
adc r13, r13
mov rsi, 18446744069414584321
adc r14, r14
adc r15, r15
sbb r8, r8
and rdi, r8
and rsi, r8
sub r12, r8
sbb rdx, rdx
mov edi, edx
and rsi, rdx
sub r12, rdx
sbb r13, rdi
sbb r14, 0
sbb r15, rsi
adc r8, 0
and rdi, r8
and rsi, r8
sub r12, r8
adc rdx, 0
and rdi, rdx
and rsi, rdx
sub r12, rdx
sbb r13, rdi
sbb r14, 0
sbb r15, rsi
sub rax, r12
mov rdi, 4294967295
sbb r9, r13
mov rsi, 18446744069414584321
sbb r10, r14
sbb r11, r15
sbb r8, r8
and rdi, r8
and rsi, r8
add rax, r8
sbb rdx, 0
mov edi, edx
and rsi, rdx
add rax, rdx
adc r9, rdi
adc r10, 0
adc r11, rsi
adc r8, 0
and rdi, r8
and rsi, r8
add rax, r8
adc rdx, 0
and rdi, rdx
and rsi, rdx
add rax, rdx
adc r9, rdi
mov QWORD PTR [rcx], rax
adc r10, 0
@ -55689,6 +55681,33 @@ sp_256_mont_sub_dbl_4 PROC
adc r11, rsi
mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11
mov r12, QWORD PTR [r8]
mov r13, QWORD PTR [r8+8]
mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24]
sub r12, rax
sbb r13, r9
mov rsi, 18446744069414584321
sbb r14, r10
sbb r15, r11
sbb rdx, rdx
mov edi, edx
and rsi, rdx
add r12, rdx
adc r13, rdi
adc r14, 0
adc r15, rsi
adc rdx, 0
and rdi, rdx
and rsi, rdx
add r12, rdx
adc r13, rdi
mov QWORD PTR [r8], r12
adc r14, 0
mov QWORD PTR [r8+8], r13
adc r15, rsi
mov QWORD PTR [r8+16], r14
mov QWORD PTR [r8+24], r15
pop rsi
pop rdi
pop r15
@ -55696,60 +55715,7 @@ sp_256_mont_sub_dbl_4 PROC
pop r13
pop r12
ret
sp_256_mont_sub_dbl_4 ENDP
_text ENDS
; /* Two Montgomery numbers, subtract second from first and double.
; * (r = 2.(a - b) % m).
; *
; * b must have come from a mont_sub operation.
; *
; * r Result of subtraction.
; * a Number to subtract from in Montgomery form.
; * b Number to subtract with in Montgomery form.
; * m Modulus (prime).
; */
_text SEGMENT READONLY PARA
sp_256_mont_dbl_sub_4 PROC
push r12
push r13
mov rax, QWORD PTR [rdx]
mov r9, QWORD PTR [rdx+8]
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24]
sbb r8, r8
and r12, r8
and r13, r8
add rax, r8
adc r9, r12
adc r10, 0
adc r11, r13
add rax, rax
mov r12, 4294967295
adc r9, r9
mov r13, 18446744069414584321
adc r10, r10
adc r11, r11
sbb r8, r8
and r12, r8
and r13, r8
sub rax, r8
sbb r9, r12
mov QWORD PTR [rcx], rax
sbb r10, 0
mov QWORD PTR [rcx+8], r9
sbb r11, r13
mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11
pop r13
pop r12
ret
sp_256_mont_dbl_sub_4 ENDP
sp_256_mont_rsb_sub_dbl_4 ENDP
_text ENDS
IFNDEF WC_NO_CACHE_RESISTANT
; /* Touch each possible point that could be being copied.
@ -55908,11 +55874,12 @@ sp_256_mont_mul_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@ -55927,7 +55894,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@ -55946,7 +55913,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@ -56036,11 +56003,10 @@ sp_256_mont_mul_avx2_4 PROC
adc r15, 0
sbb r8, 0
mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8
sub r12, r8
sbb r13, rdi
@ -56195,11 +56161,10 @@ sp_256_mont_sqr_avx2_4 PROC
adc r15, 0
sbb r8, 0
mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8
sub r12, r8
sbb r13, rdi
@ -57053,11 +57018,12 @@ sp_256_mont_mul_order_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@ -57072,7 +57038,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@ -57091,7 +57057,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@ -59213,11 +59179,10 @@ sp_384_mont_reduce_6 PROC
; Subtract mod if carry
neg r11
mov r10, 18446744073709551614
mov r8, r11
mov r8d, r11d
mov r9, r11
shr r8, 32
shl r9, 32
and r10, r11
shl r9, 32
sub rbx, r8
sbb rbp, r9
sbb r12, r10
@ -59436,7 +59401,6 @@ sp_384_mont_add_6 PROC
mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40]
add rax, QWORD PTR [r8]
mov r14, 4294967295
adc r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320
adc r10, QWORD PTR [r8+16]
@ -59445,7 +59409,7 @@ sp_384_mont_add_6 PROC
adc r12, QWORD PTR [r8+32]
adc r13, QWORD PTR [r8+40]
sbb rdx, rdx
and r14, rdx
mov r14d, edx
and r15, rdx
and rdi, rdx
sub rax, r14
@ -59498,16 +59462,16 @@ sp_384_mont_dbl_6 PROC
mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40]
add rax, rax
mov r13, 4294967295
adc r8, r8
mov r14, 18446744069414584320
adc r9, r9
mov r15, 18446744073709551614
adc r10, r10
adc r11, r11
mov rdi, r12
adc r12, r12
sbb rdi, rdi
and r13, rdi
sar rdi, 63
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@ -59560,7 +59524,6 @@ sp_384_mont_tpl_6 PROC
mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40]
add rax, rax
mov r13, 4294967295
adc r8, r8
mov r14, 18446744069414584320
adc r9, r9
@ -59569,7 +59532,7 @@ sp_384_mont_tpl_6 PROC
adc r11, r11
adc r12, r12
sbb rdi, rdi
and r13, rdi
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@ -59590,7 +59553,6 @@ sp_384_mont_tpl_6 PROC
sbb r11, rdi
sbb r12, rdi
add rax, QWORD PTR [rdx]
mov r13, 4294967295
adc r8, QWORD PTR [rdx+8]
mov r14, 18446744069414584320
adc r9, QWORD PTR [rdx+16]
@ -59599,7 +59561,7 @@ sp_384_mont_tpl_6 PROC
adc r11, QWORD PTR [rdx+32]
adc r12, QWORD PTR [rdx+40]
sbb rdi, rdi
and r13, rdi
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@ -59653,7 +59615,6 @@ sp_384_mont_sub_6 PROC
mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40]
sub rax, QWORD PTR [r8]
mov r14, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320
sbb r10, QWORD PTR [r8+16]
@ -59662,7 +59623,7 @@ sp_384_mont_sub_6 PROC
sbb r12, QWORD PTR [r8+32]
sbb r13, QWORD PTR [r8+40]
sbb rdx, rdx
and r14, rdx
mov r14d, edx
and r15, rdx
and rdi, rdx
add rax, r14