Merge pull request #6776 from SparkiDev/sp_ecc_x64
SP ECC: x64 minor speed improvement
@@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t"
"sbcs r11, r11, r12\n\t"
"rsb r12, r12, #0\n\t"
"sbc r12, r12, #0\n\t"
"sbc r2, r2, r2\n\t"
"sub r12, r12, r2\n\t"
"subs r4, r4, r12\n\t"
"sbcs r5, r5, r12\n\t"
"sbcs r6, r6, r12\n\t"
"sbcs r7, r7, #0\n\t"
"sbcs r8, r8, #0\n\t"
"sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t"
"sbc r11, r11, r12\n\t"
"ldm %[a]!, {r2, r3}\n\t"
"adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t"
@@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t"
"SBCS r11, r11, r12\n\t"
"RSB r12, r12, #0x0\n\t"
"SBC r12, r12, #0x0\n\t"
"SBC r2, r2, r2\n\t"
"SUB r12, r12, r2\n\t"
"SUBS r4, r4, r12\n\t"
"SBCS r5, r5, r12\n\t"
"SBCS r6, r6, r12\n\t"
"SBCS r7, r7, #0x0\n\t"
"SBCS r8, r8, #0x0\n\t"
"SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t"
"SBC r11, r11, r12\n\t"
"LDM %[a]!, {r2, r3}\n\t"
"ADDS r4, r4, r2\n\t"
"ADCS r5, r5, r3\n\t"
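Both hunks make the same change to sp_256_mont_tpl_8, one per ARM instruction-set flavour: the "sbc r2, r2, r2" / "sub r12, r12, r2" pair folds the borrow into the correction word so the final reduction stays branchless. A minimal C sketch of the underlying idea, with my own names (not wolfSSL's):

    #include <stdint.h>

    /* Build a 0/all-ones word from a borrow (as "sbc r2, r2, r2" does)
     * and use it to select the subtrahend without a data-dependent
     * branch. */
    static uint32_t csub_word(uint32_t a, uint32_t m, uint32_t borrow)
    {
        uint32_t mask = 0U - borrow; /* borrow in {0,1} -> 0 or 0xffffffff */
        return a - (m & mask);       /* subtract m only when borrow == 1 */
    }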
@@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
#ifdef __cplusplus
extern "C" {
#endif
extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
#ifdef __cplusplus
}
#endif
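The non-const `b` in sp_256_mont_rsb_sub_dbl_4 is the point of the change: the old sp_256_mont_sub_dbl_4 computed r = (a - 2*b) mod m, and every caller followed it with a subtraction of the result from b; the fused routine also writes that reverse subtraction back through `b`. A rough single-word reference model (illustrative only; the real functions operate on 4-digit arrays):

    #include <stdint.h>

    typedef uint64_t fe; /* stand-in for a 4-digit field element */

    /* (a - b) mod m, assuming 0 <= a, b < m */
    static fe mod_sub(fe a, fe b, fe m)
    {
        return (a >= b) ? (a - b) : (a + (m - b));
    }

    /* Reference model of the fused routine:
     *   r = (a - 2*b) mod m, then b = (b - r) mod m. */
    static void mont_rsb_sub_dbl(fe *r, fe a, fe *b, fe m)
    {
        fe t = mod_sub(mod_sub(a, *b, m), *b, m); /* (a - 2b) mod m */
        *b = mod_sub(*b, t, m);                   /* reverse subtract */
        *r = t;
    }

Subtracting 2*b in one pass can underflow by up to 2*m, which is why the assembly later in this diff applies its masked correction twice.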
@@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */
sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y - X */
sp_256_mont_sub_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y * T1 */
sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */
@@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
break;
case 14:
/* X = X - 2*Y */
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y = Y - X */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15;
break;
case 15:
ctx->state = 16;
break;
case 16:
/* Y = Y - X */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17;
break;
case 17:
@@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#ifdef __cplusplus
extern "C" {
#endif
extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
#ifdef __cplusplus
}
#endif
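sp_256_mont_dbl_sub_4, declared above, computes 2*(a - b) mod m (see the comment block further down in this diff). A self-contained single-word sketch under the same caveats as the previous one:

    #include <stdint.h>

    typedef uint64_t fe; /* stand-in for a 4-digit field element */

    /* Reference model of sp_256_mont_dbl_sub_4: r = 2*(a - b) mod m,
     * assuming 0 <= a, b < m and m < 2^63 so the double cannot
     * overflow the word. */
    static fe mont_dbl_sub(fe a, fe b, fe m)
    {
        fe t = (a >= b) ? (a - b) : (a + (m - b)); /* (a - b) mod m */
        fe d = t + t;                              /* in [0, 2m)    */
        return (d >= m) ? (d - m) : d;             /* one correction */
    }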
/* Double the Montgomery form projective point p a number of times.
*
* r Result of repeated doubling of point.
@@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r,
sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(x, x, t5, p256_mod);
sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, y, t5, p256_mod);
{
@@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
ctx->state = 20;
break;
case 20:
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21;
break;
case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22;
break;
case 22:
@@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
x = r[j].x;
/* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z;
@@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
#ifdef __cplusplus
}
#endif
#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4
#define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4
/* Double the Montgomery form projective point p.
*
* r Result of doubling point.
@@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */
sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y - X */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y * T1 */
sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */
@@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
break;
case 14:
/* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y = Y - X */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15;
break;
case 15:
ctx->state = 16;
break;
case 16:
/* Y = Y - X */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17;
break;
case 17:
@@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
/* Double the Montgomery form projective point p a number of times.
*
* r Result of repeated doubling of point.
@@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */
@@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
{
@@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
ctx->state = 20;
break;
case 20:
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21;
break;
case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22;
break;
case 22:
@@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
x = r[j].x;
/* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */
sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z;
@@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod);
sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_4(t3, t3, x, p256_mod);
sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, t3, t1, p256_mod);
@@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod);
sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod);
sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod);
@@ -55378,11 +55378,12 @@ _sp_256_mul_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@@ -55397,7 +55398,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@@ -55416,7 +55417,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
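The pattern in these hunks: B[1] is the only operand read from memory three times (once per "mulxq 8(%rbp)"), so the new code loads it once into %r14 and multiplies from the register. The same hoist expressed in C (hypothetical helper; the real code chains carries with adcx/adox, which is omitted here):

    #include <stdint.h>

    /* Hoist the thrice-used b[1] into a local, as the assembly now
     * does with "movq 8(%rbp), %r14". Carry propagation omitted. */
    static void mul_b1_column(uint64_t acc[4], const uint64_t a[3],
                              const uint64_t b[2])
    {
        uint64_t b1 = b[1];      /* one load instead of three */
        acc[1] += a[0] * b1;     /* was: mulxq 8(%rbp), ...   */
        acc[2] += a[1] * b1;
        acc[3] += a[2] * b1;
    }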
@@ -55981,11 +55982,10 @@ _sp_256_mont_mul_4:
adcq $0x00, %rbx
sbbq $0x00, %r9
movq $0xffffffff00000001, %rsi
movq %r9, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r9d, %eax
andq %r9, %rsi
subq %r9, %r13
sbbq %rax, %r14
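In these reduction tails the borrow word (%r9 here, %r8 below) is known to be 0 or all-ones, and m[1] of P-256 is 0x00000000ffffffff, so m[1] & mask can be read off either half of the mask. The new code takes the low half with one zero-extending movl instead of a copy plus shrq $32. A C sketch of the equivalence (my naming):

    #include <stdint.h>

    /* mask is 0 or all-ones (borrow of the final reduction). */
    static uint64_t m1_from_mask(uint64_t mask)
    {
        /* old: movq %r9, %rax; shrq $32, %rax  ->  mask >> 32     */
        /* new: movl %r9d, %eax                 ->  (uint32_t)mask */
        return (uint32_t)mask; /* equals mask >> 32 for 0 or ~0 */
    }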
@@ -56163,11 +56163,10 @@ _sp_256_mont_sqr_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@@ -56388,11 +56387,10 @@ _sp_256_mont_reduce_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rbx
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rbx
subq %r8, %r12
sbbq %rax, %r13
@@ -56543,13 +56541,12 @@ _sp_256_mont_add_4:
movq 16(%rsi), %r8
movq 24(%rsi), %r9
addq (%rdx), %rax
movq $0xffffffff, %r10
adcq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
adcq 16(%rdx), %r8
adcq 24(%rdx), %r9
sbbq %rsi, %rsi
andq %rsi, %r10
movl %esi, %r10d
andq %rsi, %r11
subq %rsi, %rax
sbbq %r10, %rcx
@@ -56593,13 +56590,13 @@ _sp_256_mont_dbl_4:
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax
movq $0xffffffff00000001, %r10
adcq %rcx, %rcx
movq %r8, %r11
adcq %r8, %r8
sbbq %r11, %r11
andq %r11, %r9
sarq $63, %r11
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@@ -56643,13 +56640,12 @@ _sp_256_mont_tpl_4:
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax
movq $0xffffffff00000001, %r10
adcq %rcx, %rcx
adcq %r8, %r8
sbbq %r11, %r11
andq %r11, %r9
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@@ -56663,13 +56659,12 @@ _sp_256_mont_tpl_4:
sbbq $0x00, %rcx
sbbq %r10, %r8
addq (%rsi), %rdx
movq $0xffffffff, %r9
adcq 8(%rsi), %rax
movq $0xffffffff00000001, %r10
adcq 16(%rsi), %rcx
adcq 24(%rsi), %r8
sbbq %r11, %r11
andq %r11, %r9
sbbq $0x00, %r11
movl %r11d, %r9d
andq %r11, %r10
subq %r11, %rdx
sbbq %r9, %rax
@@ -56714,13 +56709,12 @@ _sp_256_mont_sub_4:
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9
sbbq %rsi, %rsi
andq %rsi, %r10
movl %esi, %r10d
andq %rsi, %r11
addq %rsi, %rax
adcq %r10, %rcx
@@ -56797,15 +56791,15 @@ _sp_256_div2_4:
*/
#ifndef __APPLE__
.text
.globl sp_256_mont_sub_dbl_4
.type sp_256_mont_sub_dbl_4,@function
.globl sp_256_mont_rsb_sub_dbl_4
.type sp_256_mont_rsb_sub_dbl_4,@function
.align 16
sp_256_mont_sub_dbl_4:
sp_256_mont_rsb_sub_dbl_4:
#else
.section __TEXT,__text
.globl _sp_256_mont_sub_dbl_4
.globl _sp_256_mont_rsb_sub_dbl_4
.p2align 4
_sp_256_mont_sub_dbl_4:
_sp_256_mont_rsb_sub_dbl_4:
#endif /* __APPLE__ */
pushq %r12
pushq %r13
@@ -56820,42 +56814,40 @@ _sp_256_mont_sub_dbl_4:
movq 16(%rdx), %r12
movq 24(%rdx), %r13
addq %r10, %r10
movq $0xffffffff, %r14
adcq %r11, %r11
movq $0xffffffff00000001, %r15
adcq %r12, %r12
adcq %r13, %r13
sbbq %rdx, %rdx
andq %rdx, %r14
andq %rdx, %r15
subq %rdx, %r10
sbbq %rsi, %rsi
movl %esi, %r14d
andq %rsi, %r15
subq %rsi, %r10
sbbq %r14, %r11
sbbq $0x00, %r12
sbbq %r15, %r13
adcq $0x00, %rdx
andq %rdx, %r14
andq %rdx, %r15
subq %rdx, %r10
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
subq %rsi, %r10
sbbq %r14, %r11
sbbq $0x00, %r12
sbbq %r15, %r13
subq %r10, %rax
movq $0xffffffff, %r14
sbbq %r11, %rcx
movq $0xffffffff00000001, %r15
sbbq %r12, %r8
sbbq %r13, %r9
sbbq %rdx, %rdx
andq %rdx, %r14
andq %rdx, %r15
addq %rdx, %rax
sbbq $0x00, %rsi
movl %esi, %r14d
andq %rsi, %r15
addq %rsi, %rax
adcq %r14, %rcx
adcq $0x00, %r8
adcq %r15, %r9
adcq $0x00, %rdx
andq %rdx, %r14
andq %rdx, %r15
addq %rdx, %rax
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
addq %rsi, %rax
adcq %r14, %rcx
movq %rax, (%rdi)
adcq $0x00, %r8
@@ -56863,73 +56855,40 @@ _sp_256_mont_sub_dbl_4:
adcq %r15, %r9
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
movq (%rdx), %r10
movq 8(%rdx), %r11
movq 16(%rdx), %r12
movq 24(%rdx), %r13
subq %rax, %r10
sbbq %rcx, %r11
movq $0xffffffff00000001, %r15
sbbq %r8, %r12
sbbq %r9, %r13
sbbq %rsi, %rsi
movl %esi, %r14d
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
adcq $0x00, %r12
adcq %r15, %r13
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
movq %r10, (%rdx)
adcq $0x00, %r12
movq %r11, 8(%rdx)
adcq %r15, %r13
movq %r12, 16(%rdx)
movq %r13, 24(%rdx)
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4
#endif /* __APPLE__ */
/* Two Montgomery numbers, subtract second from first and double.
* (r = 2.(a - b) % m).
*
* b must have come from a mont_sub operation.
*
* r Result of subtraction.
* a Number to subtract from in Montgomery form.
* b Number to subtract with in Montgomery form.
* m Modulus (prime).
*/
#ifndef __APPLE__
.text
.globl sp_256_mont_dbl_sub_4
.type sp_256_mont_dbl_sub_4,@function
.align 16
sp_256_mont_dbl_sub_4:
#else
.section __TEXT,__text
.globl _sp_256_mont_dbl_sub_4
.p2align 4
_sp_256_mont_dbl_sub_4:
#endif /* __APPLE__ */
movq (%rsi), %rax
movq 8(%rsi), %rcx
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
addq %rdx, %rax
adcq %r10, %rcx
adcq $0x00, %r8
adcq %r11, %r9
addq %rax, %rax
movq $0xffffffff, %r10
adcq %rcx, %rcx
movq $0xffffffff00000001, %r11
adcq %r8, %r8
adcq %r9, %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
subq %rdx, %rax
sbbq %r10, %rcx
movq %rax, (%rdi)
sbbq $0x00, %r8
movq %rcx, 8(%rdi)
sbbq %r11, %r9
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
repz retq
#ifndef __APPLE__
.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4
.size sp_256_mont_rsb_sub_dbl_4,.-sp_256_mont_rsb_sub_dbl_4
#endif /* __APPLE__ */
#ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible point that could be being copied.
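The WC_NO_CACHE_RESISTANT section that follows reads every candidate table entry so the selected index does not leak through cache timing. A generic C sketch of that access pattern (hypothetical names; the real code copies sp_point_256 structures rather than single words):

    #include <stdint.h>
    #include <stddef.h>

    /* Constant-time table select: read all n entries and keep entry
     * "idx" via an all-ones/all-zeros mask, so the memory access
     * pattern is independent of idx. */
    static uint64_t ct_select(const uint64_t *table, size_t n, size_t idx)
    {
        uint64_t r = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
            r |= table[i] & mask;
        }
        return r;
    }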
@@ -57085,11 +57044,12 @@ _sp_256_mont_mul_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@@ -57104,7 +57064,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@@ -57123,7 +57083,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
@@ -57213,11 +57173,10 @@ _sp_256_mont_mul_avx2_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@@ -57378,11 +57337,10 @@ _sp_256_mont_sqr_avx2_4:
adcq $0x00, %r15
sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow
# m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi
subq %r8, %r12
sbbq %rax, %r13
@@ -58352,11 +58310,12 @@ _sp_256_mont_mul_order_avx2_4:
pushq %rbx
movq %rdx, %rbp
movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0]
mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx
# A[0] * B[1]
mulxq 8(%rbp), %rax, %r10
mulxq %r14, %rax, %r10
adcxq %rax, %r9
# A[0] * B[2]
mulxq 16(%rbp), %rax, %r11
@@ -58371,7 +58330,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r9
# A[1] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r10
adcxq %rax, %r10
# A[1] * B[2]
@@ -58390,7 +58349,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx
adcxq %rax, %r10
# A[2] * B[1]
mulxq 8(%rbp), %rax, %r15
mulxq %r14, %rax, %r15
adoxq %rcx, %r11
adcxq %rax, %r11
# A[2] * B[2]
@@ -60601,11 +60560,10 @@ _sp_384_mont_reduce_6:
# Subtract mod if carry
negq %r10
movq $0xfffffffffffffffe, %r9
movq %r10, %rcx
movl %r10d, %ecx
movq %r10, %r8
shrq $32, %rcx
shlq $32, %r8
andq %r10, %r9
shlq $32, %r8
subq %rcx, %rbx
sbbq %r8, %rbp
sbbq %r9, %r11
@@ -60851,7 +60809,6 @@ _sp_384_mont_add_6:
movq 32(%rsi), %r10
movq 40(%rsi), %r11
addq (%rdx), %rax
movq $0xffffffff, %r12
adcq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13
adcq 16(%rdx), %r8
@@ -60860,7 +60817,7 @@ _sp_384_mont_add_6:
adcq 32(%rdx), %r10
adcq 40(%rdx), %r11
sbbq %rsi, %rsi
andq %rsi, %r12
movl %esi, %r12d
andq %rsi, %r13
andq %rsi, %r14
subq %r12, %rax
@@ -60920,16 +60877,16 @@ _sp_384_mont_dbl_6:
movq 32(%rsi), %r9
movq 40(%rsi), %r10
addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax
movq $0xffffffff00000000, %r12
adcq %rcx, %rcx
movq $0xfffffffffffffffe, %r13
adcq %r8, %r8
adcq %r9, %r9
movq %r10, %r14
adcq %r10, %r10
sbbq %r14, %r14
andq %r14, %r11
sarq $63, %r14
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@@ -60989,7 +60946,6 @@ _sp_384_mont_tpl_6:
movq 32(%rsi), %r9
movq 40(%rsi), %r10
addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax
movq $0xffffffff00000000, %r12
adcq %rcx, %rcx
@@ -60998,7 +60954,7 @@ _sp_384_mont_tpl_6:
adcq %r9, %r9
adcq %r10, %r10
sbbq %r14, %r14
andq %r14, %r11
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@@ -61019,7 +60975,6 @@ _sp_384_mont_tpl_6:
sbbq %r14, %r9
sbbq %r14, %r10
addq (%rsi), %rdx
movq $0xffffffff, %r11
adcq 8(%rsi), %rax
movq $0xffffffff00000000, %r12
adcq 16(%rsi), %rcx
@@ -61028,7 +60983,7 @@ _sp_384_mont_tpl_6:
adcq 32(%rsi), %r9
adcq 40(%rsi), %r10
sbbq %r14, %r14
andq %r14, %r11
movl %r14d, %r11d
andq %r14, %r12
andq %r14, %r13
subq %r11, %rdx
@@ -61089,7 +61044,6 @@ _sp_384_mont_sub_6:
movq 32(%rsi), %r10
movq 40(%rsi), %r11
subq (%rdx), %rax
movq $0xffffffff, %r12
sbbq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13
sbbq 16(%rdx), %r8
@@ -61098,7 +61052,7 @@ _sp_384_mont_sub_6:
sbbq 32(%rdx), %r10
sbbq 40(%rdx), %r11
sbbq %rsi, %rsi
andq %rsi, %r12
movl %esi, %r12d
andq %rsi, %r13
andq %rsi, %r14
addq %r12, %rax
@@ -54329,11 +54329,12 @@ sp_256_mul_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@@ -54348,7 +54349,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@@ -54367,7 +54368,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@@ -54884,11 +54885,10 @@ sp_256_mont_mul_4 PROC
adc rbx, 0
sbb r11, 0
mov r10, 18446744069414584321
mov rax, r11
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r11d
and r10, r11
sub r15, r11
sbb rdi, rax
@@ -55060,11 +55060,10 @@ sp_256_mont_sqr_4 PROC
adc rsi, 0
sbb r10, 0
mov r8, 18446744069414584321
mov rax, r10
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r10d
and r8, r10
sub r14, r10
sbb r15, rax
@@ -55263,11 +55262,10 @@ sp_256_mont_reduce_4 PROC
adc rdi, 0
sbb r9, 0
mov rbx, 18446744069414584321
mov rax, r9
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0
mov eax, r9d
and rbx, r9
sub r13, r9
sbb r14, rax
@@ -55404,13 +55402,12 @@ sp_256_mont_add_4 PROC
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
add rax, QWORD PTR [r8]
mov r12, 4294967295
adc r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
adc r10, QWORD PTR [r8+16]
adc r11, QWORD PTR [r8+24]
sbb rdx, rdx
and r12, rdx
mov r12d, edx
and r13, rdx
sub rax, rdx
sbb r9, r12
@@ -55447,13 +55444,13 @@ sp_256_mont_dbl_4 PROC
mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24]
add rax, rax
mov r11, 4294967295
adc r8, r8
mov r12, 18446744069414584321
adc r9, r9
mov r13, r10
adc r10, r10
sbb r13, r13
and r11, r13
sar r13, 63
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@@ -55490,13 +55487,12 @@ sp_256_mont_tpl_4 PROC
mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24]
add rax, rax
mov r11, 4294967295
adc r8, r8
mov r12, 18446744069414584321
adc r9, r9
adc r10, r10
sbb r13, r13
and r11, r13
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@@ -55510,13 +55506,12 @@ sp_256_mont_tpl_4 PROC
sbb r9, 0
sbb r10, r12
add rax, QWORD PTR [rdx]
mov r11, 4294967295
adc r8, QWORD PTR [rdx+8]
mov r12, 18446744069414584321
adc r9, QWORD PTR [rdx+16]
adc r10, QWORD PTR [rdx+24]
sbb r13, r13
and r11, r13
sbb r13, 0
mov r11d, r13d
and r12, r13
sub rax, r13
sbb r8, r11
@@ -55554,13 +55549,12 @@ sp_256_mont_sub_4 PROC
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24]
sbb rdx, rdx
and r12, rdx
mov r12d, edx
and r13, rdx
add rax, rdx
adc r9, r12
@@ -55630,7 +55624,7 @@ _text ENDS
; * m Modulus (prime).
; */
_text SEGMENT READONLY PARA
sp_256_mont_sub_dbl_4 PROC
sp_256_mont_rsb_sub_dbl_4 PROC
push r12
push r13
push r14
@@ -55646,42 +55640,40 @@ sp_256_mont_sub_dbl_4 PROC
mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24]
add r12, r12
mov rdi, 4294967295
adc r13, r13
mov rsi, 18446744069414584321
adc r14, r14
adc r15, r15
sbb r8, r8
and rdi, r8
and rsi, r8
sub r12, r8
sbb rdx, rdx
mov edi, edx
and rsi, rdx
sub r12, rdx
sbb r13, rdi
sbb r14, 0
sbb r15, rsi
adc r8, 0
and rdi, r8
and rsi, r8
sub r12, r8
adc rdx, 0
and rdi, rdx
and rsi, rdx
sub r12, rdx
sbb r13, rdi
sbb r14, 0
sbb r15, rsi
sub rax, r12
mov rdi, 4294967295
sbb r9, r13
mov rsi, 18446744069414584321
sbb r10, r14
sbb r11, r15
sbb r8, r8
and rdi, r8
and rsi, r8
add rax, r8
sbb rdx, 0
mov edi, edx
and rsi, rdx
add rax, rdx
adc r9, rdi
adc r10, 0
adc r11, rsi
adc r8, 0
and rdi, r8
and rsi, r8
add rax, r8
adc rdx, 0
and rdi, rdx
and rsi, rdx
add rax, rdx
adc r9, rdi
mov QWORD PTR [rcx], rax
adc r10, 0
@@ -55689,6 +55681,33 @@ sp_256_mont_sub_dbl_4 PROC
adc r11, rsi
mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11
mov r12, QWORD PTR [r8]
mov r13, QWORD PTR [r8+8]
mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24]
sub r12, rax
sbb r13, r9
mov rsi, 18446744069414584321
sbb r14, r10
sbb r15, r11
sbb rdx, rdx
mov edi, edx
and rsi, rdx
add r12, rdx
adc r13, rdi
adc r14, 0
adc r15, rsi
adc rdx, 0
and rdi, rdx
and rsi, rdx
add r12, rdx
adc r13, rdi
mov QWORD PTR [r8], r12
adc r14, 0
mov QWORD PTR [r8+8], r13
adc r15, rsi
mov QWORD PTR [r8+16], r14
mov QWORD PTR [r8+24], r15
pop rsi
pop rdi
pop r15
@@ -55696,60 +55715,7 @@ sp_256_mont_sub_dbl_4 PROC
pop r13
pop r12
ret
sp_256_mont_sub_dbl_4 ENDP
_text ENDS
; /* Two Montgomery numbers, subtract second from first and double.
; * (r = 2.(a - b) % m).
; *
; * b must have come from a mont_sub operation.
; *
; * r Result of subtraction.
; * a Number to subtract from in Montgomery form.
; * b Number to subtract with in Montgomery form.
; * m Modulus (prime).
; */
_text SEGMENT READONLY PARA
sp_256_mont_dbl_sub_4 PROC
push r12
push r13
mov rax, QWORD PTR [rdx]
mov r9, QWORD PTR [rdx+8]
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24]
sbb r8, r8
and r12, r8
and r13, r8
add rax, r8
adc r9, r12
adc r10, 0
adc r11, r13
add rax, rax
mov r12, 4294967295
adc r9, r9
mov r13, 18446744069414584321
adc r10, r10
adc r11, r11
sbb r8, r8
and r12, r8
and r13, r8
sub rax, r8
sbb r9, r12
mov QWORD PTR [rcx], rax
sbb r10, 0
mov QWORD PTR [rcx+8], r9
sbb r11, r13
mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11
pop r13
pop r12
ret
sp_256_mont_dbl_sub_4 ENDP
sp_256_mont_rsb_sub_dbl_4 ENDP
_text ENDS
IFNDEF WC_NO_CACHE_RESISTANT
; /* Touch each possible point that could be being copied.
@@ -55908,11 +55874,12 @@ sp_256_mont_mul_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@@ -55927,7 +55894,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@@ -55946,7 +55913,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@@ -56036,11 +56003,10 @@ sp_256_mont_mul_avx2_4 PROC
adc r15, 0
sbb r8, 0
mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8
sub r12, r8
sbb r13, rdi
@@ -56195,11 +56161,10 @@ sp_256_mont_sqr_avx2_4 PROC
adc r15, 0
sbb r8, 0
mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow
; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8
sub r12, r8
sbb r13, rdi
@@ -57053,11 +57018,12 @@ sp_256_mont_mul_order_avx2_4 PROC
mov rbp, r8
mov rax, rdx
mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx
; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8]
mulx r10, rdi, r14
adcx r9, rdi
; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16]
@@ -57072,7 +57038,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx
adcx r9, rdi
; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r10, rsi
adcx r10, rdi
; A[1] * B[2]
@@ -57091,7 +57057,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx
adcx r10, rdi
; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8]
mulx r15, rdi, r14
adox r11, rsi
adcx r11, rdi
; A[2] * B[2]
@@ -59213,11 +59179,10 @@ sp_384_mont_reduce_6 PROC
; Subtract mod if carry
neg r11
mov r10, 18446744073709551614
mov r8, r11
mov r8d, r11d
mov r9, r11
shr r8, 32
shl r9, 32
and r10, r11
shl r9, 32
sub rbx, r8
sbb rbp, r9
sbb r12, r10
@@ -59436,7 +59401,6 @@ sp_384_mont_add_6 PROC
mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40]
add rax, QWORD PTR [r8]
mov r14, 4294967295
adc r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320
adc r10, QWORD PTR [r8+16]
@@ -59445,7 +59409,7 @@ sp_384_mont_add_6 PROC
adc r12, QWORD PTR [r8+32]
adc r13, QWORD PTR [r8+40]
sbb rdx, rdx
and r14, rdx
mov r14d, edx
and r15, rdx
and rdi, rdx
sub rax, r14
@@ -59498,16 +59462,16 @@ sp_384_mont_dbl_6 PROC
mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40]
add rax, rax
mov r13, 4294967295
adc r8, r8
mov r14, 18446744069414584320
adc r9, r9
mov r15, 18446744073709551614
adc r10, r10
adc r11, r11
mov rdi, r12
adc r12, r12
sbb rdi, rdi
and r13, rdi
sar rdi, 63
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@@ -59560,7 +59524,6 @@ sp_384_mont_tpl_6 PROC
mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40]
add rax, rax
mov r13, 4294967295
adc r8, r8
mov r14, 18446744069414584320
adc r9, r9
@@ -59569,7 +59532,7 @@ sp_384_mont_tpl_6 PROC
adc r11, r11
adc r12, r12
sbb rdi, rdi
and r13, rdi
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@@ -59590,7 +59553,6 @@ sp_384_mont_tpl_6 PROC
sbb r11, rdi
sbb r12, rdi
add rax, QWORD PTR [rdx]
mov r13, 4294967295
adc r8, QWORD PTR [rdx+8]
mov r14, 18446744069414584320
adc r9, QWORD PTR [rdx+16]
@@ -59599,7 +59561,7 @@ sp_384_mont_tpl_6 PROC
adc r11, QWORD PTR [rdx+32]
adc r12, QWORD PTR [rdx+40]
sbb rdi, rdi
and r13, rdi
mov r13d, edi
and r14, rdi
and r15, rdi
sub rax, r13
@@ -59653,7 +59615,6 @@ sp_384_mont_sub_6 PROC
mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40]
sub rax, QWORD PTR [r8]
mov r14, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320
sbb r10, QWORD PTR [r8+16]
@@ -59662,7 +59623,7 @@ sp_384_mont_sub_6 PROC
sbb r12, QWORD PTR [r8+32]
sbb r13, QWORD PTR [r8+40]
sbb rdx, rdx
and r14, rdx
mov r14d, edx
and r15, rdx
and rdi, rdx
add rax, r14