SP ECC: x64 minor speed improvement

ARM32/Thumb2: for safer code, do two reductions in mont triple after the doubling part.
Sean Parkinson
2023-09-15 09:23:43 +10:00
parent 7d85e390a9
commit e9f1489997
5 changed files with 210 additions and 297 deletions

View File

@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"sbcs r9, r9, #0\n\t" "sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t" "sbcs r10, r10, r12, LSR #31\n\t"
"sbcs r11, r11, r12\n\t" "sbcs r11, r11, r12\n\t"
"rsb r12, r12, #0\n\t" "sbc r2, r2, r2\n\t"
"sbc r12, r12, #0\n\t" "sub r12, r12, r2\n\t"
"subs r4, r4, r12\n\t"
"sbcs r5, r5, r12\n\t"
"sbcs r6, r6, r12\n\t"
"sbcs r7, r7, #0\n\t"
"sbcs r8, r8, #0\n\t"
"sbcs r9, r9, #0\n\t"
"sbcs r10, r10, r12, LSR #31\n\t"
"sbc r11, r11, r12\n\t"
"ldm %[a]!, {r2, r3}\n\t" "ldm %[a]!, {r2, r3}\n\t"
"adds r4, r4, r2\n\t" "adds r4, r4, r2\n\t"
"adcs r5, r5, r3\n\t" "adcs r5, r5, r3\n\t"

View File

@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
"SBCS r9, r9, #0x0\n\t" "SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t" "SBCS r10, r10, r12, LSR #31\n\t"
"SBCS r11, r11, r12\n\t" "SBCS r11, r11, r12\n\t"
"RSB r12, r12, #0x0\n\t" "SBC r2, r2, r2\n\t"
"SBC r12, r12, #0x0\n\t" "SUB r12, r12, r2\n\t"
"SUBS r4, r4, r12\n\t"
"SBCS r5, r5, r12\n\t"
"SBCS r6, r6, r12\n\t"
"SBCS r7, r7, #0x0\n\t"
"SBCS r8, r8, #0x0\n\t"
"SBCS r9, r9, #0x0\n\t"
"SBCS r10, r10, r12, LSR #31\n\t"
"SBC r11, r11, r12\n\t"
"LDM %[a]!, {r2, r3}\n\t" "LDM %[a]!, {r2, r3}\n\t"
"ADDS r4, r4, r2\n\t" "ADDS r4, r4, r2\n\t"
"ADCS r5, r5, r3\n\t" "ADCS r5, r5, r3\n\t"

View File

@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */ /* X = T1 * T1 */
sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */ /* X = X - 2*Y */
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y - X */ /* Y = Y - X */
sp_256_mont_sub_4(y, y, x, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
/* Y = Y * T1 */ /* Y = Y * T1 */
sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod); sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */ /* Y = Y - T2 */
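A note on the renamed helper used above: sp_256_mont_rsb_sub_dbl_4 folds the old "X = X - 2*Y" and "Y = Y - X" steps into one call, which is why its b parameter is no longer const (the second result is written back through it). The sketch below shows an equivalent sequence built from the separate primitives already used in this file, assuming the 4x64-bit limb layout; rsb_sub_dbl_sketch and the temporary t are illustrative names, and the real routine is a single assembly function rather than these calls.

/* Illustrative only: what sp_256_mont_rsb_sub_dbl_4(x, x, y, m) computes. */
static void rsb_sub_dbl_sketch(sp_digit* x, sp_digit* y, const sp_digit* m)
{
    sp_digit t[4];
    sp_256_mont_dbl_4(t, y, m);      /* t = 2*Y mod m */
    sp_256_mont_sub_4(x, x, t, m);   /* X = X - 2*Y   */
    sp_256_mont_sub_4(y, y, x, m);   /* Y = Y - X     */
}

Because the helper leaves its b argument holding b - r, later call sites can drop a separate subtraction: in sp_256_proj_point_dbl_n_4 below, sp_256_mont_dbl_sub_4(b, b, x, m) becomes a plain sp_256_mont_dbl_4(b, b, m), since b already holds B - X when the "B = 2.(B - X)" step runs.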
@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
break; break;
case 14: case 14:
/* X = X - 2*Y */ /* X = X - 2*Y */
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); /* Y = Y - X */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15; ctx->state = 15;
break; break;
case 15: case 15:
ctx->state = 16; ctx->state = 16;
break; break;
case 16: case 16:
/* Y = Y - X */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17; ctx->state = 17;
break; break;
case 17: case 17:
@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
return err; return err;
} }
#endif /* WOLFSSL_SP_NONBLOCK */ #endif /* WOLFSSL_SP_NONBLOCK */
#ifdef __cplusplus
extern "C" {
#endif
extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
#ifdef __cplusplus
}
#endif
/* Double the Montgomery form projective point p a number of times. /* Double the Montgomery form projective point p a number of times.
* *
* r Result of repeated doubling of point. * r Result of repeated doubling of point.
@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod); sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */ /* t1 = Y^4 */
@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod); sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */ /* t1 = Y^4 */
@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r,
sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(x, x, t5, p256_mod); sp_256_mont_sub_4(x, x, t5, p256_mod);
sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(y, y, x, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, y, t5, p256_mod); sp_256_mont_sub_4(y, y, t5, p256_mod);
{ {
@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
ctx->state = 20; ctx->state = 20;
break; break;
case 20: case 20:
sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21; ctx->state = 21;
break; break;
case 21: case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22; ctx->state = 22;
break; break;
case 22: case 22:
@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
x = r[j].x; x = r[j].x;
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_4(b, b, x, p256_mod); sp_256_mont_dbl_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z; z = r[j].z;
@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4 #define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4
/* Double the Montgomery form projective point p. /* Double the Montgomery form projective point p.
* *
* r Result of doubling point. * r Result of doubling point.
@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
/* X = T1 * T1 */ /* X = T1 * T1 */
sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
/* X = X - 2*Y */ /* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y - X */ /* Y = Y - X */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y = Y * T1 */ /* Y = Y * T1 */
sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
/* Y = Y - T2 */ /* Y = Y - T2 */
@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
break; break;
case 14: case 14:
/* X = X - 2*Y */ /* X = X - 2*Y */
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); /* Y = Y - X */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 15; ctx->state = 15;
break; break;
case 15: case 15:
ctx->state = 16; ctx->state = 16;
break; break;
case 16: case 16:
/* Y = Y - X */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 17; ctx->state = 17;
break; break;
case 17: case 17:
@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
return err; return err;
} }
#endif /* WOLFSSL_SP_NONBLOCK */ #endif /* WOLFSSL_SP_NONBLOCK */
#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
/* Double the Montgomery form projective point p a number of times. /* Double the Montgomery form projective point p a number of times.
* *
* r Result of repeated doubling of point. * r Result of repeated doubling of point.
@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */ /* t1 = Y^4 */
@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
/* t1 = Y^4 */ /* t1 = Y^4 */
@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(x, x, t5, p256_mod); sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(y, y, x, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, y, t5, p256_mod); sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
{ {
@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
ctx->state = 20; ctx->state = 20;
break; break;
case 20: case 20:
sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
ctx->state = 21; ctx->state = 21;
break; break;
case 21: case 21:
/* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
ctx->state = 22; ctx->state = 22;
break; break;
case 22: case 22:
@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
x = r[j].x; x = r[j].x;
/* X = A^2 - 2B */ /* X = A^2 - 2B */
sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
/* B = 2.(B - X) */ /* B = 2.(B - X) */
sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); sp_256_mont_dbl_avx2_4(b, b, p256_mod);
/* Z = Z*Y */ /* Z = Z*Y */
sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
z = r[j].z; z = r[j].z;
@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(t2, t2, t1, p256_mod); sp_256_mont_sub_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod); sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_4(t3, t3, x, p256_mod);
sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_4(y, t3, t1, p256_mod); sp_256_mont_sub_4(y, t3, t1, p256_mod);
@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod);
sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod); sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod); sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod);
/* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod);
sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod);
sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod); sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod);

View File

@ -55378,11 +55378,12 @@ _sp_256_mul_avx2_4:
pushq %rbx pushq %rbx
movq %rdx, %rbp movq %rdx, %rbp
movq (%rsi), %rdx movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0] # A[0] * B[0]
mulxq (%rbp), %r8, %r9 mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx xorq %rbx, %rbx
# A[0] * B[1] # A[0] * B[1]
mulxq 8(%rbp), %rax, %r10 mulxq %r14, %rax, %r10
adcxq %rax, %r9 adcxq %rax, %r9
# A[0] * B[2] # A[0] * B[2]
mulxq 16(%rbp), %rax, %r11 mulxq 16(%rbp), %rax, %r11
@ -55397,7 +55398,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r9 adcxq %rax, %r9
# A[1] * B[1] # A[1] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r10 adoxq %rcx, %r10
adcxq %rax, %r10 adcxq %rax, %r10
# A[1] * B[2] # A[1] * B[2]
@ -55416,7 +55417,7 @@ _sp_256_mul_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r10 adcxq %rax, %r10
# A[2] * B[1] # A[2] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r11 adoxq %rcx, %r11
adcxq %rax, %r11 adcxq %rax, %r11
# A[2] * B[2] # A[2] * B[2]
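The three hunks above make one change: B[1] is loaded from 8(%rbp) into %r14 once and that register is fed to the mulx instructions shown, instead of re-reading the memory operand for every partial product. A rough C analogue of the hoist follows; the names and the use of the GCC/Clang unsigned __int128 extension are illustrative, not the generated code.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Illustrative sketch: partial products of a[0..2] with b[1], with b[1]
 * read from memory once and kept in a local (register). */
static void mul_col_b1_sketch(u128 p[3], const uint64_t a[4], const uint64_t b[4])
{
    const uint64_t b1 = b[1];   /* hoisted: was three separate loads */
    p[0] = (u128)a[0] * b1;
    p[1] = (u128)a[1] * b1;
    p[2] = (u128)a[2] * b1;
}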
@ -55981,11 +55982,10 @@ _sp_256_mont_mul_4:
adcq $0x00, %rbx adcq $0x00, %rbx
sbbq $0x00, %r9 sbbq $0x00, %r9
movq $0xffffffff00000001, %rsi movq $0xffffffff00000001, %rsi
movq %r9, %rax
# mask m and sub from result if overflow # mask m and sub from result if overflow
# m[0] = -1 & mask = mask # m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0 # m[2] = 0 & mask = 0
movl %r9d, %eax
andq %r9, %rsi andq %r9, %rsi
subq %r9, %r13 subq %r9, %r13
sbbq %rax, %r14 sbbq %rax, %r14
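In the reduction tail above, the masked m[1] word of the P-256 prime (0xffffffff) was previously built by copying the 0/all-ones mask and shifting its top half down; the new code takes the mask's low 32 bits with a zero-extending 32-bit move instead, saving the extra copy. A small sketch of why the two forms agree when the mask is 0 or all-ones; the function names are illustrative.

#include <stdint.h>

/* mask is 0 or 0xffffffffffffffff. Both forms give the masked 0xffffffff word. */
static uint64_t masked_m1_old(uint64_t mask) { return mask >> 32; }
static uint64_t masked_m1_new(uint64_t mask) { return (uint32_t)mask; }

The same substitution recurs through the rest of this file (and in the sp_384 routines below), where a 32-bit move of the mask's low half replaces a movq $0xffffffff constant load plus an andq.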
@ -56163,11 +56163,10 @@ _sp_256_mont_sqr_4:
adcq $0x00, %r15 adcq $0x00, %r15
sbbq $0x00, %r8 sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow # mask m and sub from result if overflow
# m[0] = -1 & mask = mask # m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0 # m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi andq %r8, %rsi
subq %r8, %r12 subq %r8, %r12
sbbq %rax, %r13 sbbq %rax, %r13
@ -56388,11 +56387,10 @@ _sp_256_mont_reduce_4:
adcq $0x00, %r15 adcq $0x00, %r15
sbbq $0x00, %r8 sbbq $0x00, %r8
movq $0xffffffff00000001, %rbx movq $0xffffffff00000001, %rbx
movq %r8, %rax
# mask m and sub from result if overflow # mask m and sub from result if overflow
# m[0] = -1 & mask = mask # m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0 # m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rbx andq %r8, %rbx
subq %r8, %r12 subq %r8, %r12
sbbq %rax, %r13 sbbq %rax, %r13
@ -56543,13 +56541,12 @@ _sp_256_mont_add_4:
movq 16(%rsi), %r8 movq 16(%rsi), %r8
movq 24(%rsi), %r9 movq 24(%rsi), %r9
addq (%rdx), %rax addq (%rdx), %rax
movq $0xffffffff, %r10
adcq 8(%rdx), %rcx adcq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11 movq $0xffffffff00000001, %r11
adcq 16(%rdx), %r8 adcq 16(%rdx), %r8
adcq 24(%rdx), %r9 adcq 24(%rdx), %r9
sbbq %rsi, %rsi sbbq %rsi, %rsi
andq %rsi, %r10 movl %esi, %r10d
andq %rsi, %r11 andq %rsi, %r11
subq %rsi, %rax subq %rsi, %rax
sbbq %r10, %rcx sbbq %r10, %rcx
@ -56593,13 +56590,13 @@ _sp_256_mont_dbl_4:
movq 16(%rsi), %rcx movq 16(%rsi), %rcx
movq 24(%rsi), %r8 movq 24(%rsi), %r8
addq %rdx, %rdx addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax adcq %rax, %rax
movq $0xffffffff00000001, %r10 movq $0xffffffff00000001, %r10
adcq %rcx, %rcx adcq %rcx, %rcx
movq %r8, %r11
adcq %r8, %r8 adcq %r8, %r8
sbbq %r11, %r11 sarq $63, %r11
andq %r11, %r9 movl %r11d, %r9d
andq %r11, %r10 andq %r11, %r10
subq %r11, %rdx subq %r11, %rdx
sbbq %r9, %rax sbbq %r9, %rax
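sp_256_mont_dbl_4 now copies the top word into %r11 before the doubling and turns it into the reduction mask with sarq $63: the carry out of doubling a 256-bit value is simply bit 63 of its highest word, so the mask no longer has to be recovered from the borrow flag afterwards with sbbq. A one-function sketch of the identity; the name is illustrative.

#include <stdint.h>

/* Illustrative: carry out of 2*a for a 4x64-bit a equals the top bit of a[3];
 * an arithmetic shift replicates it into a 0/all-ones mask. */
static uint64_t dbl_carry_mask(uint64_t a3)
{
    return (uint64_t)((int64_t)a3 >> 63);
}

The same pattern appears again in the Windows build of sp_256_mont_dbl_4 and in sp_384_mont_dbl_6 further down.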
@ -56643,13 +56640,12 @@ _sp_256_mont_tpl_4:
movq 16(%rsi), %rcx movq 16(%rsi), %rcx
movq 24(%rsi), %r8 movq 24(%rsi), %r8
addq %rdx, %rdx addq %rdx, %rdx
movq $0xffffffff, %r9
adcq %rax, %rax adcq %rax, %rax
movq $0xffffffff00000001, %r10 movq $0xffffffff00000001, %r10
adcq %rcx, %rcx adcq %rcx, %rcx
adcq %r8, %r8 adcq %r8, %r8
sbbq %r11, %r11 sbbq %r11, %r11
andq %r11, %r9 movl %r11d, %r9d
andq %r11, %r10 andq %r11, %r10
subq %r11, %rdx subq %r11, %rdx
sbbq %r9, %rax sbbq %r9, %rax
@ -56663,13 +56659,12 @@ _sp_256_mont_tpl_4:
sbbq $0x00, %rcx sbbq $0x00, %rcx
sbbq %r10, %r8 sbbq %r10, %r8
addq (%rsi), %rdx addq (%rsi), %rdx
movq $0xffffffff, %r9
adcq 8(%rsi), %rax adcq 8(%rsi), %rax
movq $0xffffffff00000001, %r10 movq $0xffffffff00000001, %r10
adcq 16(%rsi), %rcx adcq 16(%rsi), %rcx
adcq 24(%rsi), %r8 adcq 24(%rsi), %r8
sbbq %r11, %r11 sbbq $0x00, %r11
andq %r11, %r9 movl %r11d, %r9d
andq %r11, %r10 andq %r11, %r10
subq %r11, %rdx subq %r11, %rdx
sbbq %r9, %rax sbbq %r9, %rax
@ -56714,13 +56709,12 @@ _sp_256_mont_sub_4:
movq 16(%rsi), %r8 movq 16(%rsi), %r8
movq 24(%rsi), %r9 movq 24(%rsi), %r9
subq (%rdx), %rax subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11 movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8 sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9 sbbq 24(%rdx), %r9
sbbq %rsi, %rsi sbbq %rsi, %rsi
andq %rsi, %r10 movl %esi, %r10d
andq %rsi, %r11 andq %rsi, %r11
addq %rsi, %rax addq %rsi, %rax
adcq %r10, %rcx adcq %r10, %rcx
@ -56797,15 +56791,15 @@ _sp_256_div2_4:
*/ */
#ifndef __APPLE__ #ifndef __APPLE__
.text .text
.globl sp_256_mont_sub_dbl_4 .globl sp_256_mont_rsb_sub_dbl_4
.type sp_256_mont_sub_dbl_4,@function .type sp_256_mont_rsb_sub_dbl_4,@function
.align 16 .align 16
sp_256_mont_sub_dbl_4: sp_256_mont_rsb_sub_dbl_4:
#else #else
.section __TEXT,__text .section __TEXT,__text
.globl _sp_256_mont_sub_dbl_4 .globl _sp_256_mont_rsb_sub_dbl_4
.p2align 4 .p2align 4
_sp_256_mont_sub_dbl_4: _sp_256_mont_rsb_sub_dbl_4:
#endif /* __APPLE__ */ #endif /* __APPLE__ */
pushq %r12 pushq %r12
pushq %r13 pushq %r13
@ -56820,42 +56814,40 @@ _sp_256_mont_sub_dbl_4:
movq 16(%rdx), %r12 movq 16(%rdx), %r12
movq 24(%rdx), %r13 movq 24(%rdx), %r13
addq %r10, %r10 addq %r10, %r10
movq $0xffffffff, %r14
adcq %r11, %r11 adcq %r11, %r11
movq $0xffffffff00000001, %r15 movq $0xffffffff00000001, %r15
adcq %r12, %r12 adcq %r12, %r12
adcq %r13, %r13 adcq %r13, %r13
sbbq %rdx, %rdx sbbq %rsi, %rsi
andq %rdx, %r14 movl %esi, %r14d
andq %rdx, %r15 andq %rsi, %r15
subq %rdx, %r10 subq %rsi, %r10
sbbq %r14, %r11 sbbq %r14, %r11
sbbq $0x00, %r12 sbbq $0x00, %r12
sbbq %r15, %r13 sbbq %r15, %r13
adcq $0x00, %rdx adcq $0x00, %rsi
andq %rdx, %r14 andq %rsi, %r14
andq %rdx, %r15 andq %rsi, %r15
subq %rdx, %r10 subq %rsi, %r10
sbbq %r14, %r11 sbbq %r14, %r11
sbbq $0x00, %r12 sbbq $0x00, %r12
sbbq %r15, %r13 sbbq %r15, %r13
subq %r10, %rax subq %r10, %rax
movq $0xffffffff, %r14
sbbq %r11, %rcx sbbq %r11, %rcx
movq $0xffffffff00000001, %r15 movq $0xffffffff00000001, %r15
sbbq %r12, %r8 sbbq %r12, %r8
sbbq %r13, %r9 sbbq %r13, %r9
sbbq %rdx, %rdx sbbq $0x00, %rsi
andq %rdx, %r14 movl %esi, %r14d
andq %rdx, %r15 andq %rsi, %r15
addq %rdx, %rax addq %rsi, %rax
adcq %r14, %rcx adcq %r14, %rcx
adcq $0x00, %r8 adcq $0x00, %r8
adcq %r15, %r9 adcq %r15, %r9
adcq $0x00, %rdx adcq $0x00, %rsi
andq %rdx, %r14 andq %rsi, %r14
andq %rdx, %r15 andq %rsi, %r15
addq %rdx, %rax addq %rsi, %rax
adcq %r14, %rcx adcq %r14, %rcx
movq %rax, (%rdi) movq %rax, (%rdi)
adcq $0x00, %r8 adcq $0x00, %r8
@ -56863,73 +56855,40 @@ _sp_256_mont_sub_dbl_4:
adcq %r15, %r9 adcq %r15, %r9
movq %r8, 16(%rdi) movq %r8, 16(%rdi)
movq %r9, 24(%rdi) movq %r9, 24(%rdi)
movq (%rdx), %r10
movq 8(%rdx), %r11
movq 16(%rdx), %r12
movq 24(%rdx), %r13
subq %rax, %r10
sbbq %rcx, %r11
movq $0xffffffff00000001, %r15
sbbq %r8, %r12
sbbq %r9, %r13
sbbq %rsi, %rsi
movl %esi, %r14d
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
adcq $0x00, %r12
adcq %r15, %r13
adcq $0x00, %rsi
andq %rsi, %r14
andq %rsi, %r15
addq %rsi, %r10
adcq %r14, %r11
movq %r10, (%rdx)
adcq $0x00, %r12
movq %r11, 8(%rdx)
adcq %r15, %r13
movq %r12, 16(%rdx)
movq %r13, 24(%rdx)
popq %r15 popq %r15
popq %r14 popq %r14
popq %r13 popq %r13
popq %r12 popq %r12
repz retq repz retq
#ifndef __APPLE__ #ifndef __APPLE__
.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4 .size sp_256_mont_rsb_sub_dbl_4,.-sp_256_mont_rsb_sub_dbl_4
#endif /* __APPLE__ */
/* Two Montgomery numbers, subtract second from first and double.
* (r = 2.(a - b) % m).
*
* b must have came from a mont_sub operation.
*
* r Result of subtration.
* a Number to subtract from in Montgomery form.
* b Number to subtract with in Montgomery form.
* m Modulus (prime).
*/
#ifndef __APPLE__
.text
.globl sp_256_mont_dbl_sub_4
.type sp_256_mont_dbl_sub_4,@function
.align 16
sp_256_mont_dbl_sub_4:
#else
.section __TEXT,__text
.globl _sp_256_mont_dbl_sub_4
.p2align 4
_sp_256_mont_dbl_sub_4:
#endif /* __APPLE__ */
movq (%rsi), %rax
movq 8(%rsi), %rcx
movq 16(%rsi), %r8
movq 24(%rsi), %r9
subq (%rdx), %rax
movq $0xffffffff, %r10
sbbq 8(%rdx), %rcx
movq $0xffffffff00000001, %r11
sbbq 16(%rdx), %r8
sbbq 24(%rdx), %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
addq %rdx, %rax
adcq %r10, %rcx
adcq $0x00, %r8
adcq %r11, %r9
addq %rax, %rax
movq $0xffffffff, %r10
adcq %rcx, %rcx
movq $0xffffffff00000001, %r11
adcq %r8, %r8
adcq %r9, %r9
sbbq %rdx, %rdx
andq %rdx, %r10
andq %rdx, %r11
subq %rdx, %rax
sbbq %r10, %rcx
movq %rax, (%rdi)
sbbq $0x00, %r8
movq %rcx, 8(%rdi)
sbbq %r11, %r9
movq %r8, 16(%rdi)
movq %r9, 24(%rdi)
repz retq
#ifndef __APPLE__
.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4
#endif /* __APPLE__ */ #endif /* __APPLE__ */
#ifndef WC_NO_CACHE_RESISTANT #ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible point that could be being copied. /* Touch each possible point that could be being copied.
@ -57085,11 +57044,12 @@ _sp_256_mont_mul_avx2_4:
pushq %rbx pushq %rbx
movq %rdx, %rbp movq %rdx, %rbp
movq (%rsi), %rdx movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0] # A[0] * B[0]
mulxq (%rbp), %r8, %r9 mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx xorq %rbx, %rbx
# A[0] * B[1] # A[0] * B[1]
mulxq 8(%rbp), %rax, %r10 mulxq %r14, %rax, %r10
adcxq %rax, %r9 adcxq %rax, %r9
# A[0] * B[2] # A[0] * B[2]
mulxq 16(%rbp), %rax, %r11 mulxq 16(%rbp), %rax, %r11
@ -57104,7 +57064,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r9 adcxq %rax, %r9
# A[1] * B[1] # A[1] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r10 adoxq %rcx, %r10
adcxq %rax, %r10 adcxq %rax, %r10
# A[1] * B[2] # A[1] * B[2]
@ -57123,7 +57083,7 @@ _sp_256_mont_mul_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r10 adcxq %rax, %r10
# A[2] * B[1] # A[2] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r11 adoxq %rcx, %r11
adcxq %rax, %r11 adcxq %rax, %r11
# A[2] * B[2] # A[2] * B[2]
@ -57213,11 +57173,10 @@ _sp_256_mont_mul_avx2_4:
adcq $0x00, %r15 adcq $0x00, %r15
sbbq $0x00, %r8 sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow # mask m and sub from result if overflow
# m[0] = -1 & mask = mask # m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0 # m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi andq %r8, %rsi
subq %r8, %r12 subq %r8, %r12
sbbq %rax, %r13 sbbq %rax, %r13
@ -57378,11 +57337,10 @@ _sp_256_mont_sqr_avx2_4:
adcq $0x00, %r15 adcq $0x00, %r15
sbbq $0x00, %r8 sbbq $0x00, %r8
movq $0xffffffff00000001, %rsi movq $0xffffffff00000001, %rsi
movq %r8, %rax
# mask m and sub from result if overflow # mask m and sub from result if overflow
# m[0] = -1 & mask = mask # m[0] = -1 & mask = mask
shrq $32, %rax
# m[2] = 0 & mask = 0 # m[2] = 0 & mask = 0
movl %r8d, %eax
andq %r8, %rsi andq %r8, %rsi
subq %r8, %r12 subq %r8, %r12
sbbq %rax, %r13 sbbq %rax, %r13
@ -58352,11 +58310,12 @@ _sp_256_mont_mul_order_avx2_4:
pushq %rbx pushq %rbx
movq %rdx, %rbp movq %rdx, %rbp
movq (%rsi), %rdx movq (%rsi), %rdx
movq 8(%rbp), %r14
# A[0] * B[0] # A[0] * B[0]
mulxq (%rbp), %r8, %r9 mulxq (%rbp), %r8, %r9
xorq %rbx, %rbx xorq %rbx, %rbx
# A[0] * B[1] # A[0] * B[1]
mulxq 8(%rbp), %rax, %r10 mulxq %r14, %rax, %r10
adcxq %rax, %r9 adcxq %rax, %r9
# A[0] * B[2] # A[0] * B[2]
mulxq 16(%rbp), %rax, %r11 mulxq 16(%rbp), %rax, %r11
@ -58371,7 +58330,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r9 adcxq %rax, %r9
# A[1] * B[1] # A[1] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r10 adoxq %rcx, %r10
adcxq %rax, %r10 adcxq %rax, %r10
# A[1] * B[2] # A[1] * B[2]
@ -58390,7 +58349,7 @@ _sp_256_mont_mul_order_avx2_4:
xorq %rbx, %rbx xorq %rbx, %rbx
adcxq %rax, %r10 adcxq %rax, %r10
# A[2] * B[1] # A[2] * B[1]
mulxq 8(%rbp), %rax, %r15 mulxq %r14, %rax, %r15
adoxq %rcx, %r11 adoxq %rcx, %r11
adcxq %rax, %r11 adcxq %rax, %r11
# A[2] * B[2] # A[2] * B[2]
@ -60601,11 +60560,10 @@ _sp_384_mont_reduce_6:
# Subtract mod if carry # Subtract mod if carry
negq %r10 negq %r10
movq $0xfffffffffffffffe, %r9 movq $0xfffffffffffffffe, %r9
movq %r10, %rcx movl %r10d, %ecx
movq %r10, %r8 movq %r10, %r8
shrq $32, %rcx
shlq $32, %r8
andq %r10, %r9 andq %r10, %r9
shlq $32, %r8
subq %rcx, %rbx subq %rcx, %rbx
sbbq %r8, %rbp sbbq %r8, %rbp
sbbq %r9, %r11 sbbq %r9, %r11
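The P-384 reduction tail gets the analogous treatment: the masked low word of the modulus comes from a 32-bit move of the mask, and the shifted word is produced by shifting a copy of the mask itself. A sketch of the three masked low words of the P-384 prime, assuming mask is 0 or all-ones; the function name is illustrative.

#include <stdint.h>

/* Illustrative: masked low words of p384 = 2^384 - 2^128 - 2^96 + 2^32 - 1. */
static void p384_masked_words_sketch(uint64_t mask, uint64_t w[3])
{
    w[0] = (uint32_t)mask;                  /* 0x00000000ffffffff & mask */
    w[1] = mask << 32;                      /* 0xffffffff00000000 & mask */
    w[2] = mask & 0xfffffffffffffffeULL;    /* 0xfffffffffffffffe & mask */
}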
@ -60851,7 +60809,6 @@ _sp_384_mont_add_6:
movq 32(%rsi), %r10 movq 32(%rsi), %r10
movq 40(%rsi), %r11 movq 40(%rsi), %r11
addq (%rdx), %rax addq (%rdx), %rax
movq $0xffffffff, %r12
adcq 8(%rdx), %rcx adcq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13 movq $0xffffffff00000000, %r13
adcq 16(%rdx), %r8 adcq 16(%rdx), %r8
@ -60860,7 +60817,7 @@ _sp_384_mont_add_6:
adcq 32(%rdx), %r10 adcq 32(%rdx), %r10
adcq 40(%rdx), %r11 adcq 40(%rdx), %r11
sbbq %rsi, %rsi sbbq %rsi, %rsi
andq %rsi, %r12 movl %esi, %r12d
andq %rsi, %r13 andq %rsi, %r13
andq %rsi, %r14 andq %rsi, %r14
subq %r12, %rax subq %r12, %rax
@ -60920,16 +60877,16 @@ _sp_384_mont_dbl_6:
movq 32(%rsi), %r9 movq 32(%rsi), %r9
movq 40(%rsi), %r10 movq 40(%rsi), %r10
addq %rdx, %rdx addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax adcq %rax, %rax
movq $0xffffffff00000000, %r12 movq $0xffffffff00000000, %r12
adcq %rcx, %rcx adcq %rcx, %rcx
movq $0xfffffffffffffffe, %r13 movq $0xfffffffffffffffe, %r13
adcq %r8, %r8 adcq %r8, %r8
adcq %r9, %r9 adcq %r9, %r9
movq %r10, %r14
adcq %r10, %r10 adcq %r10, %r10
sbbq %r14, %r14 sarq $63, %r14
andq %r14, %r11 movl %r14d, %r11d
andq %r14, %r12 andq %r14, %r12
andq %r14, %r13 andq %r14, %r13
subq %r11, %rdx subq %r11, %rdx
@ -60989,7 +60946,6 @@ _sp_384_mont_tpl_6:
movq 32(%rsi), %r9 movq 32(%rsi), %r9
movq 40(%rsi), %r10 movq 40(%rsi), %r10
addq %rdx, %rdx addq %rdx, %rdx
movq $0xffffffff, %r11
adcq %rax, %rax adcq %rax, %rax
movq $0xffffffff00000000, %r12 movq $0xffffffff00000000, %r12
adcq %rcx, %rcx adcq %rcx, %rcx
@ -60998,7 +60954,7 @@ _sp_384_mont_tpl_6:
adcq %r9, %r9 adcq %r9, %r9
adcq %r10, %r10 adcq %r10, %r10
sbbq %r14, %r14 sbbq %r14, %r14
andq %r14, %r11 movl %r14d, %r11d
andq %r14, %r12 andq %r14, %r12
andq %r14, %r13 andq %r14, %r13
subq %r11, %rdx subq %r11, %rdx
@ -61019,7 +60975,6 @@ _sp_384_mont_tpl_6:
sbbq %r14, %r9 sbbq %r14, %r9
sbbq %r14, %r10 sbbq %r14, %r10
addq (%rsi), %rdx addq (%rsi), %rdx
movq $0xffffffff, %r11
adcq 8(%rsi), %rax adcq 8(%rsi), %rax
movq $0xffffffff00000000, %r12 movq $0xffffffff00000000, %r12
adcq 16(%rsi), %rcx adcq 16(%rsi), %rcx
@ -61028,7 +60983,7 @@ _sp_384_mont_tpl_6:
adcq 32(%rsi), %r9 adcq 32(%rsi), %r9
adcq 40(%rsi), %r10 adcq 40(%rsi), %r10
sbbq %r14, %r14 sbbq %r14, %r14
andq %r14, %r11 movl %r14d, %r11d
andq %r14, %r12 andq %r14, %r12
andq %r14, %r13 andq %r14, %r13
subq %r11, %rdx subq %r11, %rdx
@ -61089,7 +61044,6 @@ _sp_384_mont_sub_6:
movq 32(%rsi), %r10 movq 32(%rsi), %r10
movq 40(%rsi), %r11 movq 40(%rsi), %r11
subq (%rdx), %rax subq (%rdx), %rax
movq $0xffffffff, %r12
sbbq 8(%rdx), %rcx sbbq 8(%rdx), %rcx
movq $0xffffffff00000000, %r13 movq $0xffffffff00000000, %r13
sbbq 16(%rdx), %r8 sbbq 16(%rdx), %r8
@ -61098,7 +61052,7 @@ _sp_384_mont_sub_6:
sbbq 32(%rdx), %r10 sbbq 32(%rdx), %r10
sbbq 40(%rdx), %r11 sbbq 40(%rdx), %r11
sbbq %rsi, %rsi sbbq %rsi, %rsi
andq %rsi, %r12 movl %esi, %r12d
andq %rsi, %r13 andq %rsi, %r13
andq %rsi, %r14 andq %rsi, %r14
addq %r12, %rax addq %r12, %rax

View File

@ -54329,11 +54329,12 @@ sp_256_mul_avx2_4 PROC
mov rbp, r8 mov rbp, r8
mov rax, rdx mov rax, rdx
mov rdx, QWORD PTR [rax] mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0] ; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp] mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx xor rbx, rbx
; A[0] * B[1] ; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8] mulx r10, rdi, r14
adcx r9, rdi adcx r9, rdi
; A[0] * B[2] ; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16] mulx r11, rdi, QWORD PTR [rbp+16]
@ -54348,7 +54349,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r9, rdi adcx r9, rdi
; A[1] * B[1] ; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r10, rsi adox r10, rsi
adcx r10, rdi adcx r10, rdi
; A[1] * B[2] ; A[1] * B[2]
@ -54367,7 +54368,7 @@ sp_256_mul_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r10, rdi adcx r10, rdi
; A[2] * B[1] ; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r11, rsi adox r11, rsi
adcx r11, rdi adcx r11, rdi
; A[2] * B[2] ; A[2] * B[2]
@ -54884,11 +54885,10 @@ sp_256_mont_mul_4 PROC
adc rbx, 0 adc rbx, 0
sbb r11, 0 sbb r11, 0
mov r10, 18446744069414584321 mov r10, 18446744069414584321
mov rax, r11
; mask m and sub from result if overflow ; mask m and sub from result if overflow
; m[0] = -1 & mask = mask ; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0 ; m[2] = 0 & mask = 0
mov eax, r11d
and r10, r11 and r10, r11
sub r15, r11 sub r15, r11
sbb rdi, rax sbb rdi, rax
@ -55060,11 +55060,10 @@ sp_256_mont_sqr_4 PROC
adc rsi, 0 adc rsi, 0
sbb r10, 0 sbb r10, 0
mov r8, 18446744069414584321 mov r8, 18446744069414584321
mov rax, r10
; mask m and sub from result if overflow ; mask m and sub from result if overflow
; m[0] = -1 & mask = mask ; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0 ; m[2] = 0 & mask = 0
mov eax, r10d
and r8, r10 and r8, r10
sub r14, r10 sub r14, r10
sbb r15, rax sbb r15, rax
@ -55263,11 +55262,10 @@ sp_256_mont_reduce_4 PROC
adc rdi, 0 adc rdi, 0
sbb r9, 0 sbb r9, 0
mov rbx, 18446744069414584321 mov rbx, 18446744069414584321
mov rax, r9
; mask m and sub from result if overflow ; mask m and sub from result if overflow
; m[0] = -1 & mask = mask ; m[0] = -1 & mask = mask
shr rax, 32
; m[2] = 0 & mask = 0 ; m[2] = 0 & mask = 0
mov eax, r9d
and rbx, r9 and rbx, r9
sub r13, r9 sub r13, r9
sbb r14, rax sbb r14, rax
@ -55404,13 +55402,12 @@ sp_256_mont_add_4 PROC
mov r10, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24] mov r11, QWORD PTR [rdx+24]
add rax, QWORD PTR [r8] add rax, QWORD PTR [r8]
mov r12, 4294967295
adc r9, QWORD PTR [r8+8] adc r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321 mov r13, 18446744069414584321
adc r10, QWORD PTR [r8+16] adc r10, QWORD PTR [r8+16]
adc r11, QWORD PTR [r8+24] adc r11, QWORD PTR [r8+24]
sbb rdx, rdx sbb rdx, rdx
and r12, rdx mov r12d, edx
and r13, rdx and r13, rdx
sub rax, rdx sub rax, rdx
sbb r9, r12 sbb r9, r12
@ -55447,13 +55444,13 @@ sp_256_mont_dbl_4 PROC
mov r9, QWORD PTR [rdx+16] mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24] mov r10, QWORD PTR [rdx+24]
add rax, rax add rax, rax
mov r11, 4294967295
adc r8, r8 adc r8, r8
mov r12, 18446744069414584321 mov r12, 18446744069414584321
adc r9, r9 adc r9, r9
mov r13, r10
adc r10, r10 adc r10, r10
sbb r13, r13 sar r13, 63
and r11, r13 mov r11d, r13d
and r12, r13 and r12, r13
sub rax, r13 sub rax, r13
sbb r8, r11 sbb r8, r11
@ -55490,13 +55487,12 @@ sp_256_mont_tpl_4 PROC
mov r9, QWORD PTR [rdx+16] mov r9, QWORD PTR [rdx+16]
mov r10, QWORD PTR [rdx+24] mov r10, QWORD PTR [rdx+24]
add rax, rax add rax, rax
mov r11, 4294967295
adc r8, r8 adc r8, r8
mov r12, 18446744069414584321 mov r12, 18446744069414584321
adc r9, r9 adc r9, r9
adc r10, r10 adc r10, r10
sbb r13, r13 sbb r13, r13
and r11, r13 mov r11d, r13d
and r12, r13 and r12, r13
sub rax, r13 sub rax, r13
sbb r8, r11 sbb r8, r11
@ -55510,13 +55506,12 @@ sp_256_mont_tpl_4 PROC
sbb r9, 0 sbb r9, 0
sbb r10, r12 sbb r10, r12
add rax, QWORD PTR [rdx] add rax, QWORD PTR [rdx]
mov r11, 4294967295
adc r8, QWORD PTR [rdx+8] adc r8, QWORD PTR [rdx+8]
mov r12, 18446744069414584321 mov r12, 18446744069414584321
adc r9, QWORD PTR [rdx+16] adc r9, QWORD PTR [rdx+16]
adc r10, QWORD PTR [rdx+24] adc r10, QWORD PTR [rdx+24]
sbb r13, r13 sbb r13, 0
and r11, r13 mov r11d, r13d
and r12, r13 and r12, r13
sub rax, r13 sub rax, r13
sbb r8, r11 sbb r8, r11
@ -55554,13 +55549,12 @@ sp_256_mont_sub_4 PROC
mov r10, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24] mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8] sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8] sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321 mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16] sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24] sbb r11, QWORD PTR [r8+24]
sbb rdx, rdx sbb rdx, rdx
and r12, rdx mov r12d, edx
and r13, rdx and r13, rdx
add rax, rdx add rax, rdx
adc r9, r12 adc r9, r12
@ -55630,7 +55624,7 @@ _text ENDS
; * m Modulus (prime). ; * m Modulus (prime).
; */ ; */
_text SEGMENT READONLY PARA _text SEGMENT READONLY PARA
sp_256_mont_sub_dbl_4 PROC sp_256_mont_rsb_sub_dbl_4 PROC
push r12 push r12
push r13 push r13
push r14 push r14
@ -55646,42 +55640,40 @@ sp_256_mont_sub_dbl_4 PROC
mov r14, QWORD PTR [r8+16] mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24] mov r15, QWORD PTR [r8+24]
add r12, r12 add r12, r12
mov rdi, 4294967295
adc r13, r13 adc r13, r13
mov rsi, 18446744069414584321 mov rsi, 18446744069414584321
adc r14, r14 adc r14, r14
adc r15, r15 adc r15, r15
sbb r8, r8 sbb rdx, rdx
and rdi, r8 mov edi, edx
and rsi, r8 and rsi, rdx
sub r12, r8 sub r12, rdx
sbb r13, rdi sbb r13, rdi
sbb r14, 0 sbb r14, 0
sbb r15, rsi sbb r15, rsi
adc r8, 0 adc rdx, 0
and rdi, r8 and rdi, rdx
and rsi, r8 and rsi, rdx
sub r12, r8 sub r12, rdx
sbb r13, rdi sbb r13, rdi
sbb r14, 0 sbb r14, 0
sbb r15, rsi sbb r15, rsi
sub rax, r12 sub rax, r12
mov rdi, 4294967295
sbb r9, r13 sbb r9, r13
mov rsi, 18446744069414584321 mov rsi, 18446744069414584321
sbb r10, r14 sbb r10, r14
sbb r11, r15 sbb r11, r15
sbb r8, r8 sbb rdx, 0
and rdi, r8 mov edi, edx
and rsi, r8 and rsi, rdx
add rax, r8 add rax, rdx
adc r9, rdi adc r9, rdi
adc r10, 0 adc r10, 0
adc r11, rsi adc r11, rsi
adc r8, 0 adc rdx, 0
and rdi, r8 and rdi, rdx
and rsi, r8 and rsi, rdx
add rax, r8 add rax, rdx
adc r9, rdi adc r9, rdi
mov QWORD PTR [rcx], rax mov QWORD PTR [rcx], rax
adc r10, 0 adc r10, 0
@ -55689,6 +55681,33 @@ sp_256_mont_sub_dbl_4 PROC
adc r11, rsi adc r11, rsi
mov QWORD PTR [rcx+16], r10 mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11 mov QWORD PTR [rcx+24], r11
mov r12, QWORD PTR [r8]
mov r13, QWORD PTR [r8+8]
mov r14, QWORD PTR [r8+16]
mov r15, QWORD PTR [r8+24]
sub r12, rax
sbb r13, r9
mov rsi, 18446744069414584321
sbb r14, r10
sbb r15, r11
sbb rdx, rdx
mov edi, edx
and rsi, rdx
add r12, rdx
adc r13, rdi
adc r14, 0
adc r15, rsi
adc rdx, 0
and rdi, rdx
and rsi, rdx
add r12, rdx
adc r13, rdi
mov QWORD PTR [r8], r12
adc r14, 0
mov QWORD PTR [r8+8], r13
adc r15, rsi
mov QWORD PTR [r8+16], r14
mov QWORD PTR [r8+24], r15
pop rsi pop rsi
pop rdi pop rdi
pop r15 pop r15
@ -55696,60 +55715,7 @@ sp_256_mont_sub_dbl_4 PROC
pop r13 pop r13
pop r12 pop r12
ret ret
sp_256_mont_sub_dbl_4 ENDP sp_256_mont_rsb_sub_dbl_4 ENDP
_text ENDS
; /* Two Montgomery numbers, subtract second from first and double.
; * (r = 2.(a - b) % m).
; *
; * b must have came from a mont_sub operation.
; *
; * r Result of subtration.
; * a Number to subtract from in Montgomery form.
; * b Number to subtract with in Montgomery form.
; * m Modulus (prime).
; */
_text SEGMENT READONLY PARA
sp_256_mont_dbl_sub_4 PROC
push r12
push r13
mov rax, QWORD PTR [rdx]
mov r9, QWORD PTR [rdx+8]
mov r10, QWORD PTR [rdx+16]
mov r11, QWORD PTR [rdx+24]
sub rax, QWORD PTR [r8]
mov r12, 4294967295
sbb r9, QWORD PTR [r8+8]
mov r13, 18446744069414584321
sbb r10, QWORD PTR [r8+16]
sbb r11, QWORD PTR [r8+24]
sbb r8, r8
and r12, r8
and r13, r8
add rax, r8
adc r9, r12
adc r10, 0
adc r11, r13
add rax, rax
mov r12, 4294967295
adc r9, r9
mov r13, 18446744069414584321
adc r10, r10
adc r11, r11
sbb r8, r8
and r12, r8
and r13, r8
sub rax, r8
sbb r9, r12
mov QWORD PTR [rcx], rax
sbb r10, 0
mov QWORD PTR [rcx+8], r9
sbb r11, r13
mov QWORD PTR [rcx+16], r10
mov QWORD PTR [rcx+24], r11
pop r13
pop r12
ret
sp_256_mont_dbl_sub_4 ENDP
_text ENDS _text ENDS
IFNDEF WC_NO_CACHE_RESISTANT IFNDEF WC_NO_CACHE_RESISTANT
; /* Touch each possible point that could be being copied. ; /* Touch each possible point that could be being copied.
@ -55908,11 +55874,12 @@ sp_256_mont_mul_avx2_4 PROC
mov rbp, r8 mov rbp, r8
mov rax, rdx mov rax, rdx
mov rdx, QWORD PTR [rax] mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0] ; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp] mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx xor rbx, rbx
; A[0] * B[1] ; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8] mulx r10, rdi, r14
adcx r9, rdi adcx r9, rdi
; A[0] * B[2] ; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16] mulx r11, rdi, QWORD PTR [rbp+16]
@ -55927,7 +55894,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r9, rdi adcx r9, rdi
; A[1] * B[1] ; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r10, rsi adox r10, rsi
adcx r10, rdi adcx r10, rdi
; A[1] * B[2] ; A[1] * B[2]
@ -55946,7 +55913,7 @@ sp_256_mont_mul_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r10, rdi adcx r10, rdi
; A[2] * B[1] ; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r11, rsi adox r11, rsi
adcx r11, rdi adcx r11, rdi
; A[2] * B[2] ; A[2] * B[2]
@ -56036,11 +56003,10 @@ sp_256_mont_mul_avx2_4 PROC
adc r15, 0 adc r15, 0
sbb r8, 0 sbb r8, 0
mov rax, 18446744069414584321 mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow ; mask m and sub from result if overflow
; m[0] = -1 & mask = mask ; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0 ; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8 and rax, r8
sub r12, r8 sub r12, r8
sbb r13, rdi sbb r13, rdi
@ -56195,11 +56161,10 @@ sp_256_mont_sqr_avx2_4 PROC
adc r15, 0 adc r15, 0
sbb r8, 0 sbb r8, 0
mov rax, 18446744069414584321 mov rax, 18446744069414584321
mov rdi, r8
; mask m and sub from result if overflow ; mask m and sub from result if overflow
; m[0] = -1 & mask = mask ; m[0] = -1 & mask = mask
shr rdi, 32
; m[2] = 0 & mask = 0 ; m[2] = 0 & mask = 0
mov edi, r8d
and rax, r8 and rax, r8
sub r12, r8 sub r12, r8
sbb r13, rdi sbb r13, rdi
@ -57053,11 +57018,12 @@ sp_256_mont_mul_order_avx2_4 PROC
mov rbp, r8 mov rbp, r8
mov rax, rdx mov rax, rdx
mov rdx, QWORD PTR [rax] mov rdx, QWORD PTR [rax]
mov r14, QWORD PTR [rbp+8]
; A[0] * B[0] ; A[0] * B[0]
mulx r9, r8, QWORD PTR [rbp] mulx r9, r8, QWORD PTR [rbp]
xor rbx, rbx xor rbx, rbx
; A[0] * B[1] ; A[0] * B[1]
mulx r10, rdi, QWORD PTR [rbp+8] mulx r10, rdi, r14
adcx r9, rdi adcx r9, rdi
; A[0] * B[2] ; A[0] * B[2]
mulx r11, rdi, QWORD PTR [rbp+16] mulx r11, rdi, QWORD PTR [rbp+16]
@ -57072,7 +57038,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r9, rdi adcx r9, rdi
; A[1] * B[1] ; A[1] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r10, rsi adox r10, rsi
adcx r10, rdi adcx r10, rdi
; A[1] * B[2] ; A[1] * B[2]
@ -57091,7 +57057,7 @@ sp_256_mont_mul_order_avx2_4 PROC
xor rbx, rbx xor rbx, rbx
adcx r10, rdi adcx r10, rdi
; A[2] * B[1] ; A[2] * B[1]
mulx r15, rdi, QWORD PTR [rbp+8] mulx r15, rdi, r14
adox r11, rsi adox r11, rsi
adcx r11, rdi adcx r11, rdi
; A[2] * B[2] ; A[2] * B[2]
@ -59213,11 +59179,10 @@ sp_384_mont_reduce_6 PROC
; Subtract mod if carry ; Subtract mod if carry
neg r11 neg r11
mov r10, 18446744073709551614 mov r10, 18446744073709551614
mov r8, r11 mov r8d, r11d
mov r9, r11 mov r9, r11
shr r8, 32
shl r9, 32
and r10, r11 and r10, r11
shl r9, 32
sub rbx, r8 sub rbx, r8
sbb rbp, r9 sbb rbp, r9
sbb r12, r10 sbb r12, r10
@ -59436,7 +59401,6 @@ sp_384_mont_add_6 PROC
mov r12, QWORD PTR [rdx+32] mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40] mov r13, QWORD PTR [rdx+40]
add rax, QWORD PTR [r8] add rax, QWORD PTR [r8]
mov r14, 4294967295
adc r9, QWORD PTR [r8+8] adc r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320 mov r15, 18446744069414584320
adc r10, QWORD PTR [r8+16] adc r10, QWORD PTR [r8+16]
@ -59445,7 +59409,7 @@ sp_384_mont_add_6 PROC
adc r12, QWORD PTR [r8+32] adc r12, QWORD PTR [r8+32]
adc r13, QWORD PTR [r8+40] adc r13, QWORD PTR [r8+40]
sbb rdx, rdx sbb rdx, rdx
and r14, rdx mov r14d, edx
and r15, rdx and r15, rdx
and rdi, rdx and rdi, rdx
sub rax, r14 sub rax, r14
@ -59498,16 +59462,16 @@ sp_384_mont_dbl_6 PROC
mov r11, QWORD PTR [rdx+32] mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40] mov r12, QWORD PTR [rdx+40]
add rax, rax add rax, rax
mov r13, 4294967295
adc r8, r8 adc r8, r8
mov r14, 18446744069414584320 mov r14, 18446744069414584320
adc r9, r9 adc r9, r9
mov r15, 18446744073709551614 mov r15, 18446744073709551614
adc r10, r10 adc r10, r10
adc r11, r11 adc r11, r11
mov rdi, r12
adc r12, r12 adc r12, r12
sbb rdi, rdi sar rdi, 63
and r13, rdi mov r13d, edi
and r14, rdi and r14, rdi
and r15, rdi and r15, rdi
sub rax, r13 sub rax, r13
@ -59560,7 +59524,6 @@ sp_384_mont_tpl_6 PROC
mov r11, QWORD PTR [rdx+32] mov r11, QWORD PTR [rdx+32]
mov r12, QWORD PTR [rdx+40] mov r12, QWORD PTR [rdx+40]
add rax, rax add rax, rax
mov r13, 4294967295
adc r8, r8 adc r8, r8
mov r14, 18446744069414584320 mov r14, 18446744069414584320
adc r9, r9 adc r9, r9
@ -59569,7 +59532,7 @@ sp_384_mont_tpl_6 PROC
adc r11, r11 adc r11, r11
adc r12, r12 adc r12, r12
sbb rdi, rdi sbb rdi, rdi
and r13, rdi mov r13d, edi
and r14, rdi and r14, rdi
and r15, rdi and r15, rdi
sub rax, r13 sub rax, r13
@ -59590,7 +59553,6 @@ sp_384_mont_tpl_6 PROC
sbb r11, rdi sbb r11, rdi
sbb r12, rdi sbb r12, rdi
add rax, QWORD PTR [rdx] add rax, QWORD PTR [rdx]
mov r13, 4294967295
adc r8, QWORD PTR [rdx+8] adc r8, QWORD PTR [rdx+8]
mov r14, 18446744069414584320 mov r14, 18446744069414584320
adc r9, QWORD PTR [rdx+16] adc r9, QWORD PTR [rdx+16]
@ -59599,7 +59561,7 @@ sp_384_mont_tpl_6 PROC
adc r11, QWORD PTR [rdx+32] adc r11, QWORD PTR [rdx+32]
adc r12, QWORD PTR [rdx+40] adc r12, QWORD PTR [rdx+40]
sbb rdi, rdi sbb rdi, rdi
and r13, rdi mov r13d, edi
and r14, rdi and r14, rdi
and r15, rdi and r15, rdi
sub rax, r13 sub rax, r13
@ -59653,7 +59615,6 @@ sp_384_mont_sub_6 PROC
mov r12, QWORD PTR [rdx+32] mov r12, QWORD PTR [rdx+32]
mov r13, QWORD PTR [rdx+40] mov r13, QWORD PTR [rdx+40]
sub rax, QWORD PTR [r8] sub rax, QWORD PTR [r8]
mov r14, 4294967295
sbb r9, QWORD PTR [r8+8] sbb r9, QWORD PTR [r8+8]
mov r15, 18446744069414584320 mov r15, 18446744069414584320
sbb r10, QWORD PTR [r8+16] sbb r10, QWORD PTR [r8+16]
@ -59662,7 +59623,7 @@ sp_384_mont_sub_6 PROC
sbb r12, QWORD PTR [r8+32] sbb r12, QWORD PTR [r8+32]
sbb r13, QWORD PTR [r8+40] sbb r13, QWORD PTR [r8+40]
sbb rdx, rdx sbb rdx, rdx
and r14, rdx mov r14d, edx
and r15, rdx and r15, rdx
and rdi, rdx and rdi, rdx
add rax, r14 add rax, r14