forked from wolfSSL/wolfssl
Merge pull request #6776 from SparkiDev/sp_ecc_x64
SP ECC: x64 minor speed improvement
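In brief: the point doubling/addition code used to compute X = X - 2*Y and then Y = Y - X as two Montgomery calls (sp_256_mont_sub_dbl_4 followed by sp_256_mont_sub_4); this change folds them into one sp_256_mont_rsb_sub_dbl_4, whose b argument loses its const qualifier because the reverse subtraction is written back through it. That in turn makes sp_256_mont_dbl_sub_4 unnecessary (a plain mont_dbl suffices), and the x64 assembly additionally shaves mask-building and operand-load instructions. A one-word C model of the combined primitive's semantics — a sketch under assumed naming, not the shipped code; mont_sub here is a stand-in helper:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sp_digit;

    /* Stand-in one-word modular subtract; the real functions work on
     * 4x64-bit P-256 values in Montgomery form. */
    static sp_digit mont_sub(sp_digit a, sp_digit b, sp_digit m)
    {
        return (a >= b) ? (a - b) : (a - b + m); /* wraps, then +m corrects */
    }

    /* Combined primitive: r = a - 2*b (mod m), then b = b - r (mod m).
     * One call replaces the former sub_dbl + sub pair. */
    static void mont_rsb_sub_dbl(sp_digit* r, sp_digit a, sp_digit* b,
                                 sp_digit m)
    {
        *r = mont_sub(mont_sub(a, *b, m), *b, m);
        *b = mont_sub(*b, *r, m);
    }

    int main(void)
    {
        sp_digit r, b = 5;
        mont_rsb_sub_dbl(&r, 3, &b, 101); /* toy modulus, not P-256 */
        printf("r=%llu b=%llu\n", (unsigned long long)r,
               (unsigned long long)b);    /* prints r=94 b=12 */
        return 0;
    }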
@@ -71903,8 +71903,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "sbcs r9, r9, #0\n\t"
         "sbcs r10, r10, r12, LSR #31\n\t"
         "sbcs r11, r11, r12\n\t"
-        "rsb r12, r12, #0\n\t"
-        "sbc r12, r12, #0\n\t"
+        "sbc r2, r2, r2\n\t"
+        "sub r12, r12, r2\n\t"
+        "subs r4, r4, r12\n\t"
+        "sbcs r5, r5, r12\n\t"
+        "sbcs r6, r6, r12\n\t"
+        "sbcs r7, r7, #0\n\t"
+        "sbcs r8, r8, #0\n\t"
+        "sbcs r9, r9, #0\n\t"
+        "sbcs r10, r10, r12, LSR #31\n\t"
+        "sbc r11, r11, r12\n\t"
         "ldm %[a]!, {r2, r3}\n\t"
         "adds r4, r4, r2\n\t"
         "adcs r5, r5, r3\n\t"
@@ -33116,8 +33116,16 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit
         "SBCS r9, r9, #0x0\n\t"
         "SBCS r10, r10, r12, LSR #31\n\t"
         "SBCS r11, r11, r12\n\t"
-        "RSB r12, r12, #0x0\n\t"
-        "SBC r12, r12, #0x0\n\t"
+        "SBC r2, r2, r2\n\t"
+        "SUB r12, r12, r2\n\t"
+        "SUBS r4, r4, r12\n\t"
+        "SBCS r5, r5, r12\n\t"
+        "SBCS r6, r6, r12\n\t"
+        "SBCS r7, r7, #0x0\n\t"
+        "SBCS r8, r8, #0x0\n\t"
+        "SBCS r9, r9, #0x0\n\t"
+        "SBCS r10, r10, r12, LSR #31\n\t"
+        "SBC r11, r11, r12\n\t"
         "LDM %[a]!, {r2, r3}\n\t"
         "ADDS r4, r4, r2\n\t"
         "ADCS r5, r5, r3\n\t"
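Both ARM hunks above make the same change to sp_256_mont_tpl_8: rather than folding the final borrow back into r12 with rsb/sbc, the new code derives the mask differently and appends one more masked subtraction of the modulus across the eight 32-bit words — the r12 / #0 / r12,LSR #31 operand pattern tracks the word layout of p256. A C sketch of that masked-subtract shape (illustrative only; the asm derives mask from the carry flag, and masked_sub_p256 is a made-up name):

    #include <stdint.h>

    /* p256 as eight 32-bit words, least significant first. */
    static const uint32_t P256[8] = {
        0xffffffffU, 0xffffffffU, 0xffffffffU, 0x00000000U,
        0x00000000U, 0x00000000U, 0x00000001U, 0xffffffffU
    };

    /* Subtract (mask & p256) from r with borrow propagation; mask is 0 or
     * 0xffffffff. Words 0-2 subtract the mask itself, words 3-5 subtract 0,
     * word 6 subtracts mask>>31 and word 7 subtracts the mask again —
     * matching the added subs/sbcs run above. */
    static void masked_sub_p256(uint32_t r[8], uint32_t mask)
    {
        uint64_t borrow = 0;
        for (int i = 0; i < 8; i++) {
            uint64_t d = (uint64_t)r[i] - (P256[i] & mask) - borrow;
            r[i] = (uint32_t)d;
            borrow = (d >> 63) & 1; /* 1 if the subtraction wrapped */
        }
    }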
@@ -8607,7 +8607,7 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
+extern void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, sp_digit* b, const sp_digit* m);
 #ifdef __cplusplus
 }
 #endif
@@ -8661,9 +8661,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
     /* X = T1 * T1 */
     sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
     /* X = X - 2*Y */
-    sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
     /* Y = Y - X */
-    sp_256_mont_sub_4(y, y, x, p256_mod);
+    sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
     /* Y = Y * T1 */
     sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
     /* Y = Y - T2 */
@@ -8775,15 +8774,14 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
         break;
     case 14:
         /* X = X - 2*Y */
-        sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
+        /* Y = Y - X */
+        sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 15;
         break;
     case 15:
         ctx->state = 16;
         break;
     case 16:
-        /* Y = Y - X */
-        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 17;
         break;
     case 17:
@@ -8808,13 +8806,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-#ifdef __cplusplus
-extern "C" {
-#endif
-extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m);
-#ifdef __cplusplus
-}
-#endif
 /* Double the Montgomery form projective point p a number of times.
  *
  * r Result of repeated doubling of point.
@@ -8858,9 +8849,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
         sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
         /* t1 = Y^4 */
@@ -8886,9 +8877,9 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
         sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod);
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
         /* t1 = Y^4 */
@@ -8981,9 +8972,8 @@ static void sp_256_proj_point_add_4(sp_point_256* r,
         sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(x, x, t5, p256_mod);
         sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
         /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_4(y, y, x, p256_mod);
+        sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod);
         sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, y, t5, p256_mod);
         {
@@ -9159,12 +9149,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
@@ -9263,9 +9252,9 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         x = r[j].x;
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
         z = r[j].z;
@@ -9764,7 +9753,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
 #ifdef __cplusplus
 }
 #endif
-#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4
+#define sp_256_mont_rsb_sub_dbl_avx2_4 sp_256_mont_rsb_sub_dbl_4
 /* Double the Montgomery form projective point p.
  *
  * r Result of doubling point.
@@ -9815,9 +9804,8 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
     /* X = T1 * T1 */
     sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
     /* X = X - 2*Y */
-    sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
     /* Y = Y - X */
-    sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
+    sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
     /* Y = Y * T1 */
     sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
     /* Y = Y - T2 */
@@ -9929,15 +9917,14 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
         break;
     case 14:
         /* X = X - 2*Y */
-        sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
+        /* Y = Y - X */
+        sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 15;
         break;
     case 15:
         ctx->state = 16;
         break;
     case 16:
-        /* Y = Y - X */
-        sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 17;
         break;
     case 17:
@@ -9962,7 +9949,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
 /* Double the Montgomery form projective point p a number of times.
  *
  * r Result of repeated doubling of point.
@@ -10006,9 +9992,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
         sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
         /* X = A^2 - 2B */
         sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_avx2_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
         /* t1 = Y^4 */
@@ -10034,9 +10020,9 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
         sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
         /* X = A^2 - 2B */
         sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_avx2_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
         /* t1 = Y^4 */
@@ -10105,9 +10091,8 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
         sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
         sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
         /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_avx2_4(y, y, x, p256_mod);
+        sp_256_mont_rsb_sub_dbl_avx2_4(x, x, y, p256_mod);
         sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
         {
@@ -10283,12 +10268,11 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_rsb_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
@@ -10387,9 +10371,9 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
         x = r[j].x;
         /* X = A^2 - 2B */
         sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
-        sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
+        sp_256_mont_rsb_sub_dbl_avx2_4(x, x, b, p256_mod);
         /* B = 2.(B - X) */
-        sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
+        sp_256_mont_dbl_avx2_4(b, b, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
         z = r[j].z;
@@ -10689,9 +10673,8 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
         sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod);
         sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(t2, t2, t1, p256_mod);
-        sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod);
+        sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod);
         /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
-        sp_256_mont_sub_4(t3, t3, x, p256_mod);
         sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
         sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, t3, t1, p256_mod);
@@ -11178,9 +11161,8 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
         sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod);
         sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
-        sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod);
+        sp_256_mont_rsb_sub_dbl_avx2_4(x, t2, t3, p256_mod);
         /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
-        sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod);
         sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
         sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod);
@@ -55378,11 +55378,12 @@ _sp_256_mul_avx2_4:
         pushq %rbx
         movq %rdx, %rbp
         movq (%rsi), %rdx
+        movq 8(%rbp), %r14
         # A[0] * B[0]
         mulxq (%rbp), %r8, %r9
         xorq %rbx, %rbx
         # A[0] * B[1]
-        mulxq 8(%rbp), %rax, %r10
+        mulxq %r14, %rax, %r10
         adcxq %rax, %r9
         # A[0] * B[2]
         mulxq 16(%rbp), %rax, %r11
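This hunk (and its repeats below, including the mont_mul/mul_order AVX2 variants) hoists B[1] into %r14 once, so the three partial products against B[1] read a register instead of re-loading 8(%rbp) each time. Roughly, in C (illustrative sketch; u128 models the 64x64->128 mulx product):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* b[1] participates in three partial products (A[0..2] * B[1]); loading
     * it into a local once mirrors `movq 8(%rbp), %r14` feeding the three
     * `mulxq %r14, ...` instructions. */
    static void partial_products_b1(const uint64_t a[4], const uint64_t b[4],
                                    u128 t[3])
    {
        uint64_t b1 = b[1];       /* movq 8(%rbp), %r14 */
        t[0] = (u128)a[0] * b1;   /* mulxq %r14, ...    */
        t[1] = (u128)a[1] * b1;
        t[2] = (u128)a[2] * b1;
    }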
@@ -55397,7 +55398,7 @@ _sp_256_mul_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r9
         # A[1] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r10
         adcxq %rax, %r10
         # A[1] * B[2]
@@ -55416,7 +55417,7 @@ _sp_256_mul_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r10
         # A[2] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r11
         adcxq %rax, %r11
         # A[2] * B[2]
@@ -55981,11 +55982,10 @@ _sp_256_mont_mul_4:
         adcq $0x00, %rbx
         sbbq $0x00, %r9
         movq $0xffffffff00000001, %rsi
-        movq %r9, %rax
         # mask m and sub from result if overflow
         # m[0] = -1 & mask = mask
-        shrq $32, %rax
         # m[2] = 0 & mask = 0
+        movl %r9d, %eax
         andq %r9, %rsi
         subq %r9, %r13
         sbbq %rax, %r14
@@ -56163,11 +56163,10 @@ _sp_256_mont_sqr_4:
         adcq $0x00, %r15
         sbbq $0x00, %r8
         movq $0xffffffff00000001, %rsi
-        movq %r8, %rax
         # mask m and sub from result if overflow
         # m[0] = -1 & mask = mask
-        shrq $32, %rax
         # m[2] = 0 & mask = 0
+        movl %r8d, %eax
         andq %r8, %rsi
         subq %r8, %r12
         sbbq %rax, %r13
@@ -56388,11 +56387,10 @@ _sp_256_mont_reduce_4:
         adcq $0x00, %r15
         sbbq $0x00, %r8
         movq $0xffffffff00000001, %rbx
-        movq %r8, %rax
         # mask m and sub from result if overflow
         # m[0] = -1 & mask = mask
-        shrq $32, %rax
         # m[2] = 0 & mask = 0
+        movl %r8d, %eax
         andq %r8, %rbx
         subq %r8, %r12
         sbbq %rax, %r13
@@ -56543,13 +56541,12 @@ _sp_256_mont_add_4:
         movq 16(%rsi), %r8
         movq 24(%rsi), %r9
         addq (%rdx), %rax
-        movq $0xffffffff, %r10
         adcq 8(%rdx), %rcx
         movq $0xffffffff00000001, %r11
         adcq 16(%rdx), %r8
         adcq 24(%rdx), %r9
         sbbq %rsi, %rsi
-        andq %rsi, %r10
+        movl %esi, %r10d
         andq %rsi, %r11
         subq %rsi, %rax
         sbbq %r10, %rcx
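The mont_mul/sqr/reduce/add hunks above all share one micro-optimization: the masked modulus word 0xffffffff was previously materialized with a movq (or derived with shrq $32) and then combined with the borrow mask via andq; since writing a 32-bit register zero-extends, the low half of an all-ones-or-zero mask is already exactly 0xffffffff or 0, so a single movl suffices. In C terms (illustrative only):

    #include <stdint.h>

    /* mask is 0 or ~0 (all ones), e.g. from sbb of a register with itself.
     * Old:  movq $0xffffffff, %r10 ; andq %rsi, %r10   (or movq + shrq $32)
     * New:  movl %esi, %r10d  -- one instruction, same value. */
    static uint64_t p256_word1_masked(uint64_t mask)
    {
        return (uint32_t)mask; /* 0xffffffff when mask is all ones, else 0 */
    }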
@@ -56593,13 +56590,13 @@ _sp_256_mont_dbl_4:
         movq 16(%rsi), %rcx
         movq 24(%rsi), %r8
         addq %rdx, %rdx
-        movq $0xffffffff, %r9
         adcq %rax, %rax
         movq $0xffffffff00000001, %r10
         adcq %rcx, %rcx
+        movq %r8, %r11
         adcq %r8, %r8
-        sbbq %r11, %r11
-        andq %r11, %r9
+        sarq $63, %r11
+        movl %r11d, %r9d
         andq %r11, %r10
         subq %r11, %rdx
         sbbq %r9, %rax
@@ -56643,13 +56640,12 @@ _sp_256_mont_tpl_4:
         movq 16(%rsi), %rcx
         movq 24(%rsi), %r8
         addq %rdx, %rdx
-        movq $0xffffffff, %r9
         adcq %rax, %rax
         movq $0xffffffff00000001, %r10
         adcq %rcx, %rcx
         adcq %r8, %r8
         sbbq %r11, %r11
-        andq %r11, %r9
+        movl %r11d, %r9d
         andq %r11, %r10
         subq %r11, %rdx
         sbbq %r9, %rax
@@ -56663,13 +56659,12 @@ _sp_256_mont_tpl_4:
         sbbq $0x00, %rcx
         sbbq %r10, %r8
         addq (%rsi), %rdx
-        movq $0xffffffff, %r9
         adcq 8(%rsi), %rax
         movq $0xffffffff00000001, %r10
         adcq 16(%rsi), %rcx
         adcq 24(%rsi), %r8
-        sbbq %r11, %r11
-        andq %r11, %r9
+        sbbq $0x00, %r11
+        movl %r11d, %r9d
         andq %r11, %r10
         subq %r11, %rdx
         sbbq %r9, %rax
@@ -56714,13 +56709,12 @@ _sp_256_mont_sub_4:
         movq 16(%rsi), %r8
         movq 24(%rsi), %r9
         subq (%rdx), %rax
-        movq $0xffffffff, %r10
         sbbq 8(%rdx), %rcx
         movq $0xffffffff00000001, %r11
         sbbq 16(%rdx), %r8
         sbbq 24(%rdx), %r9
         sbbq %rsi, %rsi
-        andq %rsi, %r10
+        movl %esi, %r10d
         andq %rsi, %r11
         addq %rsi, %rax
         adcq %r10, %rcx
@@ -56797,15 +56791,15 @@ _sp_256_div2_4:
  */
 #ifndef __APPLE__
 .text
-.globl sp_256_mont_sub_dbl_4
-.type sp_256_mont_sub_dbl_4,@function
+.globl sp_256_mont_rsb_sub_dbl_4
+.type sp_256_mont_rsb_sub_dbl_4,@function
 .align 16
-sp_256_mont_sub_dbl_4:
+sp_256_mont_rsb_sub_dbl_4:
 #else
 .section __TEXT,__text
-.globl _sp_256_mont_sub_dbl_4
+.globl _sp_256_mont_rsb_sub_dbl_4
 .p2align 4
-_sp_256_mont_sub_dbl_4:
+_sp_256_mont_rsb_sub_dbl_4:
 #endif /* __APPLE__ */
         pushq %r12
         pushq %r13
@@ -56820,42 +56814,40 @@ _sp_256_mont_sub_dbl_4:
         movq 16(%rdx), %r12
         movq 24(%rdx), %r13
         addq %r10, %r10
-        movq $0xffffffff, %r14
         adcq %r11, %r11
         movq $0xffffffff00000001, %r15
         adcq %r12, %r12
         adcq %r13, %r13
-        sbbq %rdx, %rdx
-        andq %rdx, %r14
-        andq %rdx, %r15
-        subq %rdx, %r10
+        sbbq %rsi, %rsi
+        movl %esi, %r14d
+        andq %rsi, %r15
+        subq %rsi, %r10
         sbbq %r14, %r11
         sbbq $0x00, %r12
         sbbq %r15, %r13
-        adcq $0x00, %rdx
-        andq %rdx, %r14
-        andq %rdx, %r15
-        subq %rdx, %r10
+        adcq $0x00, %rsi
+        andq %rsi, %r14
+        andq %rsi, %r15
+        subq %rsi, %r10
         sbbq %r14, %r11
         sbbq $0x00, %r12
         sbbq %r15, %r13
         subq %r10, %rax
-        movq $0xffffffff, %r14
        sbbq %r11, %rcx
         movq $0xffffffff00000001, %r15
         sbbq %r12, %r8
         sbbq %r13, %r9
-        sbbq %rdx, %rdx
-        andq %rdx, %r14
-        andq %rdx, %r15
-        addq %rdx, %rax
+        sbbq $0x00, %rsi
+        movl %esi, %r14d
+        andq %rsi, %r15
+        addq %rsi, %rax
         adcq %r14, %rcx
         adcq $0x00, %r8
         adcq %r15, %r9
-        adcq $0x00, %rdx
-        andq %rdx, %r14
-        andq %rdx, %r15
-        addq %rdx, %rax
+        adcq $0x00, %rsi
+        andq %rsi, %r14
+        andq %rsi, %r15
+        addq %rsi, %rax
         adcq %r14, %rcx
         movq %rax, (%rdi)
         adcq $0x00, %r8
@@ -56863,73 +56855,40 @@ _sp_256_mont_sub_dbl_4:
         adcq %r15, %r9
         movq %r8, 16(%rdi)
         movq %r9, 24(%rdi)
+        movq (%rdx), %r10
+        movq 8(%rdx), %r11
+        movq 16(%rdx), %r12
+        movq 24(%rdx), %r13
+        subq %rax, %r10
+        sbbq %rcx, %r11
+        movq $0xffffffff00000001, %r15
+        sbbq %r8, %r12
+        sbbq %r9, %r13
+        sbbq %rsi, %rsi
+        movl %esi, %r14d
+        andq %rsi, %r15
+        addq %rsi, %r10
+        adcq %r14, %r11
+        adcq $0x00, %r12
+        adcq %r15, %r13
+        adcq $0x00, %rsi
+        andq %rsi, %r14
+        andq %rsi, %r15
+        addq %rsi, %r10
+        adcq %r14, %r11
+        movq %r10, (%rdx)
+        adcq $0x00, %r12
+        movq %r11, 8(%rdx)
+        adcq %r15, %r13
+        movq %r12, 16(%rdx)
+        movq %r13, 24(%rdx)
         popq %r15
         popq %r14
         popq %r13
         popq %r12
         repz retq
 #ifndef __APPLE__
-.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4
-#endif /* __APPLE__ */
-/* Two Montgomery numbers, subtract second from first and double.
- * (r = 2.(a - b) % m).
- *
- * b must have came from a mont_sub operation.
- *
- * r Result of subtration.
- * a Number to subtract from in Montgomery form.
- * b Number to subtract with in Montgomery form.
- * m Modulus (prime).
- */
-#ifndef __APPLE__
-.text
-.globl sp_256_mont_dbl_sub_4
-.type sp_256_mont_dbl_sub_4,@function
-.align 16
-sp_256_mont_dbl_sub_4:
-#else
-.section __TEXT,__text
-.globl _sp_256_mont_dbl_sub_4
-.p2align 4
-_sp_256_mont_dbl_sub_4:
-#endif /* __APPLE__ */
-        movq (%rsi), %rax
-        movq 8(%rsi), %rcx
-        movq 16(%rsi), %r8
-        movq 24(%rsi), %r9
-        subq (%rdx), %rax
-        movq $0xffffffff, %r10
-        sbbq 8(%rdx), %rcx
-        movq $0xffffffff00000001, %r11
-        sbbq 16(%rdx), %r8
-        sbbq 24(%rdx), %r9
-        sbbq %rdx, %rdx
-        andq %rdx, %r10
-        andq %rdx, %r11
-        addq %rdx, %rax
-        adcq %r10, %rcx
-        adcq $0x00, %r8
-        adcq %r11, %r9
-        addq %rax, %rax
-        movq $0xffffffff, %r10
-        adcq %rcx, %rcx
-        movq $0xffffffff00000001, %r11
-        adcq %r8, %r8
-        adcq %r9, %r9
-        sbbq %rdx, %rdx
-        andq %rdx, %r10
-        andq %rdx, %r11
-        subq %rdx, %rax
-        sbbq %r10, %rcx
-        movq %rax, (%rdi)
-        sbbq $0x00, %r8
-        movq %rcx, 8(%rdi)
-        sbbq %r11, %r9
-        movq %r8, 16(%rdi)
-        movq %r9, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4
+.size sp_256_mont_rsb_sub_dbl_4,.-sp_256_mont_rsb_sub_dbl_4
 #endif /* __APPLE__ */
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
@@ -57085,11 +57044,12 @@ _sp_256_mont_mul_avx2_4:
         pushq %rbx
         movq %rdx, %rbp
         movq (%rsi), %rdx
+        movq 8(%rbp), %r14
         # A[0] * B[0]
         mulxq (%rbp), %r8, %r9
         xorq %rbx, %rbx
         # A[0] * B[1]
-        mulxq 8(%rbp), %rax, %r10
+        mulxq %r14, %rax, %r10
         adcxq %rax, %r9
         # A[0] * B[2]
         mulxq 16(%rbp), %rax, %r11
@@ -57104,7 +57064,7 @@ _sp_256_mont_mul_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r9
         # A[1] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r10
         adcxq %rax, %r10
         # A[1] * B[2]
@@ -57123,7 +57083,7 @@ _sp_256_mont_mul_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r10
         # A[2] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r11
         adcxq %rax, %r11
         # A[2] * B[2]
@@ -57213,11 +57173,10 @@ _sp_256_mont_mul_avx2_4:
         adcq $0x00, %r15
         sbbq $0x00, %r8
         movq $0xffffffff00000001, %rsi
-        movq %r8, %rax
         # mask m and sub from result if overflow
         # m[0] = -1 & mask = mask
-        shrq $32, %rax
         # m[2] = 0 & mask = 0
+        movl %r8d, %eax
         andq %r8, %rsi
         subq %r8, %r12
         sbbq %rax, %r13
@@ -57378,11 +57337,10 @@ _sp_256_mont_sqr_avx2_4:
         adcq $0x00, %r15
         sbbq $0x00, %r8
         movq $0xffffffff00000001, %rsi
-        movq %r8, %rax
         # mask m and sub from result if overflow
         # m[0] = -1 & mask = mask
-        shrq $32, %rax
         # m[2] = 0 & mask = 0
+        movl %r8d, %eax
         andq %r8, %rsi
         subq %r8, %r12
         sbbq %rax, %r13
@@ -58352,11 +58310,12 @@ _sp_256_mont_mul_order_avx2_4:
         pushq %rbx
         movq %rdx, %rbp
         movq (%rsi), %rdx
+        movq 8(%rbp), %r14
         # A[0] * B[0]
         mulxq (%rbp), %r8, %r9
         xorq %rbx, %rbx
         # A[0] * B[1]
-        mulxq 8(%rbp), %rax, %r10
+        mulxq %r14, %rax, %r10
         adcxq %rax, %r9
         # A[0] * B[2]
         mulxq 16(%rbp), %rax, %r11
@@ -58371,7 +58330,7 @@ _sp_256_mont_mul_order_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r9
         # A[1] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r10
         adcxq %rax, %r10
         # A[1] * B[2]
@@ -58390,7 +58349,7 @@ _sp_256_mont_mul_order_avx2_4:
         xorq %rbx, %rbx
         adcxq %rax, %r10
         # A[2] * B[1]
-        mulxq 8(%rbp), %rax, %r15
+        mulxq %r14, %rax, %r15
         adoxq %rcx, %r11
         adcxq %rax, %r11
         # A[2] * B[2]
@@ -60601,11 +60560,10 @@ _sp_384_mont_reduce_6:
         # Subtract mod if carry
         negq %r10
         movq $0xfffffffffffffffe, %r9
-        movq %r10, %rcx
+        movl %r10d, %ecx
         movq %r10, %r8
-        shrq $32, %rcx
-        shlq $32, %r8
         andq %r10, %r9
+        shlq $32, %r8
         subq %rcx, %rbx
         sbbq %r8, %rbp
         sbbq %r9, %r11
@@ -60851,7 +60809,6 @@ _sp_384_mont_add_6:
         movq 32(%rsi), %r10
         movq 40(%rsi), %r11
         addq (%rdx), %rax
-        movq $0xffffffff, %r12
         adcq 8(%rdx), %rcx
         movq $0xffffffff00000000, %r13
         adcq 16(%rdx), %r8
@@ -60860,7 +60817,7 @@ _sp_384_mont_add_6:
         adcq 32(%rdx), %r10
         adcq 40(%rdx), %r11
         sbbq %rsi, %rsi
-        andq %rsi, %r12
+        movl %esi, %r12d
         andq %rsi, %r13
         andq %rsi, %r14
         subq %r12, %rax
@@ -60920,16 +60877,16 @@ _sp_384_mont_dbl_6:
         movq 32(%rsi), %r9
         movq 40(%rsi), %r10
         addq %rdx, %rdx
-        movq $0xffffffff, %r11
         adcq %rax, %rax
         movq $0xffffffff00000000, %r12
         adcq %rcx, %rcx
         movq $0xfffffffffffffffe, %r13
         adcq %r8, %r8
         adcq %r9, %r9
+        movq %r10, %r14
         adcq %r10, %r10
-        sbbq %r14, %r14
-        andq %r14, %r11
+        sarq $63, %r14
+        movl %r14d, %r11d
         andq %r14, %r12
         andq %r14, %r13
         subq %r11, %rdx
@@ -60989,7 +60946,6 @@ _sp_384_mont_tpl_6:
         movq 32(%rsi), %r9
         movq 40(%rsi), %r10
         addq %rdx, %rdx
-        movq $0xffffffff, %r11
         adcq %rax, %rax
         movq $0xffffffff00000000, %r12
         adcq %rcx, %rcx
@@ -60998,7 +60954,7 @@ _sp_384_mont_tpl_6:
         adcq %r9, %r9
         adcq %r10, %r10
         sbbq %r14, %r14
-        andq %r14, %r11
+        movl %r14d, %r11d
         andq %r14, %r12
         andq %r14, %r13
         subq %r11, %rdx
@@ -61019,7 +60975,6 @@ _sp_384_mont_tpl_6:
         sbbq %r14, %r9
         sbbq %r14, %r10
         addq (%rsi), %rdx
-        movq $0xffffffff, %r11
         adcq 8(%rsi), %rax
         movq $0xffffffff00000000, %r12
         adcq 16(%rsi), %rcx
@@ -61028,7 +60983,7 @@ _sp_384_mont_tpl_6:
         adcq 32(%rsi), %r9
         adcq 40(%rsi), %r10
         sbbq %r14, %r14
-        andq %r14, %r11
+        movl %r14d, %r11d
         andq %r14, %r12
         andq %r14, %r13
         subq %r11, %rdx
@@ -61089,7 +61044,6 @@ _sp_384_mont_sub_6:
         movq 32(%rsi), %r10
         movq 40(%rsi), %r11
         subq (%rdx), %rax
-        movq $0xffffffff, %r12
         sbbq 8(%rdx), %rcx
         movq $0xffffffff00000000, %r13
         sbbq 16(%rdx), %r8
@@ -61098,7 +61052,7 @@ _sp_384_mont_sub_6:
         sbbq 32(%rdx), %r10
         sbbq 40(%rdx), %r11
         sbbq %rsi, %rsi
-        andq %rsi, %r12
+        movl %esi, %r12d
         andq %rsi, %r13
         andq %rsi, %r14
         addq %r12, %rax
@@ -54329,11 +54329,12 @@ sp_256_mul_avx2_4 PROC
         mov rbp, r8
         mov rax, rdx
         mov rdx, QWORD PTR [rax]
+        mov r14, QWORD PTR [rbp+8]
         ; A[0] * B[0]
         mulx r9, r8, QWORD PTR [rbp]
         xor rbx, rbx
         ; A[0] * B[1]
-        mulx r10, rdi, QWORD PTR [rbp+8]
+        mulx r10, rdi, r14
         adcx r9, rdi
         ; A[0] * B[2]
         mulx r11, rdi, QWORD PTR [rbp+16]
@@ -54348,7 +54349,7 @@ sp_256_mul_avx2_4 PROC
         xor rbx, rbx
         adcx r9, rdi
         ; A[1] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r10, rsi
         adcx r10, rdi
         ; A[1] * B[2]
@@ -54367,7 +54368,7 @@ sp_256_mul_avx2_4 PROC
         xor rbx, rbx
         adcx r10, rdi
         ; A[2] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r11, rsi
         adcx r11, rdi
         ; A[2] * B[2]
@@ -54884,11 +54885,10 @@ sp_256_mont_mul_4 PROC
         adc rbx, 0
         sbb r11, 0
         mov r10, 18446744069414584321
-        mov rax, r11
         ; mask m and sub from result if overflow
         ; m[0] = -1 & mask = mask
-        shr rax, 32
         ; m[2] = 0 & mask = 0
+        mov eax, r11d
         and r10, r11
         sub r15, r11
         sbb rdi, rax
@@ -55060,11 +55060,10 @@ sp_256_mont_sqr_4 PROC
         adc rsi, 0
         sbb r10, 0
         mov r8, 18446744069414584321
-        mov rax, r10
         ; mask m and sub from result if overflow
         ; m[0] = -1 & mask = mask
-        shr rax, 32
         ; m[2] = 0 & mask = 0
+        mov eax, r10d
         and r8, r10
         sub r14, r10
         sbb r15, rax
@@ -55263,11 +55262,10 @@ sp_256_mont_reduce_4 PROC
         adc rdi, 0
         sbb r9, 0
         mov rbx, 18446744069414584321
-        mov rax, r9
         ; mask m and sub from result if overflow
         ; m[0] = -1 & mask = mask
-        shr rax, 32
         ; m[2] = 0 & mask = 0
+        mov eax, r9d
         and rbx, r9
         sub r13, r9
         sbb r14, rax
@@ -55404,13 +55402,12 @@ sp_256_mont_add_4 PROC
         mov r10, QWORD PTR [rdx+16]
         mov r11, QWORD PTR [rdx+24]
         add rax, QWORD PTR [r8]
-        mov r12, 4294967295
         adc r9, QWORD PTR [r8+8]
         mov r13, 18446744069414584321
         adc r10, QWORD PTR [r8+16]
         adc r11, QWORD PTR [r8+24]
         sbb rdx, rdx
-        and r12, rdx
+        mov r12d, edx
         and r13, rdx
         sub rax, rdx
         sbb r9, r12
@@ -55447,13 +55444,13 @@ sp_256_mont_dbl_4 PROC
         mov r9, QWORD PTR [rdx+16]
         mov r10, QWORD PTR [rdx+24]
         add rax, rax
-        mov r11, 4294967295
         adc r8, r8
         mov r12, 18446744069414584321
         adc r9, r9
+        mov r13, r10
         adc r10, r10
-        sbb r13, r13
-        and r11, r13
+        sar r13, 63
+        mov r11d, r13d
         and r12, r13
         sub rax, r13
         sbb r8, r11
@@ -55490,13 +55487,12 @@ sp_256_mont_tpl_4 PROC
         mov r9, QWORD PTR [rdx+16]
         mov r10, QWORD PTR [rdx+24]
         add rax, rax
-        mov r11, 4294967295
         adc r8, r8
         mov r12, 18446744069414584321
         adc r9, r9
         adc r10, r10
         sbb r13, r13
-        and r11, r13
+        mov r11d, r13d
         and r12, r13
         sub rax, r13
         sbb r8, r11
@@ -55510,13 +55506,12 @@ sp_256_mont_tpl_4 PROC
         sbb r9, 0
         sbb r10, r12
         add rax, QWORD PTR [rdx]
-        mov r11, 4294967295
         adc r8, QWORD PTR [rdx+8]
         mov r12, 18446744069414584321
         adc r9, QWORD PTR [rdx+16]
         adc r10, QWORD PTR [rdx+24]
-        sbb r13, r13
-        and r11, r13
+        sbb r13, 0
+        mov r11d, r13d
         and r12, r13
         sub rax, r13
         sbb r8, r11
@@ -55554,13 +55549,12 @@ sp_256_mont_sub_4 PROC
         mov r10, QWORD PTR [rdx+16]
         mov r11, QWORD PTR [rdx+24]
         sub rax, QWORD PTR [r8]
-        mov r12, 4294967295
         sbb r9, QWORD PTR [r8+8]
         mov r13, 18446744069414584321
         sbb r10, QWORD PTR [r8+16]
         sbb r11, QWORD PTR [r8+24]
         sbb rdx, rdx
-        and r12, rdx
+        mov r12d, edx
         and r13, rdx
         add rax, rdx
         adc r9, r12
@@ -55630,7 +55624,7 @@ _text ENDS
 ; * m Modulus (prime).
 ; */
 _text SEGMENT READONLY PARA
-sp_256_mont_sub_dbl_4 PROC
+sp_256_mont_rsb_sub_dbl_4 PROC
         push r12
         push r13
         push r14
@@ -55646,42 +55640,40 @@ sp_256_mont_sub_dbl_4 PROC
         mov r14, QWORD PTR [r8+16]
         mov r15, QWORD PTR [r8+24]
         add r12, r12
-        mov rdi, 4294967295
         adc r13, r13
         mov rsi, 18446744069414584321
         adc r14, r14
         adc r15, r15
-        sbb r8, r8
-        and rdi, r8
-        and rsi, r8
-        sub r12, r8
+        sbb rdx, rdx
+        mov edi, edx
+        and rsi, rdx
+        sub r12, rdx
         sbb r13, rdi
         sbb r14, 0
         sbb r15, rsi
-        adc r8, 0
-        and rdi, r8
-        and rsi, r8
-        sub r12, r8
+        adc rdx, 0
+        and rdi, rdx
+        and rsi, rdx
+        sub r12, rdx
         sbb r13, rdi
         sbb r14, 0
         sbb r15, rsi
         sub rax, r12
-        mov rdi, 4294967295
         sbb r9, r13
         mov rsi, 18446744069414584321
         sbb r10, r14
         sbb r11, r15
-        sbb r8, r8
-        and rdi, r8
-        and rsi, r8
-        add rax, r8
+        sbb rdx, 0
+        mov edi, edx
+        and rsi, rdx
+        add rax, rdx
         adc r9, rdi
         adc r10, 0
         adc r11, rsi
-        adc r8, 0
-        and rdi, r8
-        and rsi, r8
-        add rax, r8
+        adc rdx, 0
+        and rdi, rdx
+        and rsi, rdx
+        add rax, rdx
         adc r9, rdi
         mov QWORD PTR [rcx], rax
         adc r10, 0
@@ -55689,6 +55681,33 @@ sp_256_mont_sub_dbl_4 PROC
         adc r11, rsi
         mov QWORD PTR [rcx+16], r10
         mov QWORD PTR [rcx+24], r11
+        mov r12, QWORD PTR [r8]
+        mov r13, QWORD PTR [r8+8]
+        mov r14, QWORD PTR [r8+16]
+        mov r15, QWORD PTR [r8+24]
+        sub r12, rax
+        sbb r13, r9
+        mov rsi, 18446744069414584321
+        sbb r14, r10
+        sbb r15, r11
+        sbb rdx, rdx
+        mov edi, edx
+        and rsi, rdx
+        add r12, rdx
+        adc r13, rdi
+        adc r14, 0
+        adc r15, rsi
+        adc rdx, 0
+        and rdi, rdx
+        and rsi, rdx
+        add r12, rdx
+        adc r13, rdi
+        mov QWORD PTR [r8], r12
+        adc r14, 0
+        mov QWORD PTR [r8+8], r13
+        adc r15, rsi
+        mov QWORD PTR [r8+16], r14
+        mov QWORD PTR [r8+24], r15
         pop rsi
         pop rdi
         pop r15
@@ -55696,60 +55715,7 @@ sp_256_mont_sub_dbl_4 PROC
         pop r13
         pop r12
         ret
-sp_256_mont_sub_dbl_4 ENDP
-_text ENDS
-; /* Two Montgomery numbers, subtract second from first and double.
-; * (r = 2.(a - b) % m).
-; *
-; * b must have came from a mont_sub operation.
-; *
-; * r Result of subtration.
-; * a Number to subtract from in Montgomery form.
-; * b Number to subtract with in Montgomery form.
-; * m Modulus (prime).
-; */
-_text SEGMENT READONLY PARA
-sp_256_mont_dbl_sub_4 PROC
-        push r12
-        push r13
-        mov rax, QWORD PTR [rdx]
-        mov r9, QWORD PTR [rdx+8]
-        mov r10, QWORD PTR [rdx+16]
-        mov r11, QWORD PTR [rdx+24]
-        sub rax, QWORD PTR [r8]
-        mov r12, 4294967295
-        sbb r9, QWORD PTR [r8+8]
-        mov r13, 18446744069414584321
-        sbb r10, QWORD PTR [r8+16]
-        sbb r11, QWORD PTR [r8+24]
-        sbb r8, r8
-        and r12, r8
-        and r13, r8
-        add rax, r8
-        adc r9, r12
-        adc r10, 0
-        adc r11, r13
-        add rax, rax
-        mov r12, 4294967295
-        adc r9, r9
-        mov r13, 18446744069414584321
-        adc r10, r10
-        adc r11, r11
-        sbb r8, r8
-        and r12, r8
-        and r13, r8
-        sub rax, r8
-        sbb r9, r12
-        mov QWORD PTR [rcx], rax
-        sbb r10, 0
-        mov QWORD PTR [rcx+8], r9
-        sbb r11, r13
-        mov QWORD PTR [rcx+16], r10
-        mov QWORD PTR [rcx+24], r11
-        pop r13
-        pop r12
-        ret
-sp_256_mont_dbl_sub_4 ENDP
+sp_256_mont_rsb_sub_dbl_4 ENDP
 _text ENDS
 IFNDEF WC_NO_CACHE_RESISTANT
 ; /* Touch each possible point that could be being copied.
@@ -55908,11 +55874,12 @@ sp_256_mont_mul_avx2_4 PROC
         mov rbp, r8
         mov rax, rdx
         mov rdx, QWORD PTR [rax]
+        mov r14, QWORD PTR [rbp+8]
         ; A[0] * B[0]
         mulx r9, r8, QWORD PTR [rbp]
         xor rbx, rbx
         ; A[0] * B[1]
-        mulx r10, rdi, QWORD PTR [rbp+8]
+        mulx r10, rdi, r14
         adcx r9, rdi
         ; A[0] * B[2]
         mulx r11, rdi, QWORD PTR [rbp+16]
@@ -55927,7 +55894,7 @@ sp_256_mont_mul_avx2_4 PROC
         xor rbx, rbx
         adcx r9, rdi
         ; A[1] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r10, rsi
         adcx r10, rdi
         ; A[1] * B[2]
@@ -55946,7 +55913,7 @@ sp_256_mont_mul_avx2_4 PROC
         xor rbx, rbx
         adcx r10, rdi
         ; A[2] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r11, rsi
         adcx r11, rdi
         ; A[2] * B[2]
@@ -56036,11 +56003,10 @@ sp_256_mont_mul_avx2_4 PROC
         adc r15, 0
         sbb r8, 0
         mov rax, 18446744069414584321
-        mov rdi, r8
         ; mask m and sub from result if overflow
         ; m[0] = -1 & mask = mask
-        shr rdi, 32
         ; m[2] = 0 & mask = 0
+        mov edi, r8d
         and rax, r8
         sub r12, r8
         sbb r13, rdi
@@ -56195,11 +56161,10 @@ sp_256_mont_sqr_avx2_4 PROC
         adc r15, 0
         sbb r8, 0
         mov rax, 18446744069414584321
-        mov rdi, r8
         ; mask m and sub from result if overflow
         ; m[0] = -1 & mask = mask
-        shr rdi, 32
         ; m[2] = 0 & mask = 0
+        mov edi, r8d
         and rax, r8
         sub r12, r8
         sbb r13, rdi
@@ -57053,11 +57018,12 @@ sp_256_mont_mul_order_avx2_4 PROC
         mov rbp, r8
         mov rax, rdx
         mov rdx, QWORD PTR [rax]
+        mov r14, QWORD PTR [rbp+8]
         ; A[0] * B[0]
         mulx r9, r8, QWORD PTR [rbp]
         xor rbx, rbx
         ; A[0] * B[1]
-        mulx r10, rdi, QWORD PTR [rbp+8]
+        mulx r10, rdi, r14
         adcx r9, rdi
         ; A[0] * B[2]
         mulx r11, rdi, QWORD PTR [rbp+16]
@@ -57072,7 +57038,7 @@ sp_256_mont_mul_order_avx2_4 PROC
         xor rbx, rbx
         adcx r9, rdi
         ; A[1] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r10, rsi
         adcx r10, rdi
         ; A[1] * B[2]
@@ -57091,7 +57057,7 @@ sp_256_mont_mul_order_avx2_4 PROC
         xor rbx, rbx
         adcx r10, rdi
         ; A[2] * B[1]
-        mulx r15, rdi, QWORD PTR [rbp+8]
+        mulx r15, rdi, r14
         adox r11, rsi
         adcx r11, rdi
         ; A[2] * B[2]
@@ -59213,11 +59179,10 @@ sp_384_mont_reduce_6 PROC
         ; Subtract mod if carry
         neg r11
         mov r10, 18446744073709551614
-        mov r8, r11
+        mov r8d, r11d
         mov r9, r11
-        shr r8, 32
-        shl r9, 32
         and r10, r11
+        shl r9, 32
         sub rbx, r8
         sbb rbp, r9
         sbb r12, r10
@@ -59436,7 +59401,6 @@ sp_384_mont_add_6 PROC
         mov r12, QWORD PTR [rdx+32]
         mov r13, QWORD PTR [rdx+40]
         add rax, QWORD PTR [r8]
-        mov r14, 4294967295
         adc r9, QWORD PTR [r8+8]
         mov r15, 18446744069414584320
         adc r10, QWORD PTR [r8+16]
@@ -59445,7 +59409,7 @@ sp_384_mont_add_6 PROC
         adc r12, QWORD PTR [r8+32]
         adc r13, QWORD PTR [r8+40]
         sbb rdx, rdx
-        and r14, rdx
+        mov r14d, edx
         and r15, rdx
         and rdi, rdx
         sub rax, r14
@@ -59498,16 +59462,16 @@ sp_384_mont_dbl_6 PROC
         mov r11, QWORD PTR [rdx+32]
         mov r12, QWORD PTR [rdx+40]
         add rax, rax
-        mov r13, 4294967295
         adc r8, r8
         mov r14, 18446744069414584320
         adc r9, r9
         mov r15, 18446744073709551614
         adc r10, r10
         adc r11, r11
+        mov rdi, r12
         adc r12, r12
-        sbb rdi, rdi
-        and r13, rdi
+        sar rdi, 63
+        mov r13d, edi
         and r14, rdi
         and r15, rdi
         sub rax, r13
@@ -59560,7 +59524,6 @@ sp_384_mont_tpl_6 PROC
         mov r11, QWORD PTR [rdx+32]
         mov r12, QWORD PTR [rdx+40]
         add rax, rax
-        mov r13, 4294967295
         adc r8, r8
         mov r14, 18446744069414584320
         adc r9, r9
@@ -59569,7 +59532,7 @@ sp_384_mont_tpl_6 PROC
         adc r11, r11
         adc r12, r12
         sbb rdi, rdi
-        and r13, rdi
+        mov r13d, edi
         and r14, rdi
         and r15, rdi
         sub rax, r13
@@ -59590,7 +59553,6 @@ sp_384_mont_tpl_6 PROC
         sbb r11, rdi
         sbb r12, rdi
         add rax, QWORD PTR [rdx]
-        mov r13, 4294967295
         adc r8, QWORD PTR [rdx+8]
         mov r14, 18446744069414584320
         adc r9, QWORD PTR [rdx+16]
@@ -59599,7 +59561,7 @@ sp_384_mont_tpl_6 PROC
         adc r11, QWORD PTR [rdx+32]
         adc r12, QWORD PTR [rdx+40]
         sbb rdi, rdi
-        and r13, rdi
+        mov r13d, edi
         and r14, rdi
         and r15, rdi
         sub rax, r13
@@ -59653,7 +59615,6 @@ sp_384_mont_sub_6 PROC
         mov r12, QWORD PTR [rdx+32]
         mov r13, QWORD PTR [rdx+40]
         sub rax, QWORD PTR [r8]
-        mov r14, 4294967295
         sbb r9, QWORD PTR [r8+8]
         mov r15, 18446744069414584320
         sbb r10, QWORD PTR [r8+16]
@@ -59662,7 +59623,7 @@ sp_384_mont_sub_6 PROC
         sbb r12, QWORD PTR [r8+32]
         sbb r13, QWORD PTR [r8+40]
         sbb rdx, rdx
-        and r14, rdx
+        mov r14d, edx
         and r15, rdx
         and rdi, rdx
         add rax, r14