diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 8bb7a9652..17ed8e78f 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -5837,12 +5837,12 @@ void bench_eccMakeKey(int doAsync, int curveId) bench_stats_start(&count, &start); do { /* while free pending slots in queue, submit ops */ - for (times = 0; times < genTimes || pending > 0; ) { + for (times = 0; times < agreeTimes || pending > 0; ) { bench_async_poll(&pending); for (i = 0; i < BENCH_MAX_PENDING; i++) { if (bench_async_check(&ret, BENCH_ASYNC_GET_DEV(&genKey[i]), 0, - &times, genTimes, &pending)) { + &times, agreeTimes, &pending)) { wc_ecc_free(&genKey[i]); ret = wc_ecc_init_ex(&genKey[i], HEAP_HINT, deviceID); diff --git a/wolfcrypt/src/ecc.c b/wolfcrypt/src/ecc.c index fe44ca04c..c3be011fe 100644 --- a/wolfcrypt/src/ecc.c +++ b/wolfcrypt/src/ecc.c @@ -4258,16 +4258,9 @@ int wc_ecc_shared_secret(ecc_key* private_key, ecc_key* public_key, byte* out, !defined(WOLFSSL_CRYPTOCELL) && !defined(WOLFSSL_KCAPI_ECC) static int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point, - byte* out, word32* outlen, ecc_curve_spec* curve) + byte* out, word32* outlen) { int err = MP_OKAY; -#if !defined(WOLFSSL_SP_MATH) - ecc_point* result = NULL; - #ifdef WOLFSSL_NO_MALLOC - ecc_point lcl_result; - #endif - word32 x = 0; -#endif mp_int* k = &private_key->k; #ifdef HAVE_ECC_CDH #ifdef WOLFSSL_SMALL_STACK @@ -4333,20 +4326,41 @@ static int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point, #if defined(WOLFSSL_SP_MATH) { err = WC_KEY_SIZE_E; - (void)curve; goto errout; } #else { + ecc_point* result = NULL; + #ifdef WOLFSSL_NO_MALLOC + ecc_point lcl_result; + #endif + word32 x = 0; mp_digit mp = 0; + DECLARE_CURVE_SPECS(3); + + /* load curve info */ + ALLOC_CURVE_SPECS(3, err); + if (err == MP_OKAY) { + err = wc_ecc_curve_load(private_key->dp, &curve, + (ECC_CURVE_FIELD_PRIME | ECC_CURVE_FIELD_AF | + ECC_CURVE_FIELD_ORDER)); + } + + if (err != MP_OKAY) { + FREE_CURVE_SPECS(); + goto errout; + } /* make new point */ #ifdef WOLFSSL_NO_MALLOC result = &lcl_result; #endif err = wc_ecc_new_point_ex(&result, private_key->heap); - if (err != MP_OKAY) + if (err != MP_OKAY) { + wc_ecc_curve_free(curve); + FREE_CURVE_SPECS(); goto errout; + } #ifdef ECC_TIMING_RESISTANT if (private_key->rng == NULL) { @@ -4387,6 +4401,9 @@ static int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point, *outlen = x; wc_ecc_del_point_ex(result, private_key->heap); + + wc_ecc_curve_free(curve); + FREE_CURVE_SPECS(); } #endif @@ -4408,10 +4425,23 @@ static int wc_ecc_shared_secret_gen_sync(ecc_key* private_key, ecc_point* point, #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_ECC) static int wc_ecc_shared_secret_gen_async(ecc_key* private_key, - ecc_point* point, byte* out, word32 *outlen, - ecc_curve_spec* curve) + ecc_point* point, byte* out, word32 *outlen) { int err; + DECLARE_CURVE_SPECS(3); + + /* load curve info */ + ALLOC_CURVE_SPECS(3, err); + if (err == MP_OKAY) { + err = wc_ecc_curve_load(private_key->dp, &curve, + (ECC_CURVE_FIELD_PRIME | ECC_CURVE_FIELD_AF | + ECC_CURVE_FIELD_ORDER)); + } + + if (err != MP_OKAY) { + FREE_CURVE_SPECS(); + return err; + } #if defined(HAVE_CAVIUM_V) || defined(HAVE_INTEL_QA) if (private_key->dp @@ -4453,6 +4483,8 @@ static int wc_ecc_shared_secret_gen_async(ecc_key* private_key, &curve->Af->raw, &curve->Bf->raw, &curve->prime->raw, private_key->dp->cofactor); #endif + wc_ecc_curve_free(curve); + FREE_CURVE_SPECS();
return err; } #elif defined(WOLFSSL_ASYNC_CRYPT_TEST) @@ -4462,6 +4494,8 @@ static int wc_ecc_shared_secret_gen_async(ecc_key* private_key, testDev->eccSharedSec.public_point = point; testDev->eccSharedSec.out = out; testDev->eccSharedSec.outLen = outlen; + wc_ecc_curve_free(curve); + FREE_CURVE_SPECS(); return WC_PENDING_E; } #endif @@ -4469,6 +4503,9 @@ static int wc_ecc_shared_secret_gen_async(ecc_key* private_key, /* use sync in other cases */ err = wc_ecc_shared_secret_gen_sync(private_key, point, out, outlen, curve); + wc_ecc_curve_free(curve); + FREE_CURVE_SPECS(); + return err; } #endif /* WOLFSSL_ASYNC_CRYPT && WC_ASYNC_ENABLE_ECC */ @@ -4477,40 +4514,24 @@ int wc_ecc_shared_secret_gen(ecc_key* private_key, ecc_point* point, byte* out, word32 *outlen) { int err = MP_OKAY; - DECLARE_CURVE_SPECS(3); if (private_key == NULL || point == NULL || out == NULL || outlen == NULL) { return BAD_FUNC_ARG; } - /* load curve info */ - ALLOC_CURVE_SPECS(3, err); - if (err == MP_OKAY) { - err = wc_ecc_curve_load(private_key->dp, &curve, - (ECC_CURVE_FIELD_PRIME | ECC_CURVE_FIELD_AF | ECC_CURVE_FIELD_ORDER)); - } - - if (err != MP_OKAY) { - FREE_CURVE_SPECS(); - return err; - } - #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_ECC) if (private_key->asyncDev.marker == WOLFSSL_ASYNC_MARKER_ECC) { err = wc_ecc_shared_secret_gen_async(private_key, point, - out, outlen, curve); + out, outlen); } else #endif { err = wc_ecc_shared_secret_gen_sync(private_key, point, - out, outlen, curve); + out, outlen); } - wc_ecc_curve_free(curve); - FREE_CURVE_SPECS(); - return err; } diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 9118131e8..207a600a4 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -31215,7 +31215,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_8(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -31511,6 +31511,7 @@ static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b, ); } +#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. 
@@ -31684,7 +31685,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -31710,7 +31711,8 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; @@ -31757,7 +31759,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_8(y, y, x, p256_mod); + sp_256_mont_sub_lower_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -31797,6 +31799,7 @@ typedef struct sp_256_proj_point_add_8_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -31825,6 +31828,10 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*8; ctx->t4 = t + 6*8; ctx->t5 = t + 8*8; + ctx->t6 = t + 10*8; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -31849,29 +31856,6 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -31885,16 +31869,16 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_8(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_8(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -31903,7 +31887,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -31922,29 +31906,29 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_8(ctx->z, ctx->z, ctx->t2, p256_mod, 
p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -31952,24 +31936,24 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_8(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -31977,9 +31961,30 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -31991,24 +31996,13 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -32018,60 +32012,61 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_8(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, x, t5, p256_mod); - sp_256_mont_dbl_8(t1, y, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(y, y, x, p256_mod); - sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_dbl_8(t3, y, p256_mod); + sp_256_mont_sub_8(x, x, t3, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -32166,7 +32161,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons sp_digit* tmp = NULL; #else sp_point_256 t[16 + 1]; - 
sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; #endif sp_point_256* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -32200,7 +32195,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -32301,7 +32296,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -32331,6 +32326,8 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC +#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 +#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -32369,7 +32366,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -32377,9 +32374,12 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -32389,16 +32389,14 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_mul_8(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -32406,14 +32404,15 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -32463,17 +32462,12 @@ typedef struct sp_table_entry_256 { static void 
sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -32483,53 +32477,54 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_8(t2, t2, x, p256_mod); + sp_256_mont_sub_8(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_8(t4, t4, y, p256_mod); + sp_256_mont_sub_8(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, t1, t5, p256_mod); sp_256_mont_dbl_8(t1, t3, p256_mod); sp_256_mont_sub_8(x, x, t1, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_8(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -32701,7 +32696,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -32722,7 +32717,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; 
if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -32902,7 +32897,7 @@ static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -33105,7 +33100,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -33126,7 +33121,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -33306,7 +33301,7 @@ static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -33417,7 +33412,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -33430,7 +33425,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), heap, + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -34966,7 +34961,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -34979,7 +34974,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -36584,7 +36579,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*8]; sp_digit u2[2*8]; sp_digit s[2*8]; - sp_digit tmp[2*8 * 5]; + sp_digit tmp[2*8 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -36730,7 +36725,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 8]; + sp_digit u1[18 * 8]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -36749,7 +36744,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 8, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 8, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -37052,7 +37047,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -37066,7 +37061,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { 
- tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -40356,7 +40351,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_12(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -40529,6 +40524,7 @@ static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b sp_384_cond_add_12(r, r, m, o); } +#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( @@ -40713,7 +40709,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -40739,7 +40735,8 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; @@ -40786,7 +40783,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_ /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_12(y, y, x, p384_mod); + sp_384_mont_sub_lower_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -40827,6 +40824,7 @@ typedef struct sp_384_proj_point_add_12_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -40855,6 +40853,10 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*12; ctx->t4 = t + 6*12; ctx->t5 = t + 8*12; + ctx->t6 = t + 10*12; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -40879,29 +40881,6 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -40915,16 +40894,16 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_12(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_12(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: 
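/* ---------------------------------------------------------------------------
 * Illustrative sketch, not part of the patch: the *_nb point-add routines
 * changed above are resumable state machines.  Each call performs one field
 * operation, records the next state in the caller-held sp_ecc_ctx_t and
 * returns FP_WOULDBLOCK until the final state completes with MP_OKAY; the
 * hunks above renumber those states because ctx->x/y/z now alias the
 * temporaries t6/t1/t2 instead of a scratch point.  Minimal stand-alone
 * version of the same pattern; step_ctx, STEP_AGAIN and step_sum() are
 * hypothetical names, not wolfSSL identifiers.
 */
typedef struct step_ctx {
    int  state;   /* unit of work to run on the next call */
    long acc;     /* partial result carried across calls  */
} step_ctx;

enum { STEP_DONE = 0, STEP_AGAIN = 1 };

/* Add the integers 1..n, one addition per call. */
static int step_sum(step_ctx* ctx, int n, long* out)
{
    if (ctx->state < n) {
        ctx->acc += ctx->state + 1;   /* one unit of work */
        ctx->state++;
        return STEP_AGAIN;            /* caller resumes later */
    }
    *out = ctx->acc;                  /* all states consumed */
    return STEP_DONE;
}

/* Usage: keep resuming until the call stops yielding. */
static long step_sum_blocking(int n)
{
    step_ctx ctx = { 0, 0 };
    long out = 0;
    while (step_sum(&ctx, n, &out) == STEP_AGAIN) {
        /* other work could run here between steps */
    }
    return out;
}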
@@ -40933,7 +40912,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -40952,29 +40931,29 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_12(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -40982,24 +40961,24 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_12(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -41007,9 +40986,30 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -41021,24 +41021,13 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const 
sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -41048,60 +41037,61 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_12(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, x, t5, p384_mod); - sp_384_mont_dbl_12(t1, y, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(y, y, x, p384_mod); - sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_12(t3, y, p384_mod); + sp_384_mont_sub_12(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | 
(q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -41385,6 +41375,8 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } #ifdef FP_ECC +#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 +#define sp_384_mont_tpl_lower_12 sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -41423,7 +41415,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -41431,9 +41423,12 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -41443,16 +41438,14 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_mul_12(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -41460,14 +41453,15 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -41517,17 +41511,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -41537,53 +41526,54 @@ static void 
sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_12(t2, t2, x, p384_mod); + sp_384_mont_sub_12(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_12(t4, t4, y, p384_mod); + sp_384_mont_sub_12(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, t1, t5, p384_mod); sp_384_mont_dbl_12(t1, t3, p384_mod); sp_384_mont_sub_12(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_12(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -45748,7 +45738,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*12]; sp_digit u2[2*12]; sp_digit s[2*12]; - sp_digit tmp[2*12 * 5]; + sp_digit tmp[2*12 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -45894,7 +45884,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 12]; + sp_digit u1[18 * 12]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -45913,7 +45903,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 12, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 12, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; 
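/* ---------------------------------------------------------------------------
 * Illustrative sketch, not part of the patch: the recurring change in the
 * point-add routines above replaces the old rp[]/ap[] pointer tables, which
 * special-cased the point at infinity, with a branch-free merge.  A 0/1
 * infinity flag is stretched into an all-zeros/all-ones limb mask (0 - flag)
 * and every output limb is assembled from p, q or the freshly computed sum
 * using AND/OR only, so no memory access depends on which case applies.  The
 * computed point now lives in the temporaries (x = t6, y = t1, z = t2), which
 * lines up with the scratch buffers growing from 2*n*5 to 2*n*6 digits and
 * the verify u1 area from 16*n to 18*n.  Stand-alone version of the merge;
 * limb_t, NUM_LIMBS and ct_select_point() are hypothetical names.
 */
#include <stdint.h>

#define NUM_LIMBS 12                /* e.g. 12 x 32-bit limbs for P-384 here */
typedef uint32_t limb_t;

static void ct_select_point(limb_t r[NUM_LIMBS],
                            const limb_t p[NUM_LIMBS],  /* first input point  */
                            const limb_t q[NUM_LIMBS],  /* second input point */
                            const limb_t t[NUM_LIMBS],  /* computed p + q     */
                            int p_inf, int q_inf)       /* 0 or 1 flags       */
{
    limb_t maskp = (limb_t)0 - (limb_t)(q_inf & (!p_inf)); /* p + inf = p  */
    limb_t maskq = (limb_t)0 - (limb_t)(p_inf & (!q_inf)); /* inf + q = q  */
    limb_t maskt = ~(maskp | maskq);                       /* general case */
    int i;

    for (i = 0; i < NUM_LIMBS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}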
@@ -46216,7 +46206,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 12 * 5]; + sp_digit tmp[2 * 12 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -46230,7 +46220,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -51428,7 +51418,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_17(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -51922,6 +51912,7 @@ static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b ); } +#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( @@ -52126,7 +52117,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -52152,7 +52143,8 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*17; @@ -52199,7 +52191,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_ /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_17(y, y, x, p521_mod); + sp_521_mont_sub_lower_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -52242,6 +52234,7 @@ typedef struct sp_521_proj_point_add_17_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -52270,6 +52263,10 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*17; ctx->t4 = t + 6*17; ctx->t5 = t + 8*17; + ctx->t6 = t + 10*17; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -52294,29 +52291,6 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -52330,16 +52304,16 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_17(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; 
case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_17(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -52348,7 +52322,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -52367,29 +52341,29 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_17(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -52397,24 +52371,24 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_17(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -52422,9 +52396,30 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= 
p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -52436,24 +52431,13 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -52463,60 +52447,61 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_17(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, x, t5, p521_mod); - sp_521_mont_dbl_17(t1, y, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(y, y, x, p521_mod); - sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_17(t3, y, p521_mod); + sp_521_mont_sub_17(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_mul_17(y, y, 
t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -52665,7 +52650,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con sp_digit* tmp = NULL; #else sp_point_521 t[16 + 1]; - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; #endif sp_point_521* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -52699,7 +52684,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -52804,7 +52789,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -52834,6 +52819,8 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC +#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 +#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
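/* ---------------------------------------------------------------------------
 * Illustrative sketch, not part of the patch: in the sp_*_proj_point_dbl_n_*
 * hunks of this patch, the Y update changes from
 *     y = (B - X) * A;  y = 2*y;  y = y - Y^4
 * to
 *     b = 2*(B - X);    y = b * A;  y = y - Y^4
 * i.e. the doubling is folded in before the Montgomery multiplication, using
 * the new *_lower helpers, which this file simply #defines to the ordinary
 * mont_sub/dbl/tpl routines.  Toy sanity check of the algebraic equivalence
 * over a small modulus; the values and the modulus are arbitrary.
 */
#include <assert.h>
#include <stdint.h>

static void dbl_n_y_equivalence(void)
{
    const uint64_t m = 1000003;                 /* toy modulus */
    const uint64_t A = 123456, B = 654321, X = 111111, Y4 = 222222;
    uint64_t d = (B + m - X) % m;               /* B - X (mod m) */

    uint64_t y_old = ((((d * A) % m) * 2) % m + m - Y4) % m; /* mul, then dbl */
    uint64_t y_new = ((((2 * d) % m) * A) % m + m - Y4) % m; /* dbl, then mul */

    assert(y_old == y_new);
    (void)y_old;
    (void)y_new;
}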
@@ -52872,7 +52859,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -52880,9 +52867,12 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -52892,16 +52882,14 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_mul_17(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -52909,14 +52897,15 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -52966,17 +52955,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -52986,53 +52970,54 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = 
ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_17(t2, t2, x, p521_mod); + sp_521_mont_sub_17(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_17(t4, t4, y, p521_mod); + sp_521_mont_sub_17(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, t1, t5, p521_mod); sp_521_mont_dbl_17(t1, t3, p521_mod); sp_521_mont_sub_17(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_17(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -53240,7 +53225,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -53261,7 +53246,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -53680,7 +53665,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -53701,7 +53686,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -53992,7 +53977,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = 
NULL; sp_digit* tmp = NULL; @@ -54005,7 +53990,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), heap, + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -56085,7 +56070,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -56098,7 +56083,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -58392,7 +58377,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*17]; sp_digit u2[2*17]; sp_digit s[2*17]; - sp_digit tmp[2*17 * 5]; + sp_digit tmp[2*17 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -58541,7 +58526,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 17]; + sp_digit u1[18 * 17]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -58560,7 +58545,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 17, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 17, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -58867,7 +58852,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -58881,7 +58866,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -64518,7 +64503,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_32(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -65317,6 +65302,7 @@ static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* ); } +#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
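
The conditional-add comment above describes the helper used after a modular subtraction: the mask is -1 (all ones) to add the modulus back and 0 to leave the value alone, so both cases execute the same instructions. Below is a generic sketch of that pattern; the name cond_add_ct, the uint64_t limb type and the limbs parameter are assumptions for the sketch, not the patch's types.

#include <stdint.h>

/* Add m to a when mask is all ones, add zero when mask is 0; return the
 * carry out of the most significant limb. */
static uint64_t cond_add_ct(uint64_t* r, const uint64_t* a, const uint64_t* m,
                            uint64_t mask, int limbs)
{
    uint64_t carry = 0;
    uint64_t add;
    uint64_t s;
    uint64_t c;
    int i;

    for (i = 0; i < limbs; i++) {
        add = m[i] & mask;               /* 0 or m[i], no branch */
        s = a[i] + add;
        c = (uint64_t)(s < a[i]);        /* carry out of the limb add */
        r[i] = s + carry;
        carry = c + (uint64_t)(r[i] < s);/* at most one carry propagates */
    }
    return carry;
}

A caller would typically pass mask = 0 - borrow from the preceding subtraction, so the modulus is re-added exactly when the subtraction underflowed.
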
@@ -65757,7 +65743,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -65783,7 +65769,8 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; @@ -65830,7 +65817,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -66018,6 +66005,7 @@ typedef struct sp_1024_proj_point_add_32_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -66046,6 +66034,10 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*32; ctx->t4 = t + 6*32; ctx->t5 = t + 8*32; + ctx->t6 = t + 10*32; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -66070,29 +66062,6 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -66106,16 +66075,16 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_32(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_32(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -66124,7 +66093,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -66143,29 +66112,29 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, 
p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_32(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -66173,24 +66142,24 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_32(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -66198,9 +66167,30 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -66212,24 +66202,13 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -66239,60 +66218,61 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_32(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, x, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, y, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); - sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_32(t3, y, p1024_mod); + sp_1024_mont_sub_32(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } 
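
The buffer-size changes that follow (2 * 32 * 5 growing to 2 * 32 * 6, with matching XMALLOC and ForceZero sizes) track the extra temporary t6 introduced above: the rewritten addition keeps its working X3/Y3/Z3 in scratch (t6, t1, t2) until the final masked copy into r, instead of writing into r directly. A rough sketch of how such a scratch buffer is carved up; limb_t, N and carve_scratch are invented names for illustration only.

#include <stdint.h>

typedef uint64_t limb_t; /* stand-in for sp_digit; the width is an assumption */
#define N 32             /* limbs per field element in the 1024-bit code */

/* Carve six double-width temporaries out of one scratch buffer, the way the
 * point-add code does (t1 = t, t2 = t + 2*N, ..., t6 = t + 10*N).  The caller
 * must therefore supply 2 * N * 6 limbs, which is the new buffer size. */
static void carve_scratch(limb_t* t, limb_t* tmp[6])
{
    int i;

    for (i = 0; i < 6; i++) {
        tmp[i] = t + 2 * N * i;
    }
    /* In the rewritten addition the working coordinates alias three of these:
     * x = tmp[5] (t6), y = tmp[0] (t1), z = tmp[1] (t2); r is only written in
     * the final masked copy. */
}
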
@@ -66321,7 +66301,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, sp_digit* tmp = NULL; #else sp_point_1024 t[16 + 1]; - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; #endif sp_point_1024* rt = NULL; sp_digit n; @@ -66340,7 +66320,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -66421,7 +66401,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -66440,6 +66420,8 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) +#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 +#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -66478,7 +66460,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -66486,9 +66468,12 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -66498,16 +66483,14 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_mul_32(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -66515,14 +66498,15 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B 
- X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -66572,17 +66556,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -66592,53 +66571,54 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_32(t2, t2, x, p1024_mod); + sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_32(t4, t4, y, p1024_mod); + sp_1024_mont_sub_32(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, t1, t5, p1024_mod); sp_1024_mont_dbl_32(t1, t3, p1024_mod); sp_1024_mont_sub_32(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & 
maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -66760,7 +66740,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -66781,7 +66761,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -66946,7 +66926,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -67099,7 +67079,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -67120,7 +67100,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -67285,7 +67265,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -71033,7 +71013,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[32 + 32 * 2 * 5]; + sp_digit k[32 + 32 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -71046,7 +71026,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (32 + 32 * 2 * 5), + sizeof(sp_digit) * (32 + 32 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 89d1bab2c..9b63937a0 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -23081,7 +23081,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_4(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -23235,6 +23235,7 @@ static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, ); } +#define sp_256_mont_sub_lower_4 sp_256_mont_sub_4 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. 
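
The sizeof(r->z) / 2 changes in the map functions rely on the SP point layout: each coordinate array is twice the limb count so it can also hold unreduced products, while the coordinate value itself sits in the first half. One plausible reading of the change is that only that first half needs clearing before z is set to 1. A toy illustration with invented names and an assumed 4-limb P-256 layout:

#include <string.h>
#include <stdint.h>

#define N 4  /* limbs per P-256 element on 64-bit; an assumption for the sketch */

/* Coordinates are stored double-width so they can hold unreduced products. */
typedef struct {
    uint64_t x[2 * N];
    uint64_t y[2 * N];
    uint64_t z[2 * N];
    int infinity;
} point_sketch;

static void set_z_to_one(point_sketch* r)
{
    /* Clear only the low N limbs that hold the coordinate value. */
    memset(r->z, 0, sizeof(r->z) / 2);
    r->z[0] = 1;
}

The same double-width layout is why the scratch temporaries elsewhere in the patch are 2*N limbs each.
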
@@ -23383,7 +23384,7 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -23409,7 +23410,8 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; @@ -23456,13 +23458,14 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_4(y, y, x, p256_mod); + sp_256_mont_sub_lower_4(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ sp_256_mont_sub_4(y, y, t2, p256_mod); } +#define sp_256_mont_tpl_lower_4 sp_256_mont_tpl_4 /* Subtract two Montgomery form numbers (r = a - b % m). * * r Result of subtration. @@ -23521,24 +23524,12 @@ static void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit const sp_digit* m) { __asm__ __volatile__ ( - "ldp x4, x5, [%[a]]\n\t" - "ldp x6, x7, [%[a],16]\n\t" - "adds x4, x4, x4\n\t" - "ldp x8, x9, [%[b]]\n\t" - "adcs x5, x5, x5\n\t" - "ldp x10, x11, [%[b],16]\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cs\n\t" - "subs x4, x4, x14\n\t" - "lsr x12, x14, 32\n\t" - "sbcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" - "sbcs x6, x6, xzr\n\t" - "sbc x7, x7, x13\n\t" + "ldp x4, x5, [%[a], 0]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" "subs x4, x4, x8\n\t" + "ldp x6, x7, [%[a], 16]\n\t" "sbcs x5, x5, x9\n\t" + "ldp x10, x11, [%[b], 16]\n\t" "sbcs x6, x6, x10\n\t" "sbcs x7, x7, x11\n\t" "mov x13, 0xffffffff00000001\n\t" @@ -23548,8 +23539,20 @@ static void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit "adcs x5, x5, x12\n\t" "and x13, x13, x14\n\t" "adcs x6, x6, xzr\n\t" - "stp x4, x5, [%[r],0]\n\t" "adc x7, x7, x13\n\t" + "adds x4, x4, x4\n\t" + "adcs x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "mov x13, 0xffffffff00000001\n\t" + "csetm x14, cs\n\t" + "subs x4, x4, x14\n\t" + "lsr x12, x14, 32\n\t" + "sbcs x5, x5, x12\n\t" + "and x13, x13, x14\n\t" + "sbcs x6, x6, xzr\n\t" + "stp x4, x5, [%[r],0]\n\t" + "sbc x7, x7, x13\n\t" "stp x6, x7, [%[r],16]\n\t" : : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) @@ -23594,16 +23597,18 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_4(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -23613,29 +23618,29 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int n, sp_256_mont_mul_4(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B 
- X) - Y^4 */ - sp_256_mont_sub_4(y, b, x, p256_mod); - sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_sub_4(y, y, t1, p256_mod); + sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_4(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_4(y, b, x, p256_mod); - sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_sub_4(y, y, t1, p256_mod); + sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(y, y, t1, p256_mod); #endif /* Y = Y/2 */ sp_256_div2_4(y, y, p256_mod); @@ -23673,6 +23678,7 @@ typedef struct sp_256_proj_point_add_4_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -23701,6 +23707,10 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*4; ctx->t4 = t + 6*4; ctx->t5 = t + 8*4; + ctx->t6 = t + 10*4; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -23725,29 +23735,6 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -23761,16 +23748,16 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_4(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_4(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -23779,7 +23766,7 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -23798,29 +23785,29 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_4(ctx->z, 
ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_4(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -23828,24 +23815,24 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_4(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_4(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -23853,9 +23840,30 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -23867,24 +23875,13 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_4(t1, p256_mod, q->y); @@ -23894,60 +23891,60 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_4(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_4(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, x, t5, p256_mod); - sp_256_mont_dbl_4(t1, y, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(y, y, x, p256_mod); - sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_sub_dbl_4(x, x, y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -23965,7 +23962,6 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r, sp_digit* a = t + 2*4; sp_digit* b = t + 4*4; sp_digit* t1 = t + 6*4; - sp_digit* t2 = t + 8*4; sp_digit* x 
= r[2*m].x; sp_digit* y = r[(1<x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, xa, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, za, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, za, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, t3, ya, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ @@ -24076,30 +24071,30 @@ static void sp_256_proj_point_add_sub_4(sp_point_256* ra, sp_256_mont_sub_4(t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_256_mont_mul_4(za, za, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(za, za, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(xa, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(xs, t6, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ya, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(x, x, t5, p256_mod); + sp_256_mont_sub_4(xa, xa, t5, p256_mod); sp_256_mont_sub_4(xs, xs, t5, p256_mod); - sp_256_mont_dbl_4(t1, y, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); + sp_256_mont_dbl_4(t1, ya, p256_mod); + sp_256_mont_sub_4(xa, xa, t1, p256_mod); sp_256_mont_sub_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_4(ys, y, xs, p256_mod); - sp_256_mont_sub_4(y, y, x, p256_mod); - sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_lower_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_lower_4(ya, ya, xa, p256_mod); + sp_256_mont_mul_4(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_4(ys, ys, t6, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(y, y, t5, p256_mod); + sp_256_mont_sub_4(ya, ya, t5, p256_mod); sp_256_mont_sub_4(ys, ys, t5, p256_mod); } @@ -24382,17 +24377,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; /* Check double */ (void)sp_256_sub_4(t1, 
p256_mod, q->y); @@ -24402,53 +24392,53 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_4(t2, t2, x, p256_mod); + sp_256_mont_sub_4(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_4(t4, t4, y, p256_mod); + sp_256_mont_sub_4(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_4(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, t1, t5, p256_mod); - sp_256_mont_dbl_4(t1, t3, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); + sp_256_mont_sub_dbl_4(x, x, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_4(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_4(t3, t3, x, p256_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -24626,7 +24616,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 5]; + sp_digit t[2 * 4 * 6]; #endif sp_point_256* p = NULL; int i; @@ -24647,7 +24637,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -24828,7 +24818,7 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return 
sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -25018,7 +25008,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 5]; + sp_digit t[2 * 4 * 6]; #endif sp_point_256* p = NULL; int i; @@ -25039,7 +25029,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -25220,7 +25210,7 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -25331,7 +25321,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[4 + 4 * 2 * 5]; + sp_digit k[4 + 4 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -25344,7 +25334,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (4 + 4 * 2 * 5), heap, + sizeof(sp_digit) * (4 + 4 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -39134,7 +39124,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; #endif sp_point_256* p = NULL; sp_digit* negy = NULL; @@ -39153,7 +39143,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -39212,7 +39202,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -39314,7 +39304,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[4 + 4 * 2 * 5]; + sp_digit k[4 + 4 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -39327,7 +39317,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (4 + 4 * 2 * 5), + sizeof(sp_digit) * (4 + 4 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -40835,7 +40825,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*4]; sp_digit u2[2*4]; sp_digit s[2*4]; - sp_digit tmp[2*4 * 5]; + sp_digit tmp[2*4 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -40981,7 +40971,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 4]; + sp_digit u1[18 * 4]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -41000,7 +40990,7 @@ int sp_ecc_verify_256(const byte* 
hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 4, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 4, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -41303,7 +41293,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -41317,7 +41307,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -43194,7 +43184,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_6(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -43342,6 +43332,7 @@ static void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_384_cond_add_6(r, r, m, o); } +#define sp_384_mont_sub_lower_6 sp_384_mont_sub_6 static void sp_384_rshift1_6(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( @@ -43493,7 +43484,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -43519,7 +43510,8 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; @@ -43566,13 +43558,15 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, sp_d /* X = X - Y */ sp_384_mont_sub_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_6(y, y, x, p384_mod); + sp_384_mont_sub_lower_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ sp_384_mont_sub_6(y, y, t2, p384_mod); } +#define sp_384_mont_dbl_lower_6 sp_384_mont_dbl_6 +#define sp_384_mont_tpl_lower_6 sp_384_mont_tpl_6 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
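
The _lower aliases defined above and the doubling hunks that follow reorder the computation of Y3 = 2*A*(B - X3) - Y^4: the difference is doubled first (b = 2*(B - X3)) and then multiplied once, rather than multiplying (B - X3) by A and doubling the product. The two orders agree modulo the prime; below is a small self-contained check using plain 64-bit arithmetic and an invented small modulus (all of it demo scaffolding, not the patch's Montgomery field code).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t m = 0xfffffffbULL; /* 2^32 - 5, a small prime for the demo */
    uint64_t A  = 123456789 % m;
    uint64_t B  = 987654321 % m;
    uint64_t X  = 192837465 % m;
    uint64_t Y4 = 5;
    uint64_t old_y;
    uint64_t new_y;
    uint64_t b;

    /* Previous order: (B - X) * A, doubled afterwards. */
    old_y = ((B + m - X) % m) * A % m;
    old_y = (2 * old_y) % m;
    old_y = (old_y + m - Y4) % m;

    /* New order: b = 2*(B - X) first, then a single multiplication. */
    b = (2 * ((B + m - X) % m)) % m;
    new_y = (b * A % m + m - Y4) % m;

    printf("match: %d\n", old_y == new_y); /* prints 1 */
    return 0;
}

Compiling and running this (for example with gcc -O2) prints match: 1, confirming the identity the reordering relies on.
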
@@ -43611,7 +43605,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -43619,9 +43613,12 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -43631,16 +43628,14 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_mul_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -43648,14 +43643,15 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -43694,6 +43690,7 @@ typedef struct sp_384_proj_point_add_6_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -43722,6 +43719,10 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*6; ctx->t4 = t + 6*6; ctx->t5 = t + 8*6; + ctx->t6 = t + 10*6; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -43746,29 +43747,6 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - 
ctx->state = 4; break; } @@ -43782,16 +43760,16 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_6(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_6(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -43800,7 +43778,7 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_6(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -43819,29 +43797,29 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_6(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -43849,24 +43827,24 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_6(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -43874,9 +43852,30 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & 
maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -43888,24 +43887,13 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -43915,60 +43903,61 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_6(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_6(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(x, x, t5, p384_mod); - sp_384_mont_dbl_6(t1, y, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_6(y, y, x, p384_mod); - sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); 
sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_6(t3, y, p384_mod); + sp_384_mont_sub_6(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -44015,29 +44004,30 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ - sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(b, t2, x, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(t1, b, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_dbl_6(t2, b, p384_mod); + sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_384_mont_sqr_6(t2, t2, p384_mod, p384_mp_mod); + /* t1 = Y^4 */ + sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_384_mont_mul_6(w, w, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); - sp_384_mont_sub_6(y, y, t2, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(y, y, t1, p384_mod); /* Y = Y/2 */ sp_384_div2_6(r[j].y, y, p384_mod); @@ -44063,30 +44053,30 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; sp_digit* t6 = t + 10*6; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, xa, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, za, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, za, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ 
- sp_384_mont_mul_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, t3, ya, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ @@ -44097,30 +44087,30 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_384_mont_sub_6(t4, t4, t3, p384_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_384_mont_mul_6(za, za, q->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(za, za, t2, p384_mod, p384_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(xa, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(xs, t6, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ya, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(x, x, t5, p384_mod); + sp_384_mont_sub_6(xa, xa, t5, p384_mod); sp_384_mont_sub_6(xs, xs, t5, p384_mod); - sp_384_mont_dbl_6(t1, y, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_dbl_6(t1, ya, p384_mod); + sp_384_mont_sub_6(xa, xa, t1, p384_mod); sp_384_mont_sub_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_6(ys, y, xs, p384_mod); - sp_384_mont_sub_6(y, y, x, p384_mod); - sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_lower_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_lower_6(ya, ya, xa, p384_mod); + sp_384_mont_mul_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_6(ys, ys, t6, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(y, y, t5, p384_mod); + sp_384_mont_sub_6(ya, ya, t5, p384_mod); sp_384_mont_sub_6(ys, ys, t5, p384_mod); } @@ -44415,17 +44405,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -44435,53 +44420,54 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, 
p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_6(t2, t2, x, p384_mod); + sp_384_mont_sub_6(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_6(t4, t4, y, p384_mod); + sp_384_mont_sub_6(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_6(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_6(x, t1, t5, p384_mod); sp_384_mont_dbl_6(t1, t3, p384_mod); sp_384_mont_sub_6(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_6(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_6(t3, t3, x, p384_mod); sp_384_mont_mul_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -66648,7 +66634,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*6]; sp_digit u2[2*6]; sp_digit s[2*6]; - sp_digit tmp[2*6 * 5]; + sp_digit tmp[2*6 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -66794,7 +66780,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 6]; + sp_digit u1[18 * 6]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -66813,7 +66799,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 6, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 6, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -67116,7 +67102,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 6 * 5]; + sp_digit tmp[2 * 6 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -67130,7 +67116,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 6 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 6 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -71128,7 +71114,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_9(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -71337,6 +71323,7 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, ); } +#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. 
* m is -1 to add and 0 when not. @@ -71595,7 +71582,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -71621,7 +71608,8 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*9; @@ -71668,13 +71656,15 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_d /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_9(y, y, x, p521_mod); + sp_521_mont_sub_lower_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ sp_521_mont_sub_9(y, y, t2, p521_mod); } +#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 +#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -71713,7 +71703,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -71721,9 +71711,12 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -71733,16 +71726,14 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -71750,14 +71741,15 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + 
sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -71797,6 +71789,7 @@ typedef struct sp_521_proj_point_add_9_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -71825,6 +71818,10 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*9; ctx->t4 = t + 6*9; ctx->t5 = t + 8*9; + ctx->t6 = t + 10*9; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -71849,29 +71846,6 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -71885,16 +71859,16 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_9(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_9(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -71903,7 +71877,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -71922,29 +71896,29 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_9(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -71952,24 +71926,24 @@ static int 
sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_9(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -71977,9 +71951,30 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -71991,24 +71986,13 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -72018,60 +72002,61 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_9(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, x, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_9(t3, y, p521_mod); + sp_521_mont_sub_9(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -72118,29 +72103,30 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); 
sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ - sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(b, t2, x, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(t1, b, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t2, b, p521_mod); + sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_521_mont_sqr_9(t2, t2, p521_mod, p521_mp_mod); + /* t1 = Y^4 */ + sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_521_mont_mul_9(w, w, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); - sp_521_mont_sub_9(y, y, t2, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, y, t1, p521_mod); /* Y = Y/2 */ sp_521_div2_9(r[j].y, y, p521_mod); @@ -72166,30 +72152,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; sp_digit* t6 = t + 10*9; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, xa, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, za, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, za, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, ya, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ @@ -72200,30 +72186,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(t4, t4, t3, p521_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_521_mont_mul_9(za, za, q->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(za, za, t2, p521_mod, p521_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(xa, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(xs, t6, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(y, t1, t5, p521_mod, 
p521_mp_mod); + sp_521_mont_mul_9(ya, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, x, t5, p521_mod); + sp_521_mont_sub_9(xa, xa, t5, p521_mod); sp_521_mont_sub_9(xs, xs, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t1, ya, p521_mod); + sp_521_mont_sub_9(xa, xa, t1, p521_mod); sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_9(ys, y, xs, p521_mod); - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, y, t5, p521_mod); + sp_521_mont_sub_9(ya, ya, t5, p521_mod); sp_521_mont_sub_9(ys, ys, t5, p521_mod); } @@ -72536,17 +72522,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -72556,53 +72537,54 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_9(t2, t2, x, p521_mod); + sp_521_mont_sub_9(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_9(t4, t4, y, p521_mod); + sp_521_mont_sub_9(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, t1, t5, p521_mod); sp_521_mont_dbl_9(t1, t3, p521_mod); sp_521_mont_sub_9(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - 
sp_521_mont_sub_9(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -72800,7 +72782,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 9 * 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_521* p = NULL; int i; @@ -72821,7 +72803,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -73212,7 +73194,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 9 * 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_521* p = NULL; int i; @@ -73233,7 +73215,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -73525,7 +73507,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -73538,7 +73520,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), heap, + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -109858,7 +109840,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, sp_digit* tmp = NULL; #else sp_point_521 rt[2]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; #endif sp_point_521* p = NULL; sp_digit* negy = NULL; @@ -109877,7 +109859,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -109936,7 +109918,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -110038,7 +110020,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 
point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -110051,7 +110033,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -111286,7 +111268,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*9]; sp_digit u2[2*9]; sp_digit s[2*9]; - sp_digit tmp[2*9 * 5]; + sp_digit tmp[2*9 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -111435,7 +111417,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 9]; + sp_digit u1[18 * 9]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -111454,7 +111436,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 9, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 9, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -111761,7 +111743,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -111775,7 +111757,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -114631,7 +114613,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_16(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -115089,6 +115071,7 @@ static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* ); } +#define sp_1024_mont_sub_lower_16 sp_1024_mont_sub_16 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. @@ -115391,7 +115374,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -115417,7 +115400,8 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; @@ -115464,13 +115448,15 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_16(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ sp_1024_mont_sub_16(y, y, t2, p1024_mod); } +#define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16 +#define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16 /* Double the Montgomery form projective point p a number of times. 
* * r Result of repeated doubling of point. @@ -115509,7 +115495,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -115517,9 +115503,12 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -115529,16 +115518,14 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_mul_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -115546,14 +115533,15 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -115694,6 +115682,7 @@ typedef struct sp_1024_proj_point_add_16_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -115722,6 +115711,10 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*16; ctx->t4 = t + 6*16; ctx->t5 = t + 8*16; + ctx->t6 = t + 10*16; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -115746,29 +115739,6 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = 
ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -115782,16 +115752,16 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_16(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_16(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -115800,7 +115770,7 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -115819,29 +115789,29 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_16(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -115849,24 +115819,24 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_16(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ 
-115874,9 +115844,30 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -115888,24 +115879,13 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod); @@ -115915,60 +115895,61 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_16(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod); 
sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(x, x, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, y, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_16(y, y, x, p1024_mod); - sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_16(t3, y, p1024_mod); + sp_1024_mont_sub_16(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -116015,29 +115996,30 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ - sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(b, t2, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(t1, b, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_16(t2, b, p1024_mod); + sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_1024_mont_sqr_16(t2, t2, p1024_mod, p1024_mp_mod); + /* t1 = Y^4 */ + sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_1024_mont_mul_16(w, w, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); - sp_1024_mont_sub_16(y, y, t2, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(y, y, t1, p1024_mod); /* Y = Y/2 */ sp_1024_div2_16(r[j].y, y, p1024_mod); @@ -116063,30 +116045,30 @@ static void sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; sp_digit* t6 = t + 10*16; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + 
XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, xa, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, za, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, za, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, t3, ya, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ @@ -116097,30 +116079,30 @@ static void sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_1024_mont_sub_16(t4, t4, t3, p1024_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_1024_mont_mul_16(za, za, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(za, za, t2, p1024_mod, p1024_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(xa, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(xs, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ya, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(x, x, t5, p1024_mod); + sp_1024_mont_sub_16(xa, xa, t5, p1024_mod); sp_1024_mont_sub_16(xs, xs, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, y, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_16(t1, ya, p1024_mod); + sp_1024_mont_sub_16(xa, xa, t1, p1024_mod); sp_1024_mont_sub_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_16(ys, y, xs, p1024_mod); - sp_1024_mont_sub_16(y, y, x, p1024_mod); - sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_lower_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_lower_16(ya, ya, xa, p1024_mod); + sp_1024_mont_mul_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_16(ys, ys, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(y, y, t5, p1024_mod); + sp_1024_mont_sub_16(ya, ya, t5, p1024_mod); sp_1024_mont_sub_16(ys, ys, t5, p1024_mod); } @@ -116372,17 +116354,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; /* Check double */ (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod); @@ -116392,53 +116369,54 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_16(r, p, t); 
} else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_16(t2, t2, x, p1024_mod); + sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_16(t4, t4, y, p1024_mod); + sp_1024_mont_sub_16(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_16(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(x, t1, t5, p1024_mod); sp_1024_mont_dbl_16(t1, t3, p1024_mod); sp_1024_mont_sub_16(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -116581,7 +116559,7 @@ static int sp_1024_ecc_mulmod_stripe_16(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 16 * 5]; + sp_digit t[2 * 16 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -116602,7 +116580,7 @@ static int sp_1024_ecc_mulmod_stripe_16(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -116767,7 +116745,7 @@ static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_win_add_sub_16(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 16 * 5]; + sp_digit tmp[2 
* 16 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -120277,7 +120255,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[16 + 16 * 2 * 5]; + sp_digit k[16 + 16 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -120290,7 +120268,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (16 + 16 * 2 * 5), + sizeof(sp_digit) * (16 + 16 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index cfa7dd2a8..a490dd471 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -98656,7 +98656,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_8(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -99536,6 +99536,7 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, ); } +#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -99924,7 +99925,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -99950,7 +99951,8 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; @@ -99997,7 +99999,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_8(y, y, x, p256_mod); + sp_256_mont_sub_lower_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -100037,6 +100039,7 @@ typedef struct sp_256_proj_point_add_8_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -100065,6 +100068,10 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*8; ctx->t4 = t + 6*8; ctx->t5 = t + 8*8; + ctx->t6 = t + 10*8; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -100089,29 +100096,6 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -100125,16 +100109,16 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* 
sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_8(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_8(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -100143,7 +100127,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -100162,29 +100146,29 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_8(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -100192,24 +100176,24 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_8(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -100217,9 +100201,30 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = 
(p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -100231,24 +100236,13 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -100258,60 +100252,61 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_8(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, x, t5, p256_mod); - sp_256_mont_dbl_8(t1, y, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(y, y, x, p256_mod); - sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_dbl_8(t3, y, p256_mod); + 
sp_256_mont_sub_8(x, x, t3, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -100406,7 +100401,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons sp_digit* tmp = NULL; #else sp_point_256 t[16 + 1]; - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; #endif sp_point_256* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -100440,7 +100435,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -100541,7 +100536,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -100571,6 +100566,8 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC +#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 +#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
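/*
 * Illustrative sketch (not part of the patch): the hunks above replace the
 * old "copy whichever input is the identity" pointer-table lookups with a
 * branch-free merge of three candidates, so the memory access pattern no
 * longer depends on the infinity flags.  A minimal stand-alone version of
 * that selection idiom follows; the type, limb count and function name are
 * placeholders, not wolfSSL identifiers.
 */
#include <stdint.h>

#define DEMO_LIMBS 8                       /* e.g. 8 limbs for the P-256 build */

static void demo_select_point_ct(uint32_t* r, const uint32_t* p,
    const uint32_t* q, const uint32_t* t, int p_inf, int q_inf)
{
    /* maskp: all ones when q is the point at infinity -> result is p */
    uint32_t maskp = (uint32_t)0 - (uint32_t)(q_inf & (!p_inf));
    /* maskq: all ones when p is the point at infinity -> result is q */
    uint32_t maskq = (uint32_t)0 - (uint32_t)(p_inf & (!q_inf));
    /* maskt: all ones otherwise -> result is the freshly computed sum in t */
    uint32_t maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < DEMO_LIMBS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}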
@@ -100609,7 +100606,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -100617,9 +100614,12 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -100629,16 +100629,14 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_mul_8(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -100646,14 +100644,15 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -100703,17 +100702,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -100723,53 +100717,54 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = 
ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_8(t2, t2, x, p256_mod); + sp_256_mont_sub_8(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_8(t4, t4, y, p256_mod); + sp_256_mont_sub_8(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, t1, t5, p256_mod); sp_256_mont_dbl_8(t1, t3, p256_mod); sp_256_mont_sub_8(x, x, t1, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_8(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -100941,7 +100936,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -100962,7 +100957,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -101142,7 +101137,7 @@ static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -101345,7 +101340,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -101366,7 +101361,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -101546,7 +101541,7 @@ 
static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -101657,7 +101652,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -101670,7 +101665,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), heap, + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -103206,7 +103201,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -103219,7 +103214,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -106478,7 +106473,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*8]; sp_digit u2[2*8]; sp_digit s[2*8]; - sp_digit tmp[2*8 * 5]; + sp_digit tmp[2*8 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -106624,7 +106619,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 8]; + sp_digit u1[18 * 8]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -106643,7 +106638,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 8, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 8, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -106946,7 +106941,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -106960,7 +106955,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -109799,7 +109794,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_12(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -109937,6 +109932,7 @@ SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a, sp_384_cond_add_12(r, r, m, o); } +#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 /* Right shift a by 1 bit into r. (r = a >> 1) * * r A single precision integer. 
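/*
 * Illustrative note on the new "_lower" names introduced above (for example
 * sp_384_mont_sub_lower_12): they appear to mark call sites whose operands
 * are already fully reduced, and on these targets the general routine is
 * sufficient, so the patch simply aliases one name to the other with a
 * #define and no extra code is generated.  A minimal single-limb analogue,
 * with placeholder names, is sketched below.
 */
#include <stdint.h>

/* r = a - b (mod m), assuming a, b < m; add m back if the subtraction
 * borrowed, using a mask rather than a branch on the result. */
static void demo_mont_sub(uint64_t* r, uint64_t a, uint64_t b, uint64_t m)
{
    uint64_t d = a - b;
    uint64_t borrow = (uint64_t)0 - (uint64_t)(a < b);  /* all ones if a < b */
    *r = d + (m & borrow);
}

/* Call sites with reduced operands use the same code on this target. */
#define demo_mont_sub_lower demo_mont_sub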
@@ -110298,7 +110294,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -110324,7 +110320,8 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; @@ -110371,7 +110368,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_ /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_12(y, y, x, p384_mod); + sp_384_mont_sub_lower_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -110412,6 +110409,7 @@ typedef struct sp_384_proj_point_add_12_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -110440,6 +110438,10 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*12; ctx->t4 = t + 6*12; ctx->t5 = t + 8*12; + ctx->t6 = t + 10*12; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -110464,29 +110466,6 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -110500,16 +110479,16 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_12(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_12(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -110518,7 +110497,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -110537,29 +110516,29 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - 
sp_384_mont_mul_12(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -110567,24 +110546,24 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_12(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -110592,9 +110571,30 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -110606,24 +110606,13 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -110633,60 +110622,61 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_12(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, x, t5, p384_mod); - sp_384_mont_dbl_12(t1, y, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(y, y, x, p384_mod); - sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_12(t3, y, p384_mod); + sp_384_mont_sub_12(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -110970,6 +110960,8 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } 
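/*
 * Illustrative note: the rewritten point-add routines above carve a sixth
 * double-width temporary out of the scratch area (t6 = t + 10*N) and return
 * the result through t6/t1/t2 instead of through a point aliased onto the
 * scratch buffer; that is why every "2 * N * 5" scratch-digit allocation in
 * these files grows to "2 * N * 6".  The layout below is only a sketch for
 * the 8-limb P-256 case; the struct and its name are illustrative, the real
 * code just offsets into a flat sp_digit array.
 */
#include <stdint.h>

enum { DEMO_P256_LIMBS = 8 };               /* placeholder limb count (P-256) */

typedef struct demo_add_scratch {
    uint32_t t1[2 * DEMO_P256_LIMBS];       /* reused as Y3 in the full add   */
    uint32_t t2[2 * DEMO_P256_LIMBS];       /* reused as Z3 in the full add   */
    uint32_t t3[2 * DEMO_P256_LIMBS];
    uint32_t t4[2 * DEMO_P256_LIMBS];
    uint32_t t5[2 * DEMO_P256_LIMBS];
    uint32_t t6[2 * DEMO_P256_LIMBS];       /* new slot, holds X3             */
} demo_add_scratch;                         /* 2 * 8 * 6 digits in total      */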
#ifdef FP_ECC +#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 +#define sp_384_mont_tpl_lower_12 sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -111008,7 +111000,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -111016,9 +111008,12 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -111028,16 +111023,14 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_mul_12(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -111045,14 +111038,15 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -111102,17 +111096,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -111122,53 +111111,54 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = 
rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_12(t2, t2, x, p384_mod); + sp_384_mont_sub_12(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_12(t4, t4, y, p384_mod); + sp_384_mont_sub_12(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, t1, t5, p384_mod); sp_384_mont_dbl_12(t1, t3, p384_mod); sp_384_mont_sub_12(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_12(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -117386,7 +117376,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*12]; sp_digit u2[2*12]; sp_digit s[2*12]; - sp_digit tmp[2*12 * 5]; + sp_digit tmp[2*12 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -117532,7 +117522,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 12]; + sp_digit u1[18 * 12]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -117551,7 +117541,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 12, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 12, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -117854,7 +117844,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 12 * 5]; + sp_digit tmp[2 * 12 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -117868,7 +117858,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = 
MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -121836,7 +121826,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_17(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -123378,6 +123368,7 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a, ); } +#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 /* Right shift a by 1 bit into r. (r = a >> 1) * * r A single precision integer. @@ -123834,7 +123825,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -123860,7 +123851,8 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*17; @@ -123907,7 +123899,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_ /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_17(y, y, x, p521_mod); + sp_521_mont_sub_lower_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -123950,6 +123942,7 @@ typedef struct sp_521_proj_point_add_17_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -123978,6 +123971,10 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*17; ctx->t4 = t + 6*17; ctx->t5 = t + 8*17; + ctx->t6 = t + 10*17; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -124002,29 +123999,6 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -124038,16 +124012,16 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_17(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_17(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; 
break; case 9: @@ -124056,7 +124030,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -124075,29 +124049,29 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_17(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -124105,24 +124079,24 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_17(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -124130,9 +124104,30 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -124144,24 +124139,13 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_17(sp_point_521* r, const 
sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -124171,60 +124155,61 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_17(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, x, t5, p521_mod); - sp_521_mont_dbl_17(t1, y, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(y, y, x, p521_mod); - sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_17(t3, y, p521_mod); + sp_521_mont_sub_17(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + 
r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -124373,7 +124358,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con sp_digit* tmp = NULL; #else sp_point_521 t[16 + 1]; - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; #endif sp_point_521* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -124407,7 +124392,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -124512,7 +124497,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -124542,6 +124527,8 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC +#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 +#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -124580,7 +124567,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -124588,9 +124575,12 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -124600,16 +124590,14 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_mul_17(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -124617,14 +124605,15 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ 
sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -124674,17 +124663,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -124694,53 +124678,54 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_17(t2, t2, x, p521_mod); + sp_521_mont_sub_17(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_17(t4, t4, y, p521_mod); + sp_521_mont_sub_17(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, t1, t5, p521_mod); sp_521_mont_dbl_17(t1, t3, p521_mod); sp_521_mont_sub_17(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_17(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; 
i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -124948,7 +124933,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -124969,7 +124954,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -125388,7 +125373,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -125409,7 +125394,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -125700,7 +125685,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -125713,7 +125698,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), heap, + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -127793,7 +127778,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -127806,7 +127791,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -134036,7 +134021,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*17]; sp_digit u2[2*17]; sp_digit s[2*17]; - sp_digit tmp[2*17 * 5]; + sp_digit tmp[2*17 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -134185,7 +134170,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 17]; + sp_digit u1[18 * 17]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -134204,7 +134189,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 17, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 17, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -134511,7 +134496,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -134525,7 +134510,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* 
pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -202258,7 +202243,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_32(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -207254,6 +207239,7 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, ); } +#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -208064,7 +208050,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -208090,7 +208076,8 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; @@ -208137,7 +208124,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -208549,6 +208536,7 @@ typedef struct sp_1024_proj_point_add_32_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -208577,6 +208565,10 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*32; ctx->t4 = t + 6*32; ctx->t5 = t + 8*32; + ctx->t6 = t + 10*32; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -208601,29 +208593,6 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -208637,16 +208606,16 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_32(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_32(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t4, 
ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -208655,7 +208624,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -208674,29 +208643,29 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_32(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -208704,24 +208673,24 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_32(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -208729,9 +208698,30 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -208743,24 +208733,13 @@ static 
int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -208770,60 +208749,61 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_32(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, x, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, y, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); - sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_32(t3, y, p1024_mod); + sp_1024_mont_sub_32(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); + + 
maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -208852,7 +208832,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, sp_digit* tmp = NULL; #else sp_point_1024 t[16 + 1]; - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; #endif sp_point_1024* rt = NULL; sp_digit n; @@ -208871,7 +208851,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -208952,7 +208932,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -208971,6 +208951,8 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) +#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 +#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
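The point-addition hunks above (shown here for the 1024-bit curve, and repeated for P-256/P-384/P-521) drop the old rp[]/ap[] pointer tables indexed by the infinity flags in favour of a branch-free masked selection of the result. The sketch below isolates just that selection idiom; the demo_point type, the 4-digit coordinate size, and the function name are illustrative stand-ins, not the wolfSSL sp_point_* definitions, and sp_digit is assumed to be a 32-bit word as in these files.

#include <stdint.h>

typedef uint32_t sp_digit;

/* Illustrative reduced point: 4 digits per coordinate plus an infinity
 * flag (0 or 1); a stand-in for the real sp_point_* types. */
typedef struct {
    sp_digit x[4];
    sp_digit y[4];
    sp_digit z[4];
    int infinity;
} demo_point;

/* Constant-time result selection for point addition:
 *   r = p  when q is the point at infinity,
 *   r = q  when p is the point at infinity,
 *   r = t  (the freshly computed sum) otherwise.
 * No branch depends on the infinity flags. */
static void demo_select_add_result(demo_point* r, const demo_point* p,
                                   const demo_point* q, const demo_point* t)
{
    sp_digit maskp = (sp_digit)0 - (sp_digit)(q->infinity & (!p->infinity));
    sp_digit maskq = (sp_digit)0 - (sp_digit)(p->infinity & (!q->infinity));
    sp_digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < 4; i++) {
        r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (t->x[i] & maskt);
        r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (t->y[i] & maskt);
        r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (t->z[i] & maskt);
    }
    /* When both inputs are at infinity the masks select t; setting Z[0] and
     * the flag keeps the result a valid point at infinity, as in the patch. */
    r->z[0] |= (sp_digit)(p->infinity & q->infinity);
    r->infinity = p->infinity & q->infinity;
}

Because the working coordinates now live in scratch space (x = t6, y = t1, z = t2) rather than in r, the explicit q == r swap at the top of sp_1024_proj_point_add_32 and its siblings could be dropped; it is also why the scratch areas grow from 2 * N * 5 to 2 * N * 6 digits throughout the file.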
@@ -209009,7 +208991,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -209017,9 +208999,12 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -209029,16 +209014,14 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_mul_32(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -209046,14 +209029,15 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -209103,17 +209087,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -209123,53 +209102,54 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = 
ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_32(t2, t2, x, p1024_mod); + sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_32(t4, t4, y, p1024_mod); + sp_1024_mont_sub_32(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, t1, t5, p1024_mod); sp_1024_mont_dbl_32(t1, t3, p1024_mod); sp_1024_mont_sub_32(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -209291,7 +209271,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -209312,7 +209292,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -209477,7 +209457,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -209630,7 +209610,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -209651,7 +209631,7 @@ static int 
sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -209816,7 +209796,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -213564,7 +213544,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[32 + 32 * 2 * 5]; + sp_digit k[32 + 32 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -213577,7 +213557,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (32 + 32 * 2 * 5), + sizeof(sp_digit) * (32 + 32 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 736424df2..8929072fb 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -21439,7 +21439,7 @@ static void sp_256_map_9(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_9(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -21556,6 +21556,7 @@ static void sp_256_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_256_norm_9(r); } +#define sp_256_mont_sub_lower_9 sp_256_mont_sub_9 /* Shift number left one bit. * Bottom bit is lost. * @@ -21711,7 +21712,7 @@ static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -21737,7 +21738,8 @@ static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*9; @@ -21784,7 +21786,7 @@ static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_9(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_9(y, y, x, p256_mod); + sp_256_mont_sub_lower_9(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -21824,6 +21826,7 @@ typedef struct sp_256_proj_point_add_9_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -21852,6 +21855,10 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*9; ctx->t4 = t + 6*9; ctx->t5 = t + 8*9; + ctx->t6 = t + 10*9; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -21876,29 +21883,6 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = 
ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -21912,16 +21896,16 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_9(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_9(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_9(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -21930,7 +21914,7 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_9(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -21949,29 +21933,29 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_9(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_9(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -21979,24 +21963,24 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_9(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_9(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state 
= 26; break; case 26: @@ -22004,9 +21988,30 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -22018,24 +22023,13 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_9(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_9(t1, p256_mod, q->y); @@ -22045,60 +22039,61 @@ static void sp_256_proj_point_add_9(sp_point_256* r, sp_256_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_9(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_9(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_9(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_9(z, p->z, t2, 
p256_mod, p256_mp_mod); + sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_9(x, x, t5, p256_mod); - sp_256_mont_dbl_9(t1, y, p256_mod); - sp_256_mont_sub_9(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_9(y, y, x, p256_mod); - sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_dbl_9(t3, y, p256_mod); + sp_256_mont_sub_9(x, x, t3, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_9(y, y, x, p256_mod); + sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_9(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -22251,7 +22246,7 @@ typedef struct sp_256_ecc_mulmod_9_ctx { sp_256_proj_point_add_9_ctx add_ctx; }; sp_point_256 t[3]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_digit n; int i; int c; @@ -22365,7 +22360,7 @@ static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 t[3]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; #endif sp_digit n; int i; @@ -22383,7 +22378,7 @@ static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -22442,7 +22437,7 @@ static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -22508,6 +22503,8 @@ static void sp_256_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } +#define sp_256_mont_dbl_lower_9 sp_256_mont_dbl_9 +#define sp_256_mont_tpl_lower_9 sp_256_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
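In the repeated-doubling hunks (P-521 and the 1024-bit curve above, P-256 below), the Y update y = 2*A*(B - X) - Y^4 is regrouped: (B - X) is doubled first via the new *_lower helpers and the single Montgomery multiplication by A happens afterwards, instead of multiplying and then doubling. A small stand-alone check of the algebraic equivalence over a toy prime field (plain C with % reduction, purely illustrative; the prime and all values are arbitrary stand-ins, not the Montgomery-form wolfSSL primitives):

#include <stdio.h>
#include <stdint.h>

#define P 0xFFFFFFFBu   /* 2^32 - 5, a prime stand-in for the curve prime */

static uint32_t addm(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + b) % P); }
static uint32_t subm(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + P - b) % P); }
static uint32_t mulm(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a * b) % P); }

int main(void)
{
    uint32_t A  = 123456789u % P, B  = 987654321u % P;
    uint32_t X  = 192837465u % P, Y4 = 564738291u % P;

    /* Old grouping: y = 2*(A*(B - X)) - Y^4 */
    uint32_t t     = mulm(A, subm(B, X));
    uint32_t y_old = subm(addm(t, t), Y4);

    /* New grouping (as in the patch): b = 2*(B - X); y = b*A - Y^4 */
    uint32_t b     = addm(subm(B, X), subm(B, X));
    uint32_t y_new = subm(mulm(b, A), Y4);

    printf("%s\n", (y_old == y_new) ? "equal" : "different");
    return 0;
}

The *_lower aliases themselves (sp_256_mont_sub_lower_9, sp_256_mont_dbl_lower_9, sp_256_mont_tpl_lower_9, and the P-384/P-521/1024-bit equivalents) are defined in this patch as plain macros for the existing routines, so the regrouping changes the order of operations rather than the arithmetic they perform.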
@@ -22546,7 +22543,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_9(a, t1, p256_mod); + sp_256_mont_tpl_lower_9(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); @@ -22554,9 +22551,12 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int n, sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_9(t2, b, p256_mod); sp_256_mont_sub_9(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_9(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -22566,16 +22566,14 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int n, sp_256_mont_mul_9(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_9(y, b, x, p256_mod); - sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_9(y, y, p256_mod); + sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_9(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_9(a, t1, p256_mod); + sp_256_mont_tpl_lower_9(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); @@ -22583,14 +22581,15 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int n, sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_9(t2, b, p256_mod); sp_256_mont_sub_9(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_9(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_9(y, b, x, p256_mod); - sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_9(y, y, p256_mod); + sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_9(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -22640,29 +22639,30 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_9(a, t1, p256_mod); + sp_256_mont_tpl_lower_9(a, t1, p256_mod); /* B = X*Y^2 */ - sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(b, t2, x, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_9(t1, b, p256_mod); - sp_256_mont_sub_9(x, x, t1, p256_mod); + sp_256_mont_dbl_9(t2, b, p256_mod); + sp_256_mont_sub_9(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_9(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_256_mont_sqr_9(t2, t2, p256_mod, p256_mp_mod); + /* t1 = Y^4 */ + sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_256_mont_mul_9(w, w, t2, p256_mod, p256_mp_mod); + 
sp_256_mont_mul_9(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_9(y, b, x, p256_mod); - sp_256_mont_mul_9(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_9(y, y, p256_mod); - sp_256_mont_sub_9(y, y, t2, p256_mod); + sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(y, y, t1, p256_mod); /* Y = Y/2 */ sp_256_div2_9(r[j].y, y, p256_mod); @@ -22688,30 +22688,30 @@ static void sp_256_proj_point_add_sub_9(sp_point_256* ra, sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; sp_digit* t6 = t + 10*9; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, xa, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t2, za, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, za, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_9(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, t3, ya, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ @@ -22722,30 +22722,30 @@ static void sp_256_proj_point_add_sub_9(sp_point_256* ra, sp_256_mont_sub_9(t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_256_mont_mul_9(za, za, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(za, za, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(xa, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_9(xs, t6, p256_mod, p256_mp_mod); sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(ya, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_9(x, x, t5, p256_mod); + sp_256_mont_sub_9(xa, xa, t5, p256_mod); sp_256_mont_sub_9(xs, xs, t5, p256_mod); - sp_256_mont_dbl_9(t1, y, p256_mod); - sp_256_mont_sub_9(x, x, t1, p256_mod); + sp_256_mont_dbl_9(t1, ya, p256_mod); + sp_256_mont_sub_9(xa, xa, t1, p256_mod); sp_256_mont_sub_9(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_9(ys, y, xs, p256_mod); - sp_256_mont_sub_9(y, y, x, p256_mod); - sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_lower_9(ys, ya, xs, p256_mod); + sp_256_mont_sub_lower_9(ya, ya, xa, p256_mod); + sp_256_mont_mul_9(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_9(t6, p256_mod, t6); sp_256_mont_mul_9(ys, ys, t6, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_9(y, y, t5, p256_mod); + 
sp_256_mont_sub_9(ya, ya, t5, p256_mod); sp_256_mont_sub_9(ys, ys, t5, p256_mod); } @@ -23051,17 +23051,12 @@ static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256* static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; /* Check double */ (void)sp_256_sub_9(t1, p256_mod, q->y); @@ -23071,53 +23066,54 @@ static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_9(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_9(t2, t2, x, p256_mod); + sp_256_mont_sub_9(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_9(t4, t4, y, p256_mod); + sp_256_mont_sub_9(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_9(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_9(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_9(x, t1, t5, p256_mod); sp_256_mont_dbl_9(t1, t3, p256_mod); sp_256_mont_sub_9(x, x, t1, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_9(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_9(t3, t3, x, p256_mod); sp_256_mont_mul_9(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_9(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -23314,7 +23310,7 @@ static int sp_256_ecc_mulmod_stripe_9(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 9 
* 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_256* p = NULL; int i; @@ -23335,7 +23331,7 @@ static int sp_256_ecc_mulmod_stripe_9(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -23515,7 +23511,7 @@ static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -23626,7 +23622,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -23639,7 +23635,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), heap, + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -25084,7 +25080,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -25097,7 +25093,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -26466,7 +26462,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*9]; sp_digit u2[2*9]; sp_digit s[2*9]; - sp_digit tmp[2*9 * 5]; + sp_digit tmp[2*9 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -26612,7 +26608,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 9]; + sp_digit u1[18 * 9]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -26631,7 +26627,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 9, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 9, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -26934,7 +26930,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -26948,7 +26944,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -28688,7 +28684,7 @@ static void sp_384_map_15(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_15(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -28811,6 +28807,7 @@ static void sp_384_mont_sub_15(sp_digit* r, const sp_digit* a, const sp_digit* b sp_384_norm_15(r); } +#define sp_384_mont_sub_lower_15 sp_384_mont_sub_15 /* Shift number left one bit. 
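Each scratch buffer touched above grows from 2 * N * 5 to 2 * N * 6 digits because the rewritten point-add code slices a sixth temporary out of the same allocation (t6 = t + 10*N). A minimal sketch of that layout, with a purely illustrative digit count and plain unsigned ints standing in for sp_digit:

#include <stdio.h>

#define N 9                      /* digits per field element; illustrative only */

int main(void)
{
    unsigned int t[2 * N * 6];   /* was 2 * N * 5 before t6 existed */
    unsigned int* t1 = t + 0 * N;
    unsigned int* t2 = t + 2 * N;
    unsigned int* t3 = t + 4 * N;
    unsigned int* t4 = t + 6 * N;
    unsigned int* t5 = t + 8 * N;
    unsigned int* t6 = t + 10 * N;   /* the new sixth temporary */

    (void)t1; (void)t2; (void)t3; (void)t4; (void)t5;
    /* t6 plus its own 2*N digits ends exactly at the end of the buffer. */
    printf("%d\n", (int)((t6 + 2 * N) - t) == 2 * N * 6);
    return 0;
}

The 16 * 9 to 18 * 9 growth of u1 in sp_ecc_verify_256 follows from the same layout: u1, u2 and s take 2 * 9 digits each and the shared tmp area is now 2 * 9 * 6, matching the fields listed in the sp_ecc_verify_256_ctx struct above.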
* Bottom bit is lost. * @@ -28972,7 +28969,7 @@ static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -28998,7 +28995,8 @@ static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*15; @@ -29045,7 +29043,7 @@ static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p, sp_ /* X = X - Y */ sp_384_mont_sub_15(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_15(y, y, x, p384_mod); + sp_384_mont_sub_lower_15(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_15(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -29087,6 +29085,7 @@ typedef struct sp_384_proj_point_add_15_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -29115,6 +29114,10 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*15; ctx->t4 = t + 6*15; ctx->t5 = t + 8*15; + ctx->t6 = t + 10*15; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -29139,29 +29142,6 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<15; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<15; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<15; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -29175,16 +29155,16 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_15(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_15(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_15(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -29193,7 +29173,7 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_15(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -29212,29 +29192,29 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_15(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_15(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: 
- sp_384_mont_mul_15(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_15(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_15(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_15(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_15(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -29242,24 +29222,24 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_15(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_15(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -29267,9 +29247,30 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -29281,24 +29282,13 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_15(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*15; sp_digit* t3 = t + 4*15; sp_digit* t4 = t + 6*15; sp_digit* t5 = t + 8*15; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*15; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_15(t1, p384_mod, q->y); @@ -29308,60 +29298,61 @@ static void sp_384_proj_point_add_15(sp_point_384* r, sp_384_proj_point_dbl_15(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<15; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<15; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<15; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_15(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_15(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_15(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_15(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_15(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_15(x, x, t5, p384_mod); - sp_384_mont_dbl_15(t1, y, p384_mod); - sp_384_mont_sub_15(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_15(y, y, x, p384_mod); - sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_15(t3, y, p384_mod); + sp_384_mont_sub_15(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_15(y, y, x, p384_mod); + sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_15(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -29827,6 +29818,8 @@ static void sp_384_cond_copy_15(sp_digit* r, const sp_digit* a, const sp_digit m #endif /* 
WOLFSSL_SP_SMALL */ } +#define sp_384_mont_dbl_lower_15 sp_384_mont_dbl_15 +#define sp_384_mont_tpl_lower_15 sp_384_mont_tpl_15 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -29865,7 +29858,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_15(a, t1, p384_mod); + sp_384_mont_tpl_lower_15(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); @@ -29873,9 +29866,12 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int n, sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_15(t2, b, p384_mod); sp_384_mont_sub_15(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_15(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_15(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -29885,16 +29881,14 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int n, sp_384_mont_mul_15(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_15(y, b, x, p384_mod); - sp_384_mont_mul_15(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_15(y, y, p384_mod); + sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_15(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_15(a, t1, p384_mod); + sp_384_mont_tpl_lower_15(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); @@ -29902,14 +29896,15 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int n, sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_15(t2, b, p384_mod); sp_384_mont_sub_15(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_15(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_15(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_15(y, b, x, p384_mod); - sp_384_mont_mul_15(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_15(y, y, p384_mod); + sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_15(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -29959,29 +29954,30 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_15(a, t1, p384_mod); + sp_384_mont_tpl_lower_15(a, t1, p384_mod); /* B = X*Y^2 */ - sp_384_mont_sqr_15(t2, y, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(b, t2, x, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_15(t1, b, p384_mod); - sp_384_mont_sub_15(x, x, t1, p384_mod); + sp_384_mont_dbl_15(t2, b, p384_mod); + sp_384_mont_sub_15(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_15(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_15(b, t2, p384_mod); /* Z = Z*Y */ 
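In the repeated-doubling hunks above, y = 2*A*(B - X) - Y^4 is now computed by first forming b = 2*(B - X) with the cheap doubling helper and then doing a single Montgomery multiplication, instead of multiplying A*(B - X) and doubling the product afterwards; modulo the prime the two orderings give the same value. A throwaway check over a toy prime (arbitrary numbers, not the wolfSSL routines) makes the algebra explicit:

#include <stdio.h>
#include <stdint.h>

/* Toy modular subtraction; p is a small prime, nothing like an ECC modulus. */
static uint64_t sub_mod(uint64_t a, uint64_t b, uint64_t p)
{
    return (a + p - b) % p;
}

int main(void)
{
    const uint64_t p = 1000003;
    uint64_t A = 123456, B = 654321, X = 777, Y4 = 424242;

    /* Old order: y = 2*(A*(B - X)) - Y^4 */
    uint64_t old_y = sub_mod((2 * ((A * sub_mod(B, X, p)) % p)) % p, Y4, p);
    /* New order: b = 2*(B - X); y = b*A - Y^4 */
    uint64_t b2    = (2 * sub_mod(B, X, p)) % p;
    uint64_t new_y = sub_mod((b2 * A) % p, Y4, p);

    printf("%s\n", (old_y == new_y) ? "match" : "mismatch");   /* prints match */
    return 0;
}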
sp_384_mont_mul_15(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_384_mont_sqr_15(t2, t2, p384_mod, p384_mp_mod); + /* t1 = Y^4 */ + sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_384_mont_mul_15(w, w, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_15(y, b, x, p384_mod); - sp_384_mont_mul_15(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_15(y, y, p384_mod); - sp_384_mont_sub_15(y, y, t2, p384_mod); + sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod); + sp_384_mont_sub_15(y, y, t1, p384_mod); /* Y = Y/2 */ sp_384_div2_15(r[j].y, y, p384_mod); @@ -30007,30 +30003,30 @@ static void sp_384_proj_point_add_sub_15(sp_point_384* ra, sp_digit* t4 = t + 6*15; sp_digit* t5 = t + 8*15; sp_digit* t6 = t + 10*15; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t1, t1, xa, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_15(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(t2, za, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t4, t2, za, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_15(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t3, t3, ya, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ @@ -30041,30 +30037,30 @@ static void sp_384_proj_point_add_sub_15(sp_point_384* ra, sp_384_mont_sub_15(t4, t4, t3, p384_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(z, z, t2, p384_mod, p384_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_384_mont_mul_15(za, za, q->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(za, za, t2, p384_mod, p384_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(xa, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_15(xs, t6, p384_mod, p384_mp_mod); sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(y, t1, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(ya, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_15(x, x, t5, p384_mod); + sp_384_mont_sub_15(xa, xa, t5, p384_mod); sp_384_mont_sub_15(xs, xs, t5, p384_mod); - sp_384_mont_dbl_15(t1, y, p384_mod); - sp_384_mont_sub_15(x, x, t1, p384_mod); + sp_384_mont_dbl_15(t1, ya, p384_mod); + sp_384_mont_sub_15(xa, xa, t1, p384_mod); sp_384_mont_sub_15(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_15(ys, y, xs, p384_mod); - sp_384_mont_sub_15(y, y, x, p384_mod); - sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod); + 
sp_384_mont_sub_lower_15(ys, ya, xs, p384_mod); + sp_384_mont_sub_lower_15(ya, ya, xa, p384_mod); + sp_384_mont_mul_15(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_15(t6, p384_mod, t6); sp_384_mont_mul_15(ys, ys, t6, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod); - sp_384_mont_sub_15(y, y, t5, p384_mod); + sp_384_mont_sub_15(ya, ya, t5, p384_mod); sp_384_mont_sub_15(ys, ys, t5, p384_mod); } @@ -30406,17 +30402,12 @@ static int sp_384_ecc_mulmod_win_add_sub_15(sp_point_384* r, const sp_point_384* static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*15; sp_digit* t3 = t + 4*15; sp_digit* t4 = t + 6*15; sp_digit* t5 = t + 8*15; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*15; /* Check double */ (void)sp_384_sub_15(t1, p384_mod, q->y); @@ -30426,53 +30417,54 @@ static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_15(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<15; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<15; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<15; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_15(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_15(t2, t2, x, p384_mod); + sp_384_mont_sub_15(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_15(t4, t4, y, p384_mod); + sp_384_mont_sub_15(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_15(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_15(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_15(x, t1, t5, p384_mod); sp_384_mont_dbl_15(t1, t3, p384_mod); sp_384_mont_sub_15(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_15(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_15(t3, t3, x, p384_mod); sp_384_mont_mul_15(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_15(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] 
& maskq) | (y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -34376,7 +34368,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*15]; sp_digit u2[2*15]; sp_digit s[2*15]; - sp_digit tmp[2*15 * 5]; + sp_digit tmp[2*15 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -34522,7 +34514,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 15]; + sp_digit u1[18 * 15]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -34541,7 +34533,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 15, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 15, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -34844,7 +34836,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 15 * 5]; + sp_digit tmp[2 * 15 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -34858,7 +34850,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -36195,7 +36187,7 @@ static void sp_521_map_21(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_21(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -36320,6 +36312,7 @@ static void sp_521_mont_sub_21(sp_digit* r, const sp_digit* a, const sp_digit* b sp_521_norm_21(r); } +#define sp_521_mont_sub_lower_21 sp_521_mont_sub_21 /* Shift number left one bit. * Bottom bit is lost. 
* @@ -36487,7 +36480,7 @@ static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_21(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -36513,7 +36506,8 @@ static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*21; @@ -36560,7 +36554,7 @@ static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p, sp_ /* X = X - Y */ sp_521_mont_sub_21(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_21(y, y, x, p521_mod); + sp_521_mont_sub_lower_21(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_21(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -36604,6 +36598,7 @@ typedef struct sp_521_proj_point_add_21_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -36632,6 +36627,10 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*21; ctx->t4 = t + 6*21; ctx->t5 = t + 8*21; + ctx->t6 = t + 10*21; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -36656,29 +36655,6 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<21; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<21; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<21; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -36692,16 +36668,16 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_21(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_21(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_21(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -36710,7 +36686,7 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_21(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -36729,29 +36705,29 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_21(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_21(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - 
sp_521_mont_mul_21(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_21(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_21(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_21(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_21(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -36759,24 +36735,24 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_21(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_21(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_21(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_21(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -36784,9 +36760,30 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -36798,24 +36795,13 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_21(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*21; sp_digit* t3 = t + 4*21; sp_digit* t4 = t + 6*21; sp_digit* t5 = t + 8*21; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*21; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_21(t1, p521_mod, q->y); @@ -36825,60 +36811,61 @@ static void sp_521_proj_point_add_21(sp_point_521* r, sp_521_proj_point_dbl_21(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<21; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<21; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<21; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_21(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_21(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_21(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_21(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_21(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_21(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_21(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_21(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_21(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_21(x, x, t5, p521_mod); - sp_521_mont_dbl_21(t1, y, p521_mod); - sp_521_mont_sub_21(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_21(y, y, x, p521_mod); - sp_521_mont_mul_21(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_21(t3, y, p521_mod); + sp_521_mont_sub_21(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_21(y, y, x, p521_mod); + sp_521_mont_mul_21(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_21(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -36925,7 +36912,7 @@ typedef struct sp_521_ecc_mulmod_21_ctx { sp_521_proj_point_add_21_ctx add_ctx; }; 
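The point-add rewrites above drop the old rp[]/ap[] pointer tables indexed by the infinity flags and instead assemble the result with all-ones or all-zero word masks, so the final copy performs the same loads and stores no matter which operand ends up in r. A standalone sketch of just that selection step, with a hypothetical word type and limb count rather than the generated code:

#include <stdio.h>
#include <stdint.h>

#define WORDS 4   /* illustrative limb count */

/* r = p if q is at infinity, r = q if p is at infinity, otherwise r = t,
 * chosen without branching on the flags (inf_p and inf_q are 0 or 1). */
static void select_point(uint32_t* r, const uint32_t* p, const uint32_t* q,
                         const uint32_t* t, int inf_p, int inf_q)
{
    uint32_t maskp = 0 - (uint32_t)(inf_q & (!inf_p));
    uint32_t maskq = 0 - (uint32_t)(inf_p & (!inf_q));
    uint32_t maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < WORDS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

int main(void)
{
    const uint32_t p[WORDS] = {1, 1, 1, 1};
    const uint32_t q[WORDS] = {2, 2, 2, 2};
    const uint32_t t[WORDS] = {3, 3, 3, 3};
    uint32_t r[WORDS];

    select_point(r, p, q, t, 0, 1);   /* q at infinity: copy p */
    printf("%u\n", r[0]);             /* prints 1 */
    select_point(r, p, q, t, 0, 0);   /* ordinary case: copy the computed t */
    printf("%u\n", r[0]);             /* prints 3 */
    return 0;
}

When both inputs are at infinity neither maskp nor maskq is set, so the loop copies the scratch result; that is why the generated code also sets r->z[0] |= p->infinity & q->infinity and r->infinity afterwards, keeping r a valid representation of the point at infinity.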
sp_point_521 t[3]; - sp_digit tmp[2 * 21 * 5]; + sp_digit tmp[2 * 21 * 6]; sp_digit n; int i; int c; @@ -37039,7 +37026,7 @@ static int sp_521_ecc_mulmod_21(sp_point_521* r, const sp_point_521* g, sp_digit* tmp = NULL; #else sp_point_521 t[3]; - sp_digit tmp[2 * 21 * 5]; + sp_digit tmp[2 * 21 * 6]; #endif sp_digit n; int i; @@ -37057,7 +37044,7 @@ static int sp_521_ecc_mulmod_21(sp_point_521* r, const sp_point_521* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -37116,7 +37103,7 @@ static int sp_521_ecc_mulmod_21(sp_point_521* r, const sp_point_521* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 21 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 21 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -37206,6 +37193,8 @@ static void sp_521_cond_copy_21(sp_digit* r, const sp_digit* a, const sp_digit m #endif /* WOLFSSL_SP_SMALL */ } +#define sp_521_mont_dbl_lower_21 sp_521_mont_dbl_21 +#define sp_521_mont_tpl_lower_21 sp_521_mont_tpl_21 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -37244,7 +37233,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_21(a, t1, p521_mod); + sp_521_mont_tpl_lower_21(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); @@ -37252,9 +37241,12 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int n, sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_21(t2, b, p521_mod); sp_521_mont_sub_21(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_21(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_21(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -37264,16 +37256,14 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int n, sp_521_mont_mul_21(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_21(y, b, x, p521_mod); - sp_521_mont_mul_21(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_21(y, y, p521_mod); + sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_21(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_21(a, t1, p521_mod); + sp_521_mont_tpl_lower_21(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); @@ -37281,14 +37271,15 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int n, sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_21(t2, b, p521_mod); sp_521_mont_sub_21(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_21(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_21(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_21(y, b, x, p521_mod); - 
sp_521_mont_mul_21(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_21(y, y, p521_mod); + sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_21(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -37338,29 +37329,30 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_21(a, t1, p521_mod); + sp_521_mont_tpl_lower_21(a, t1, p521_mod); /* B = X*Y^2 */ - sp_521_mont_sqr_21(t2, y, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(b, t2, x, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_21(t1, b, p521_mod); - sp_521_mont_sub_21(x, x, t1, p521_mod); + sp_521_mont_dbl_21(t2, b, p521_mod); + sp_521_mont_sub_21(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_21(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_521_mont_sqr_21(t2, t2, p521_mod, p521_mp_mod); + /* t1 = Y^4 */ + sp_521_mont_sqr_21(t1, t1, p521_mod, p521_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_521_mont_mul_21(w, w, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_21(y, b, x, p521_mod); - sp_521_mont_mul_21(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_21(y, y, p521_mod); - sp_521_mont_sub_21(y, y, t2, p521_mod); + sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod); + sp_521_mont_sub_21(y, y, t1, p521_mod); /* Y = Y/2 */ sp_521_div2_21(r[j].y, y, p521_mod); @@ -37386,30 +37378,30 @@ static void sp_521_proj_point_add_sub_21(sp_point_521* ra, sp_digit* t4 = t + 6*21; sp_digit* t5 = t + 8*21; sp_digit* t6 = t + 10*21; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_21(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t1, t1, xa, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_21(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(t2, za, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t4, t2, za, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_21(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t3, t3, ya, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ @@ -37420,30 +37412,30 @@ static void sp_521_proj_point_add_sub_21(sp_point_521* ra, sp_521_mont_sub_21(t4, t4, t3, p521_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_521_mont_mul_21(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(z, z, t2, p521_mod, p521_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_521_mont_mul_21(za, za, q->z, p521_mod, 
p521_mp_mod); + sp_521_mont_mul_21(za, za, t2, p521_mod, p521_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_21(x, t4, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(xa, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_21(xs, t6, p521_mod, p521_mp_mod); sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(y, t1, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(ya, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_21(x, x, t5, p521_mod); + sp_521_mont_sub_21(xa, xa, t5, p521_mod); sp_521_mont_sub_21(xs, xs, t5, p521_mod); - sp_521_mont_dbl_21(t1, y, p521_mod); - sp_521_mont_sub_21(x, x, t1, p521_mod); + sp_521_mont_dbl_21(t1, ya, p521_mod); + sp_521_mont_sub_21(xa, xa, t1, p521_mod); sp_521_mont_sub_21(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_21(ys, y, xs, p521_mod); - sp_521_mont_sub_21(y, y, x, p521_mod); - sp_521_mont_mul_21(y, y, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_lower_21(ys, ya, xs, p521_mod); + sp_521_mont_sub_lower_21(ya, ya, xa, p521_mod); + sp_521_mont_mul_21(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_21(t6, p521_mod, t6); sp_521_mont_mul_21(ys, ys, t6, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t5, t5, t3, p521_mod, p521_mp_mod); - sp_521_mont_sub_21(y, y, t5, p521_mod); + sp_521_mont_sub_21(ya, ya, t5, p521_mod); sp_521_mont_sub_21(ys, ys, t5, p521_mod); } @@ -37821,17 +37813,12 @@ static int sp_521_ecc_mulmod_win_add_sub_21(sp_point_521* r, const sp_point_521* static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*21; sp_digit* t3 = t + 4*21; sp_digit* t4 = t + 6*21; sp_digit* t5 = t + 8*21; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*21; /* Check double */ (void)sp_521_sub_21(t1, p521_mod, q->y); @@ -37841,53 +37828,54 @@ static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_21(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<21; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<21; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<21; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_21(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_21(t2, t2, x, p521_mod); + sp_521_mont_sub_21(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_21(t4, t4, y, p521_mod); + sp_521_mont_sub_21(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_21(z, z, t2, p521_mod, p521_mp_mod); + 
sp_521_mont_mul_21(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_21(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_21(x, t1, t5, p521_mod); sp_521_mont_dbl_21(t1, t3, p521_mod); sp_521_mont_sub_21(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_21(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_21(t3, t3, x, p521_mod); sp_521_mont_mul_21(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_21(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -38132,7 +38120,7 @@ static int sp_521_ecc_mulmod_stripe_21(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 21 * 5]; + sp_digit t[2 * 21 * 6]; #endif sp_point_521* p = NULL; int i; @@ -38153,7 +38141,7 @@ static int sp_521_ecc_mulmod_stripe_21(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -38444,7 +38432,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[21 + 21 * 2 * 5]; + sp_digit k[21 + 21 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -38457,7 +38445,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (21 + 21 * 2 * 5), heap, + sizeof(sp_digit) * (21 + 21 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -40924,7 +40912,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[21 + 21 * 2 * 5]; + sp_digit k[21 + 21 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -40937,7 +40925,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (21 + 21 * 2 * 5), + sizeof(sp_digit) * (21 + 21 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -42352,7 +42340,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*21]; sp_digit u2[2*21]; sp_digit s[2*21]; - sp_digit tmp[2*21 * 5]; + sp_digit tmp[2*21 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -42502,7 +42490,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 21]; + sp_digit u1[18 * 21]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -42521,7 +42509,7 @@ int sp_ecc_verify_521(const byte* hash, word32 
hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 21, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 21, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -42829,7 +42817,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 21 * 5]; + sp_digit tmp[2 * 21 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -42843,7 +42831,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 21 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -44805,7 +44793,7 @@ static void sp_1024_map_42(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_42(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -44878,6 +44866,7 @@ static void sp_1024_mont_sub_42(sp_digit* r, const sp_digit* a, const sp_digit* sp_1024_norm_42(r); } +#define sp_1024_mont_sub_lower_42 sp_1024_mont_sub_42 /* Shift number left one bit. * Bottom bit is lost. * @@ -45066,7 +45055,7 @@ static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -45092,7 +45081,8 @@ static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*42; @@ -45139,7 +45129,7 @@ static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_42(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_42(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_42(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_42(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -45190,6 +45180,7 @@ typedef struct sp_1024_proj_point_add_42_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -45218,6 +45209,10 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*42; ctx->t4 = t + 6*42; ctx->t5 = t + 8*42; + ctx->t6 = t + 10*42; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -45242,29 +45237,6 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<42; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<42; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<42; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -45278,16 +45250,16 @@ static int 
sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_42(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_42(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_42(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -45296,7 +45268,7 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_42(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -45315,29 +45287,29 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_42(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_42(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_42(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_42(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_42(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_42(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_42(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -45345,24 +45317,24 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_42(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_42(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -45370,9 +45342,30 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = 
~(maskp | maskq); + for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -45384,24 +45377,13 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_42(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*42; sp_digit* t3 = t + 4*42; sp_digit* t4 = t + 6*42; sp_digit* t5 = t + 8*42; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*42; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_42(t1, p1024_mod, q->y, p1024_mod); @@ -45411,60 +45393,61 @@ static void sp_1024_proj_point_add_42(sp_point_1024* r, sp_1024_proj_point_dbl_42(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<42; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<42; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<42; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_42(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_42(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_42(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_42(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_42(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_42(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_42(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_42(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(x, x, t5, p1024_mod); - 
sp_1024_mont_dbl_42(t1, y, p1024_mod); - sp_1024_mont_sub_42(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_42(y, y, x, p1024_mod); - sp_1024_mont_mul_42(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_42(t3, y, p1024_mod); + sp_1024_mont_sub_42(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_42(y, y, x, p1024_mod); + sp_1024_mont_mul_42(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -45493,7 +45476,7 @@ typedef struct sp_1024_ecc_mulmod_42_ctx { sp_1024_proj_point_add_42_ctx add_ctx; }; sp_point_1024 t[3]; - sp_digit tmp[2 * 42 * 5]; + sp_digit tmp[2 * 42 * 6]; sp_digit n; int i; int c; @@ -45607,7 +45590,7 @@ static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g, sp_digit* tmp = NULL; #else sp_point_1024 t[3]; - sp_digit tmp[2 * 42 * 5]; + sp_digit tmp[2 * 42 * 6]; #endif sp_digit n; int i; @@ -45625,7 +45608,7 @@ static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 42 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 42 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -45684,7 +45667,7 @@ static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 42 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 42 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -45816,6 +45799,8 @@ static void sp_1024_cond_copy_42(sp_digit* r, const sp_digit* a, const sp_digit #endif /* WOLFSSL_SP_SMALL */ } +#define sp_1024_mont_dbl_lower_42 sp_1024_mont_dbl_42 +#define sp_1024_mont_tpl_lower_42 sp_1024_mont_tpl_42 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
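The hunks above replace the old rp[]/ap[] pointer-table handling of the point at infinity with branch-free mask selection, and they compute the candidate result into scratch space (x = t6, y = t1, z = t2) instead of writing through r, which is why each temporary buffer grows from 2*N*5 to 2*N*6 digits. The standalone sketch below shows only that selection idiom; it is illustrative, not code from this patch, and the names demo_point, SP_WORDS and demo_point_select, as well as the 64-bit sp_digit typedef and the 5-word field size, are assumptions chosen for the example.

#include <stdint.h>

typedef uint64_t sp_digit;   /* illustrative stand-in for wolfSSL's sp_digit */
#define SP_WORDS 5           /* illustrative word count (e.g. 5 x 64-bit words) */

typedef struct {
    sp_digit x[SP_WORDS];
    sp_digit y[SP_WORDS];
    sp_digit z[SP_WORDS];
    int infinity;            /* 1 if the point is the point at infinity */
} demo_point;

/* Constant-time selection of the addition result, mirroring the mask logic
 * added at the end of the projective point-add functions in this patch:
 *   - if only p is at infinity, the result is q,
 *   - if only q is at infinity, the result is p,
 *   - otherwise the freshly computed (x, y, z) is used.
 * No secret-dependent branch or table index is taken. */
static void demo_point_select(demo_point* r, const demo_point* p,
    const demo_point* q, const sp_digit* x, const sp_digit* y,
    const sp_digit* z)
{
    /* 0 - 1 wraps to an all-ones mask; 0 - 0 is the all-zeros mask. */
    sp_digit maskp = 0 - (sp_digit)(q->infinity & (!p->infinity));
    sp_digit maskq = 0 - (sp_digit)(p->infinity & (!q->infinity));
    sp_digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < SP_WORDS; i++) {
        r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
    }
    for (i = 0; i < SP_WORDS; i++) {
        r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
    }
    for (i = 0; i < SP_WORDS; i++) {
        r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
    }
    /* If both inputs are at infinity, the computed coordinates are unused:
     * force Z non-zero and flag the result as infinity, as the patch does
     * with r->z[0] |= ... and r->infinity = ... */
    r->z[0] |= (sp_digit)(p->infinity & q->infinity);
    r->infinity = p->infinity & q->infinity;
}

When exactly one input is the point at infinity, its mask is all ones and the other point's coordinates pass through unchanged; when neither is, maskt selects the computed coordinates, so the infinity handling no longer depends on data-dependent control flow or array indexing.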
@@ -45854,7 +45839,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_42(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45862,9 +45847,12 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int n, sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_42(t2, b, p1024_mod); sp_1024_mont_sub_42(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_42(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -45874,16 +45862,14 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int n, sp_1024_mont_mul_42(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_42(y, b, x, p1024_mod); - sp_1024_mont_mul_42(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_42(y, y, p1024_mod); + sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_42(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45891,14 +45877,15 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int n, sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_42(t2, b, p1024_mod); sp_1024_mont_sub_42(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_42(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_42(y, b, x, p1024_mod); - sp_1024_mont_mul_42(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_42(y, y, p1024_mod); + sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -45948,29 +45935,30 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_42(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); /* B = X*Y^2 */ - sp_1024_mont_sqr_42(t2, y, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(b, t2, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_42(t1, b, p1024_mod); - sp_1024_mont_sub_42(x, x, t1, p1024_mod); + sp_1024_mont_dbl_42(t2, b, p1024_mod); + sp_1024_mont_sub_42(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - 
sp_1024_mont_sqr_42(t2, t2, p1024_mod, p1024_mp_mod); + /* t1 = Y^4 */ + sp_1024_mont_sqr_42(t1, t1, p1024_mod, p1024_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_1024_mont_mul_42(w, w, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_42(y, b, x, p1024_mod); - sp_1024_mont_mul_42(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_42(y, y, p1024_mod); - sp_1024_mont_sub_42(y, y, t2, p1024_mod); + sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_42(y, y, t1, p1024_mod); /* Y = Y/2 */ sp_1024_div2_42(r[j].y, y, p1024_mod); @@ -45996,30 +45984,30 @@ static void sp_1024_proj_point_add_sub_42(sp_point_1024* ra, sp_digit* t4 = t + 6*42; sp_digit* t5 = t + 8*42; sp_digit* t6 = t + 10*42; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_42(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t1, t1, xa, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_42(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(t2, za, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t4, t2, za, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_42(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t3, t3, ya, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ @@ -46030,30 +46018,30 @@ static void sp_1024_proj_point_add_sub_42(sp_point_1024* ra, sp_1024_mont_sub_42(t4, t4, t3, p1024_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_1024_mont_mul_42(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(z, z, t2, p1024_mod, p1024_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_1024_mont_mul_42(za, za, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(za, za, t2, p1024_mod, p1024_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_42(x, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(xa, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_42(xs, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(y, t1, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(ya, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_42(x, x, t5, p1024_mod); + sp_1024_mont_sub_42(xa, xa, t5, p1024_mod); sp_1024_mont_sub_42(xs, xs, t5, p1024_mod); - sp_1024_mont_dbl_42(t1, y, p1024_mod); - sp_1024_mont_sub_42(x, x, t1, p1024_mod); + sp_1024_mont_dbl_42(t1, ya, p1024_mod); + sp_1024_mont_sub_42(xa, xa, t1, p1024_mod); sp_1024_mont_sub_42(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_42(ys, y, xs, p1024_mod); - sp_1024_mont_sub_42(y, y, x, p1024_mod); - sp_1024_mont_mul_42(y, 
y, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_lower_42(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_lower_42(ya, ya, xa, p1024_mod); + sp_1024_mont_mul_42(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_42(ys, ys, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t5, t5, t3, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_42(y, y, t5, p1024_mod); + sp_1024_mont_sub_42(ya, ya, t5, p1024_mod); sp_1024_mont_sub_42(ys, ys, t5, p1024_mod); } @@ -46299,17 +46287,12 @@ static int sp_1024_ecc_mulmod_win_add_sub_42(sp_point_1024* r, const sp_point_10 static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*42; sp_digit* t3 = t + 4*42; sp_digit* t4 = t + 6*42; sp_digit* t5 = t + 8*42; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*42; /* Check double */ (void)sp_1024_mont_sub_42(t1, p1024_mod, q->y, p1024_mod); @@ -46319,53 +46302,54 @@ static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_42(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<42; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<42; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<42; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_42(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_42(t2, t2, x, p1024_mod); + sp_1024_mont_sub_42(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_42(t4, t4, y, p1024_mod); + sp_1024_mont_sub_42(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_42(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_42(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(x, t1, t5, p1024_mod); sp_1024_mont_dbl_42(t1, t3, p1024_mod); sp_1024_mont_sub_42(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_42(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_42(t3, t3, x, p1024_mod); sp_1024_mont_mul_42(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + 
for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -46508,7 +46492,7 @@ static int sp_1024_ecc_mulmod_stripe_42(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 42 * 5]; + sp_digit t[2 * 42 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -46529,7 +46513,7 @@ static int sp_1024_ecc_mulmod_stripe_42(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 42 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 42 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -46694,7 +46678,7 @@ static int sp_1024_ecc_mulmod_42(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_win_add_sub_42(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 42 * 5]; + sp_digit tmp[2 * 42 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -50739,7 +50723,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[42 + 42 * 2 * 5]; + sp_digit k[42 + 42 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -50752,7 +50736,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (42 + 42 * 2 * 5), + sizeof(sp_digit) * (42 + 42 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 4512640fe..e7c618c31 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -22804,7 +22804,7 @@ static void sp_256_map_5(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_5(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -22917,6 +22917,7 @@ static void sp_256_mont_sub_5(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_256_norm_5(r); } +#define sp_256_mont_sub_lower_5 sp_256_mont_sub_5 /* Shift number left one bit. * Bottom bit is lost. 
* @@ -23068,7 +23069,7 @@ static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_5(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -23094,7 +23095,8 @@ static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*5; @@ -23141,7 +23143,7 @@ static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_5(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_5(y, y, x, p256_mod); + sp_256_mont_sub_lower_5(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_5(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -23180,6 +23182,7 @@ typedef struct sp_256_proj_point_add_5_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -23208,6 +23211,10 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*5; ctx->t4 = t + 6*5; ctx->t5 = t + 8*5; + ctx->t6 = t + 10*5; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -23232,29 +23239,6 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<5; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<5; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<5; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -23268,16 +23252,16 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_5(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_5(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_5(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -23286,7 +23270,7 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_5(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -23305,29 +23289,29 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_5(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_5(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_5(ctx->z, ctx->z, ctx->t2, 
p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_5(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_5(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_5(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_5(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -23335,24 +23319,24 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_5(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_5(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_5(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_5(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -23360,9 +23344,30 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 5; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -23374,24 +23379,13 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_5(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*5; sp_digit* t3 = t + 4*5; sp_digit* t4 = t + 6*5; sp_digit* t5 = t + 8*5; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*5; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_5(t1, p256_mod, q->y); @@ -23401,60 +23395,61 @@ static void sp_256_proj_point_add_5(sp_point_256* r, sp_256_proj_point_dbl_5(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<5; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<5; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<5; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_5(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_5(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_5(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_5(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_5(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_5(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_5(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_5(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_5(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_5(x, x, t5, p256_mod); - sp_256_mont_dbl_5(t1, y, p256_mod); - sp_256_mont_sub_5(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_5(y, y, x, p256_mod); - sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_dbl_5(t3, y, p256_mod); + sp_256_mont_sub_5(x, x, t3, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_5(y, y, x, p256_mod); + sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_5(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 5; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -23592,7 +23587,7 @@ typedef struct sp_256_ecc_mulmod_5_ctx { sp_256_proj_point_add_5_ctx add_ctx; }; sp_point_256 t[3]; - sp_digit tmp[2 * 5 * 5]; + 
sp_digit tmp[2 * 5 * 6]; sp_digit n; int i; int c; @@ -23706,7 +23701,7 @@ static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 t[3]; - sp_digit tmp[2 * 5 * 5]; + sp_digit tmp[2 * 5 * 6]; #endif sp_digit n; int i; @@ -23724,7 +23719,7 @@ static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -23783,7 +23778,7 @@ static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 5 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 5 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -23841,6 +23836,8 @@ static void sp_256_cond_copy_5(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } +#define sp_256_mont_dbl_lower_5 sp_256_mont_dbl_5 +#define sp_256_mont_tpl_lower_5 sp_256_mont_tpl_5 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -23879,7 +23876,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_5(a, t1, p256_mod); + sp_256_mont_tpl_lower_5(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); @@ -23887,9 +23884,12 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int n, sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_5(t2, b, p256_mod); sp_256_mont_sub_5(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_5(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_5(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -23899,16 +23899,14 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int n, sp_256_mont_mul_5(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_5(y, b, x, p256_mod); - sp_256_mont_mul_5(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_5(y, y, p256_mod); + sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_5(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_5(a, t1, p256_mod); + sp_256_mont_tpl_lower_5(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); @@ -23916,14 +23914,15 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int n, sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_5(t2, b, p256_mod); sp_256_mont_sub_5(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_5(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_5(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_5(y, b, x, p256_mod); - sp_256_mont_mul_5(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_5(y, y, p256_mod); + 
sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_5(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -23973,29 +23972,30 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_5(a, t1, p256_mod); + sp_256_mont_tpl_lower_5(a, t1, p256_mod); /* B = X*Y^2 */ - sp_256_mont_sqr_5(t2, y, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(b, t2, x, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_5(t1, b, p256_mod); - sp_256_mont_sub_5(x, x, t1, p256_mod); + sp_256_mont_dbl_5(t2, b, p256_mod); + sp_256_mont_sub_5(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_5(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_256_mont_sqr_5(t2, t2, p256_mod, p256_mp_mod); + /* t1 = Y^4 */ + sp_256_mont_sqr_5(t1, t1, p256_mod, p256_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_256_mont_mul_5(w, w, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_5(y, b, x, p256_mod); - sp_256_mont_mul_5(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_5(y, y, p256_mod); - sp_256_mont_sub_5(y, y, t2, p256_mod); + sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod); + sp_256_mont_sub_5(y, y, t1, p256_mod); /* Y = Y/2 */ sp_256_div2_5(r[j].y, y, p256_mod); @@ -24021,30 +24021,30 @@ static void sp_256_proj_point_add_sub_5(sp_point_256* ra, sp_digit* t4 = t + 6*5; sp_digit* t5 = t + 8*5; sp_digit* t6 = t + 10*5; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_5(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t1, t1, xa, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_5(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(t2, za, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t4, t2, za, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_5(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t3, t3, ya, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ @@ -24055,30 +24055,30 @@ static void sp_256_proj_point_add_sub_5(sp_point_256* ra, sp_256_mont_sub_5(t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_5(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_256_mont_mul_5(za, za, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(za, za, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - 
H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_5(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(xa, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_5(xs, t6, p256_mod, p256_mp_mod); sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(ya, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_5(x, x, t5, p256_mod); + sp_256_mont_sub_5(xa, xa, t5, p256_mod); sp_256_mont_sub_5(xs, xs, t5, p256_mod); - sp_256_mont_dbl_5(t1, y, p256_mod); - sp_256_mont_sub_5(x, x, t1, p256_mod); + sp_256_mont_dbl_5(t1, ya, p256_mod); + sp_256_mont_sub_5(xa, xa, t1, p256_mod); sp_256_mont_sub_5(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_5(ys, y, xs, p256_mod); - sp_256_mont_sub_5(y, y, x, p256_mod); - sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_lower_5(ys, ya, xs, p256_mod); + sp_256_mont_sub_lower_5(ya, ya, xa, p256_mod); + sp_256_mont_mul_5(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_5(t6, p256_mod, t6); sp_256_mont_mul_5(ys, ys, t6, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_5(y, y, t5, p256_mod); + sp_256_mont_sub_5(ya, ya, t5, p256_mod); sp_256_mont_sub_5(ys, ys, t5, p256_mod); } @@ -24360,17 +24360,12 @@ static int sp_256_ecc_mulmod_win_add_sub_5(sp_point_256* r, const sp_point_256* static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*5; sp_digit* t3 = t + 4*5; sp_digit* t4 = t + 6*5; sp_digit* t5 = t + 8*5; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*5; /* Check double */ (void)sp_256_sub_5(t1, p256_mod, q->y); @@ -24380,53 +24375,54 @@ static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_5(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<5; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<5; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<5; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_5(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_5(t2, t2, x, p256_mod); + sp_256_mont_sub_5(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_5(t4, t4, y, p256_mod); + sp_256_mont_sub_5(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_5(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_5(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t3, x, t5, 
p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_5(x, t1, t5, p256_mod); sp_256_mont_dbl_5(t1, t3, p256_mod); sp_256_mont_sub_5(x, x, t1, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_5(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_5(t3, t3, x, p256_mod); sp_256_mont_mul_5(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_5(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 5; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -24607,7 +24603,7 @@ static int sp_256_ecc_mulmod_stripe_5(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 5 * 5]; + sp_digit t[2 * 5 * 6]; #endif sp_point_256* p = NULL; int i; @@ -24628,7 +24624,7 @@ static int sp_256_ecc_mulmod_stripe_5(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -24808,7 +24804,7 @@ static int sp_256_ecc_mulmod_5(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_win_add_sub_5(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 5 * 5]; + sp_digit tmp[2 * 5 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -24919,7 +24915,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[5 + 5 * 2 * 5]; + sp_digit k[5 + 5 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -24932,7 +24928,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (5 + 5 * 2 * 5), heap, + sizeof(sp_digit) * (5 + 5 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -26377,7 +26373,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[5 + 5 * 2 * 5]; + sp_digit k[5 + 5 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -26390,7 +26386,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (5 + 5 * 2 * 5), + sizeof(sp_digit) * (5 + 5 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -27745,7 +27741,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*5]; sp_digit u2[2*5]; sp_digit s[2*5]; - sp_digit tmp[2*5 * 5]; + sp_digit tmp[2*5 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -27891,7 +27887,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 5]; + sp_digit u1[18 * 5]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -27910,7 +27906,7 @@ int sp_ecc_verify_256(const 
byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 5, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 5, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -28213,7 +28209,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 5 * 5]; + sp_digit tmp[2 * 5 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -28227,7 +28223,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 5 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -29557,7 +29553,7 @@ static void sp_384_map_7(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_7(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -29672,6 +29668,7 @@ static void sp_384_mont_sub_7(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_384_norm_7(r); } +#define sp_384_mont_sub_lower_7 sp_384_mont_sub_7 /* Shift number left one bit. * Bottom bit is lost. * @@ -29825,7 +29822,7 @@ static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_7(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -29851,7 +29848,8 @@ static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*7; @@ -29898,7 +29896,7 @@ static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p, sp_d /* X = X - Y */ sp_384_mont_sub_7(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_7(y, y, x, p384_mod); + sp_384_mont_sub_lower_7(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_7(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -29938,6 +29936,7 @@ typedef struct sp_384_proj_point_add_7_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -29966,6 +29965,10 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*7; ctx->t4 = t + 6*7; ctx->t5 = t + 8*7; + ctx->t6 = t + 10*7; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -29990,29 +29993,6 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<7; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<7; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<7; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -30026,16 +30006,16 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, 
sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_7(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_7(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_7(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -30044,7 +30024,7 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_7(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -30063,29 +30043,29 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_7(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_7(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_7(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_7(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_7(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_7(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_7(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -30093,24 +30073,24 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_7(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_7(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_7(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_7(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -30118,9 +30098,30 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & 
maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -30132,24 +30133,13 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_7(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*7; sp_digit* t3 = t + 4*7; sp_digit* t4 = t + 6*7; sp_digit* t5 = t + 8*7; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*7; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_7(t1, p384_mod, q->y); @@ -30159,60 +30149,61 @@ static void sp_384_proj_point_add_7(sp_point_384* r, sp_384_proj_point_dbl_7(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<7; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<7; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<7; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_7(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_7(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_7(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_7(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_7(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_7(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_7(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_7(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_7(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_7(x, x, t5, p384_mod); - sp_384_mont_dbl_7(t1, y, p384_mod); - sp_384_mont_sub_7(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_7(y, y, x, p384_mod); - sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_7(t3, y, p384_mod); + sp_384_mont_sub_7(x, 
x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_7(y, y, x, p384_mod); + sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_7(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -30635,6 +30626,8 @@ static void sp_384_cond_copy_7(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } +#define sp_384_mont_dbl_lower_7 sp_384_mont_dbl_7 +#define sp_384_mont_tpl_lower_7 sp_384_mont_tpl_7 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -30673,7 +30666,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_7(a, t1, p384_mod); + sp_384_mont_tpl_lower_7(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); @@ -30681,9 +30674,12 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int n, sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_7(t2, b, p384_mod); sp_384_mont_sub_7(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_7(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_7(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -30693,16 +30689,14 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int n, sp_384_mont_mul_7(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_7(y, b, x, p384_mod); - sp_384_mont_mul_7(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_7(y, y, p384_mod); + sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_7(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_7(a, t1, p384_mod); + sp_384_mont_tpl_lower_7(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); @@ -30710,14 +30704,15 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int n, sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_7(t2, b, p384_mod); sp_384_mont_sub_7(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_7(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_7(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_7(y, b, x, p384_mod); - sp_384_mont_mul_7(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_7(y, y, p384_mod); + sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_7(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -30767,29 +30762,30 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r, /* A = 3*(X^2 - W) */ 
sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_7(a, t1, p384_mod); + sp_384_mont_tpl_lower_7(a, t1, p384_mod); /* B = X*Y^2 */ - sp_384_mont_sqr_7(t2, y, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(b, t2, x, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_7(t1, b, p384_mod); - sp_384_mont_sub_7(x, x, t1, p384_mod); + sp_384_mont_dbl_7(t2, b, p384_mod); + sp_384_mont_sub_7(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_7(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_384_mont_sqr_7(t2, t2, p384_mod, p384_mp_mod); + /* t1 = Y^4 */ + sp_384_mont_sqr_7(t1, t1, p384_mod, p384_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_384_mont_mul_7(w, w, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_7(y, b, x, p384_mod); - sp_384_mont_mul_7(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_7(y, y, p384_mod); - sp_384_mont_sub_7(y, y, t2, p384_mod); + sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod); + sp_384_mont_sub_7(y, y, t1, p384_mod); /* Y = Y/2 */ sp_384_div2_7(r[j].y, y, p384_mod); @@ -30815,30 +30811,30 @@ static void sp_384_proj_point_add_sub_7(sp_point_384* ra, sp_digit* t4 = t + 6*7; sp_digit* t5 = t + 8*7; sp_digit* t6 = t + 10*7; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_7(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t1, t1, xa, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_7(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(t2, za, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t4, t2, za, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_7(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t3, t3, ya, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ @@ -30849,30 +30845,30 @@ static void sp_384_proj_point_add_sub_7(sp_point_384* ra, sp_384_mont_sub_7(t4, t4, t3, p384_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_384_mont_mul_7(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(z, z, t2, p384_mod, p384_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_384_mont_mul_7(za, za, q->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(za, za, t2, p384_mod, p384_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_7(x, t4, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(xa, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_7(xs, t6, p384_mod, p384_mp_mod); sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod); 
- sp_384_mont_mul_7(y, t1, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(ya, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_7(x, x, t5, p384_mod); + sp_384_mont_sub_7(xa, xa, t5, p384_mod); sp_384_mont_sub_7(xs, xs, t5, p384_mod); - sp_384_mont_dbl_7(t1, y, p384_mod); - sp_384_mont_sub_7(x, x, t1, p384_mod); + sp_384_mont_dbl_7(t1, ya, p384_mod); + sp_384_mont_sub_7(xa, xa, t1, p384_mod); sp_384_mont_sub_7(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_7(ys, y, xs, p384_mod); - sp_384_mont_sub_7(y, y, x, p384_mod); - sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_lower_7(ys, ya, xs, p384_mod); + sp_384_mont_sub_lower_7(ya, ya, xa, p384_mod); + sp_384_mont_mul_7(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_7(t6, p384_mod, t6); sp_384_mont_mul_7(ys, ys, t6, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t5, t5, t3, p384_mod, p384_mp_mod); - sp_384_mont_sub_7(y, y, t5, p384_mod); + sp_384_mont_sub_7(ya, ya, t5, p384_mod); sp_384_mont_sub_7(ys, ys, t5, p384_mod); } @@ -31166,17 +31162,12 @@ static int sp_384_ecc_mulmod_win_add_sub_7(sp_point_384* r, const sp_point_384* static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*7; sp_digit* t3 = t + 4*7; sp_digit* t4 = t + 6*7; sp_digit* t5 = t + 8*7; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*7; /* Check double */ (void)sp_384_sub_7(t1, p384_mod, q->y); @@ -31186,53 +31177,54 @@ static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_7(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<7; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<7; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<7; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_7(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_7(t2, t2, x, p384_mod); + sp_384_mont_sub_7(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_7(t4, t4, y, p384_mod); + sp_384_mont_sub_7(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_7(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_7(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_7(x, t1, t5, p384_mod); sp_384_mont_dbl_7(t1, t3, p384_mod); 
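/* Editorial aside (not part of the patch): the add routines now carve a
 * sixth double-width temporary (t6 = t + 10*N) out of the caller-supplied
 * scratch area, because X3/Y3/Z3 are accumulated in spare temporaries
 * (t6/t1/t2 in the full add, t2/t5/t6 in the Z2 == 1 variant) rather than
 * straight into r.  That is why every tmp/t buffer touched by this patch
 * grows from 2*N*5 to 2*N*6 digits, where N is the limb count (7, 8, 9 or
 * 18).  A compilable sketch of the layout; names and sizes here are
 * illustrative, not the wolfSSL definitions. */
#define N 7                          /* e.g. the 32-bit P-384 limb count */
typedef unsigned long sp_digit_t;    /* stand-in for sp_digit            */

/* Split one scratch buffer into six 2*N-digit temporaries t1..t6. */
static void carve_temps(sp_digit_t* t, sp_digit_t* slot[6])
{
    int i;
    for (i = 0; i < 6; i++) {
        slot[i] = t + 2 * N * i;     /* t1 = t, t2 = t + 2*N, ... t6 = t + 10*N */
    }
}

int main(void)
{
    sp_digit_t tmp[2 * N * 6];       /* was 2 * N * 5 before t6 existed */
    sp_digit_t* t[6];

    carve_temps(tmp, t);
    /* t6 starts at offset 10*N and is 2*N digits wide, i.e. 12*N in total. */
    return (t[5] - tmp == 2 * N * 5) ? 0 : 1;
}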
sp_384_mont_sub_7(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_7(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_7(t3, t3, x, p384_mod); sp_384_mont_mul_7(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_7(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -35050,7 +35042,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*7]; sp_digit u2[2*7]; sp_digit s[2*7]; - sp_digit tmp[2*7 * 5]; + sp_digit tmp[2*7 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -35196,7 +35188,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 7]; + sp_digit u1[18 * 7]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -35215,7 +35207,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 7, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 7, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -35518,7 +35510,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 7 * 5]; + sp_digit tmp[2 * 7 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -35532,7 +35524,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 7 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 7 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -36921,7 +36913,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_9(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -37038,6 +37030,7 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_521_norm_9(r); } +#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 /* Shift number left one bit. * Bottom bit is lost. 
* @@ -37193,7 +37186,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -37219,7 +37212,8 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*9; @@ -37266,7 +37260,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_d /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_9(y, y, x, p521_mod); + sp_521_mont_sub_lower_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -37306,6 +37300,7 @@ typedef struct sp_521_proj_point_add_9_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -37334,6 +37329,10 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*9; ctx->t4 = t + 6*9; ctx->t5 = t + 8*9; + ctx->t6 = t + 10*9; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -37358,29 +37357,6 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -37394,16 +37370,16 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_9(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_9(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -37412,7 +37388,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -37431,29 +37407,29 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_9(ctx->z, ctx->z, ctx->t2, 
p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -37461,24 +37437,24 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_9(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -37486,9 +37462,30 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -37500,24 +37497,13 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -37527,60 +37513,61 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_9(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, x, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_9(t3, y, p521_mod); + sp_521_mont_sub_9(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -37627,7 +37614,7 @@ typedef struct sp_521_ecc_mulmod_9_ctx { sp_521_proj_point_add_9_ctx add_ctx; }; sp_point_521 t[3]; - sp_digit tmp[2 * 9 * 5]; + 
sp_digit tmp[2 * 9 * 6]; sp_digit n; int i; int c; @@ -37741,7 +37728,7 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, sp_digit* tmp = NULL; #else sp_point_521 t[3]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; #endif sp_digit n; int i; @@ -37759,7 +37746,7 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -37818,7 +37805,7 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -37884,6 +37871,8 @@ static void sp_521_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } +#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 +#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -37922,7 +37911,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -37930,9 +37919,12 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -37942,16 +37934,14 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -37959,14 +37949,15 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + 
sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -38016,29 +38007,30 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ - sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(b, t2, x, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(t1, b, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t2, b, p521_mod); + sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_521_mont_sqr_9(t2, t2, p521_mod, p521_mp_mod); + /* t1 = Y^4 */ + sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_521_mont_mul_9(w, w, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); - sp_521_mont_sub_9(y, y, t2, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, y, t1, p521_mod); /* Y = Y/2 */ sp_521_div2_9(r[j].y, y, p521_mod); @@ -38064,30 +38056,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; sp_digit* t6 = t + 10*9; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, xa, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, za, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, za, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, ya, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ @@ -38098,30 +38090,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(t4, t4, t3, p521_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_521_mont_mul_9(za, za, q->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(za, za, t2, p521_mod, p521_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - 
H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(xa, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(xs, t6, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ya, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, x, t5, p521_mod); + sp_521_mont_sub_9(xa, xa, t5, p521_mod); sp_521_mont_sub_9(xs, xs, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t1, ya, p521_mod); + sp_521_mont_sub_9(xa, xa, t1, p521_mod); sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_9(ys, y, xs, p521_mod); - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, y, t5, p521_mod); + sp_521_mont_sub_9(ya, ya, t5, p521_mod); sp_521_mont_sub_9(ys, ys, t5, p521_mod); } @@ -38427,17 +38419,12 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -38447,53 +38434,54 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_9(t2, t2, x, p521_mod); + sp_521_mont_sub_9(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_9(t4, t4, y, p521_mod); + sp_521_mont_sub_9(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, x, t5, 
p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, t1, t5, p521_mod); sp_521_mont_dbl_9(t1, t3, p521_mod); sp_521_mont_sub_9(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_9(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -38690,7 +38678,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 9 * 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_521* p = NULL; int i; @@ -38711,7 +38699,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -39002,7 +38990,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -39015,7 +39003,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), heap, + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -40970,7 +40958,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -40983,7 +40971,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -42432,7 +42420,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*9]; sp_digit u2[2*9]; sp_digit s[2*9]; - sp_digit tmp[2*9 * 5]; + sp_digit tmp[2*9 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -42582,7 +42570,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 9]; + sp_digit u1[18 * 9]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -42601,7 +42589,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 9, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 9, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -42909,7 +42897,7 @@ int 
sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -42923,7 +42911,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -44763,7 +44751,7 @@ static void sp_1024_map_18(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_18(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -44836,6 +44824,7 @@ static void sp_1024_mont_sub_18(sp_digit* r, const sp_digit* a, const sp_digit* sp_1024_norm_18(r); } +#define sp_1024_mont_sub_lower_18 sp_1024_mont_sub_18 /* Shift number left one bit. * Bottom bit is lost. * @@ -45000,7 +44989,7 @@ static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -45026,7 +45015,8 @@ static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*18; @@ -45073,7 +45063,7 @@ static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_18(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_18(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_18(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_18(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -45116,6 +45106,7 @@ typedef struct sp_1024_proj_point_add_18_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -45144,6 +45135,10 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*18; ctx->t4 = t + 6*18; ctx->t5 = t + 8*18; + ctx->t6 = t + 10*18; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -45168,29 +45163,6 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<18; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<18; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<18; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -45204,16 +45176,16 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_18(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 
= X2*Z1^2 */ - sp_1024_mont_sqr_18(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_18(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -45222,7 +45194,7 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_18(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -45241,29 +45213,29 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_18(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_18(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_18(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_18(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_18(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_18(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_18(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -45271,24 +45243,24 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_18(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_18(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -45296,9 +45268,30 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & 
maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -45310,24 +45303,13 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_18(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*18; sp_digit* t3 = t + 4*18; sp_digit* t4 = t + 6*18; sp_digit* t5 = t + 8*18; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*18; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_18(t1, p1024_mod, q->y, p1024_mod); @@ -45337,60 +45319,61 @@ static void sp_1024_proj_point_add_18(sp_point_1024* r, sp_1024_proj_point_dbl_18(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<18; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<18; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<18; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_18(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_18(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_18(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_18(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_18(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_18(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_18(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_18(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(x, x, t5, p1024_mod); - sp_1024_mont_dbl_18(t1, y, p1024_mod); - sp_1024_mont_sub_18(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_18(y, y, x, p1024_mod); - sp_1024_mont_mul_18(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t5, t5, t3, p1024_mod, p1024_mp_mod); + 
sp_1024_mont_dbl_18(t3, y, p1024_mod); + sp_1024_mont_sub_18(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_18(y, y, x, p1024_mod); + sp_1024_mont_mul_18(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -45419,7 +45402,7 @@ typedef struct sp_1024_ecc_mulmod_18_ctx { sp_1024_proj_point_add_18_ctx add_ctx; }; sp_point_1024 t[3]; - sp_digit tmp[2 * 18 * 5]; + sp_digit tmp[2 * 18 * 6]; sp_digit n; int i; int c; @@ -45533,7 +45516,7 @@ static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g, sp_digit* tmp = NULL; #else sp_point_1024 t[3]; - sp_digit tmp[2 * 18 * 5]; + sp_digit tmp[2 * 18 * 6]; #endif sp_digit n; int i; @@ -45551,7 +45534,7 @@ static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 18 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 18 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -45610,7 +45593,7 @@ static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 18 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 18 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -45694,6 +45677,8 @@ static void sp_1024_cond_copy_18(sp_digit* r, const sp_digit* a, const sp_digit #endif /* WOLFSSL_SP_SMALL */ } +#define sp_1024_mont_dbl_lower_18 sp_1024_mont_dbl_18 +#define sp_1024_mont_tpl_lower_18 sp_1024_mont_tpl_18 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
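/* Editorial aside (not part of the patch): the repeated-doubling hunks,
 * such as the sp_1024_proj_point_dbl_n_18 changes that follow, rearrange
 * the Y update.  Instead of computing (B - X), multiplying by A and then
 * doubling, they first form b = 2*(B - X) with the new *_sub_lower and
 * *_dbl_lower helpers and finish with a single multiply: y = b*A - Y^4.
 * In this C file the *_lower names are plain macro aliases of the full
 * Montgomery routines; the separate names presumably leave room for ports
 * that have cheaper variants for operands already below the modulus.
 * A toy integer check (illustrative values, no modular reduction) that the
 * two orderings agree: */
#include <assert.h>

int main(void)
{
    long A = 11, B = 29, X = 5, Y4 = 7;      /* stand-ins for the field values */

    long old_y = 2 * (A * (B - X)) - Y4;     /* old order: sub, mul, dbl, sub  */
    long b     = 2 * (B - X);                /* sub_lower followed by dbl_lower */
    long new_y = b * A - Y4;                 /* new order: one mul, then sub    */

    assert(old_y == new_y);                  /* both equal 2*A*(B - X) - Y^4    */
    return 0;
}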
@@ -45732,7 +45717,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_18(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45740,9 +45725,12 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int n, sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_18(t2, b, p1024_mod); sp_1024_mont_sub_18(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_18(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -45752,16 +45740,14 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int n, sp_1024_mont_mul_18(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_18(y, b, x, p1024_mod); - sp_1024_mont_mul_18(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_18(y, y, p1024_mod); + sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_18(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45769,14 +45755,15 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int n, sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_18(t2, b, p1024_mod); sp_1024_mont_sub_18(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_18(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_18(y, b, x, p1024_mod); - sp_1024_mont_mul_18(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_18(y, y, p1024_mod); + sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -45826,29 +45813,30 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_18(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); /* B = X*Y^2 */ - sp_1024_mont_sqr_18(t2, y, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(b, t2, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_18(t1, b, p1024_mod); - sp_1024_mont_sub_18(x, x, t1, p1024_mod); + sp_1024_mont_dbl_18(t2, b, p1024_mod); + sp_1024_mont_sub_18(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - 
sp_1024_mont_sqr_18(t2, t2, p1024_mod, p1024_mp_mod); + /* t1 = Y^4 */ + sp_1024_mont_sqr_18(t1, t1, p1024_mod, p1024_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_1024_mont_mul_18(w, w, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_18(y, b, x, p1024_mod); - sp_1024_mont_mul_18(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_18(y, y, p1024_mod); - sp_1024_mont_sub_18(y, y, t2, p1024_mod); + sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_18(y, y, t1, p1024_mod); /* Y = Y/2 */ sp_1024_div2_18(r[j].y, y, p1024_mod); @@ -45874,30 +45862,30 @@ static void sp_1024_proj_point_add_sub_18(sp_point_1024* ra, sp_digit* t4 = t + 6*18; sp_digit* t5 = t + 8*18; sp_digit* t6 = t + 10*18; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_18(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t1, t1, xa, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_18(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(t2, za, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t4, t2, za, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_18(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t3, t3, ya, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ @@ -45908,30 +45896,30 @@ static void sp_1024_proj_point_add_sub_18(sp_point_1024* ra, sp_1024_mont_sub_18(t4, t4, t3, p1024_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_1024_mont_mul_18(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(z, z, t2, p1024_mod, p1024_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_1024_mont_mul_18(za, za, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(za, za, t2, p1024_mod, p1024_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_18(x, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(xa, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_18(xs, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(y, t1, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(ya, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_18(x, x, t5, p1024_mod); + sp_1024_mont_sub_18(xa, xa, t5, p1024_mod); sp_1024_mont_sub_18(xs, xs, t5, p1024_mod); - sp_1024_mont_dbl_18(t1, y, p1024_mod); - sp_1024_mont_sub_18(x, x, t1, p1024_mod); + sp_1024_mont_dbl_18(t1, ya, p1024_mod); + sp_1024_mont_sub_18(xa, xa, t1, p1024_mod); sp_1024_mont_sub_18(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_18(ys, y, xs, p1024_mod); - sp_1024_mont_sub_18(y, y, x, p1024_mod); - sp_1024_mont_mul_18(y, 
y, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_lower_18(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_lower_18(ya, ya, xa, p1024_mod); + sp_1024_mont_mul_18(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_18(ys, ys, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t5, t5, t3, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_18(y, y, t5, p1024_mod); + sp_1024_mont_sub_18(ya, ya, t5, p1024_mod); sp_1024_mont_sub_18(ys, ys, t5, p1024_mod); } @@ -46177,17 +46165,12 @@ static int sp_1024_ecc_mulmod_win_add_sub_18(sp_point_1024* r, const sp_point_10 static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*18; sp_digit* t3 = t + 4*18; sp_digit* t4 = t + 6*18; sp_digit* t5 = t + 8*18; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*18; /* Check double */ (void)sp_1024_mont_sub_18(t1, p1024_mod, q->y, p1024_mod); @@ -46197,53 +46180,54 @@ static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_18(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<18; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<18; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<18; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_18(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_18(t2, t2, x, p1024_mod); + sp_1024_mont_sub_18(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_18(t4, t4, y, p1024_mod); + sp_1024_mont_sub_18(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_18(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_18(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(x, t1, t5, p1024_mod); sp_1024_mont_dbl_18(t1, t3, p1024_mod); sp_1024_mont_sub_18(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_18(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_18(t3, t3, x, p1024_mod); sp_1024_mont_mul_18(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + 
for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -46386,7 +46370,7 @@ static int sp_1024_ecc_mulmod_stripe_18(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 18 * 5]; + sp_digit t[2 * 18 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -46407,7 +46391,7 @@ static int sp_1024_ecc_mulmod_stripe_18(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 18 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 18 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -46572,7 +46556,7 @@ static int sp_1024_ecc_mulmod_18(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_win_add_sub_18(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 18 * 5]; + sp_digit tmp[2 * 18 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -50103,7 +50087,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[18 + 18 * 2 * 5]; + sp_digit k[18 + 18 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -50116,7 +50100,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (18 + 18 * 2 * 5), + sizeof(sp_digit) * (18 + 18 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index cbff69c33..aefbde83d 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -18754,7 +18754,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_8(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -19038,6 +19038,7 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const ); } +#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. 
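The masked selection introduced throughout these point-addition routines replaces the earlier pointer-table dispatch on the infinity flags (rp[]/ap[] indexed by p->infinity | q->infinity) with a branch-free, cache-neutral copy. A minimal standalone sketch of the idiom follows; the names sp_digit_t, point_t, NUM_DIGITS and select_point_ct are illustrative stand-ins, not the patch's own types or functions.

#include <stdint.h>

typedef uint32_t sp_digit_t;          /* stand-in for sp_digit (assumption) */
#define NUM_DIGITS 8                  /* word count; 8/12/17/18 in the patch */

typedef struct {
    sp_digit_t x[NUM_DIGITS];
    sp_digit_t y[NUM_DIGITS];
    sp_digit_t z[NUM_DIGITS];
    int infinity;                     /* 0 or 1 */
} point_t;

/* Branch-free selection of a point-addition result:
 *  - if only p is at infinity, the sum is q;
 *  - if only q is at infinity, the sum is p;
 *  - otherwise use the freshly computed coordinates (x, y, z).
 * Each mask is all-ones or all-zero, so every output word is the same
 * OR of three AND terms regardless of which case applies. */
void select_point_ct(point_t* r, const point_t* p, const point_t* q,
                     const sp_digit_t* x, const sp_digit_t* y,
                     const sp_digit_t* z)
{
    sp_digit_t maskp = (sp_digit_t)0 - (sp_digit_t)(q->infinity & (!p->infinity));
    sp_digit_t maskq = (sp_digit_t)0 - (sp_digit_t)(p->infinity & (!q->infinity));
    sp_digit_t maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < NUM_DIGITS; i++) {
        r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
        r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
        r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
    }
    /* Both inputs at infinity: flag the result accordingly. */
    r->z[0] |= (sp_digit_t)(p->infinity & q->infinity);
    r->infinity = p->infinity & q->infinity;
}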
@@ -19240,7 +19241,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -19266,7 +19267,8 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*8; @@ -19313,7 +19315,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_8(y, y, x, p256_mod); + sp_256_mont_sub_lower_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -19353,6 +19355,7 @@ typedef struct sp_256_proj_point_add_8_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -19381,6 +19384,10 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*8; ctx->t4 = t + 6*8; ctx->t5 = t + 8*8; + ctx->t6 = t + 10*8; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -19405,29 +19412,6 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -19441,16 +19425,16 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_8(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_8(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -19459,7 +19443,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -19478,29 +19462,29 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_8(ctx->z, ctx->z, ctx->t2, p256_mod, 
p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -19508,24 +19492,24 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_8(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -19533,9 +19517,30 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -19547,24 +19552,13 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -19574,60 +19568,61 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_8(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_8(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, x, t5, p256_mod); - sp_256_mont_dbl_8(t1, y, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_8(y, y, x, p256_mod); - sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_dbl_8(t3, y, p256_mod); + sp_256_mont_sub_8(x, x, t3, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -19722,7 +19717,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons sp_digit* tmp = NULL; #else sp_point_256 t[16 + 1]; - 
sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; #endif sp_point_256* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -19756,7 +19751,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -19857,7 +19852,7 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 8 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -19887,6 +19882,8 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC +#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 +#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -19925,7 +19922,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -19933,9 +19930,12 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -19945,16 +19945,14 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_mul_8(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_8(a, t1, p256_mod); + sp_256_mont_tpl_lower_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -19962,14 +19960,15 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int n, sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod); sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_sub_lower_8(t2, b, x, p256_mod); + sp_256_mont_dbl_lower_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_8(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_8(y, b, x, p256_mod); - sp_256_mont_mul_8(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_8(y, y, p256_mod); + sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -20019,17 +20018,12 @@ typedef struct sp_table_entry_256 { static void 
sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*8; sp_digit* t3 = t + 4*8; sp_digit* t4 = t + 6*8; sp_digit* t5 = t + 8*8; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*8; /* Check double */ (void)sp_256_sub_8(t1, p256_mod, q->y); @@ -20039,53 +20033,54 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_8(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<8; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<8; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<8; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_8(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_8(t2, t2, x, p256_mod); + sp_256_mont_sub_8(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_8(t4, t4, y, p256_mod); + sp_256_mont_sub_8(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_8(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_8(x, t1, t5, p256_mod); sp_256_mont_dbl_8(t1, t3, p256_mod); sp_256_mont_sub_8(x, x, t1, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_8(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -20257,7 +20252,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -20278,7 +20273,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; 
if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -20458,7 +20453,7 @@ static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -20661,7 +20656,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 8 * 5]; + sp_digit t[2 * 8 * 6]; #endif sp_point_256* p = NULL; int i; @@ -20682,7 +20677,7 @@ static int sp_256_ecc_mulmod_stripe_8(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -20862,7 +20857,7 @@ static int sp_256_ecc_mulmod_8(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_fast_8(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -20973,7 +20968,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -20986,7 +20981,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), heap, + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -22522,7 +22517,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[8 + 8 * 2 * 5]; + sp_digit k[8 + 8 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -22535,7 +22530,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (8 + 8 * 2 * 5), + sizeof(sp_digit) * (8 + 8 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -24095,7 +24090,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*8]; sp_digit u2[2*8]; sp_digit s[2*8]; - sp_digit tmp[2*8 * 5]; + sp_digit tmp[2*8 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -24241,7 +24236,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 8]; + sp_digit u1[18 * 8]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -24260,7 +24255,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 8, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 8, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -24563,7 +24558,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 8 * 5]; + sp_digit tmp[2 * 8 * 6]; sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -24577,7 +24572,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { 
- tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 8 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -26044,7 +26039,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_12(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -26152,6 +26147,7 @@ SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a, const sp_384_cond_add_12(r, r, m, o); } +#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( @@ -26338,7 +26334,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -26364,7 +26360,8 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*12; @@ -26411,7 +26408,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, sp_ /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_12(y, y, x, p384_mod); + sp_384_mont_sub_lower_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -26452,6 +26449,7 @@ typedef struct sp_384_proj_point_add_12_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -26480,6 +26478,10 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*12; ctx->t4 = t + 6*12; ctx->t5 = t + 8*12; + ctx->t6 = t + 10*12; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -26504,29 +26506,6 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -26540,16 +26519,16 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_12(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_12(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: 
@@ -26558,7 +26537,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -26577,29 +26556,29 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_12(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -26607,24 +26586,24 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_12(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -26632,9 +26611,30 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -26646,24 +26646,13 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const 
sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -26673,60 +26662,61 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_12(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_12(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, x, t5, p384_mod); - sp_384_mont_dbl_12(t1, y, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_12(y, y, x, p384_mod); - sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_12(t3, y, p384_mod); + sp_384_mont_sub_12(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | 
(q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -27010,6 +27000,8 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } #ifdef FP_ECC +#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 +#define sp_384_mont_tpl_lower_12 sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -27048,7 +27040,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -27056,9 +27048,12 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -27068,16 +27063,14 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_mul_12(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_12(a, t1, p384_mod); + sp_384_mont_tpl_lower_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -27085,14 +27078,15 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int n, sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_12(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_12(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_12(y, b, x, p384_mod); - sp_384_mont_mul_12(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_12(y, y, p384_mod); + sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -27142,17 +27136,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*12; sp_digit* t3 = t + 4*12; sp_digit* t4 = t + 6*12; sp_digit* t5 = t + 8*12; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*12; /* Check double */ (void)sp_384_sub_12(t1, p384_mod, q->y); @@ -27162,53 +27151,54 @@ static void 
sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_12(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<12; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<12; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<12; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_12(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_12(t2, t2, x, p384_mod); + sp_384_mont_sub_12(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_12(t4, t4, y, p384_mod); + sp_384_mont_sub_12(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_12(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_12(x, t1, t5, p384_mod); sp_384_mont_dbl_12(t1, t3, p384_mod); sp_384_mont_sub_12(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_12(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -31296,7 +31286,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*12]; sp_digit u2[2*12]; sp_digit s[2*12]; - sp_digit tmp[2*12 * 5]; + sp_digit tmp[2*12 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -31442,7 +31432,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 12]; + sp_digit u1[18 * 12]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -31461,7 +31451,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 12, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 12, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; 
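The scratch-buffer growth from 2 * 12 * 5 to 2 * 12 * 6 digits (and likewise for the 8-, 17- and 18-word curves) follows from the reworked point addition, which now carves six double-width temporaries t1..t6 out of the caller's buffer instead of five; the verify-path change from u1[16 * 12] to u1[18 * 12] is consistent with the same sixth slot (2N each for u1, u2 and s, plus 2N*6 for tmp). A sketch of that layout under those assumptions; sp_digit_t, partition_temps and N are illustrative names, not part of the patch.

#include <stdlib.h>
#include <stdint.h>

typedef uint32_t sp_digit_t;   /* stand-in for sp_digit (assumption) */

/* Point addition partitions one scratch buffer into six temporaries,
 * each holding a double-width (2*n digit) intermediate product.  The
 * caller must therefore supply 2 * n * 6 digits, which is why every
 * "2 * n * 5" allocation in the patch grows by one slot. */
static void partition_temps(sp_digit_t* t, int n, sp_digit_t* slot[6])
{
    int i;
    for (i = 0; i < 6; i++) {
        slot[i] = t + (size_t)(2 * n) * i;  /* t1 = t, t2 = t + 2n, ... t6 = t + 10n */
    }
}

int main(void)
{
    enum { N = 12 };   /* 384-bit curve: 12 words per value on this target */
    sp_digit_t* tmp = malloc(sizeof(sp_digit_t) * 2 * N * 6);
    sp_digit_t* t[6];
    if (tmp == NULL)
        return 1;
    partition_temps(tmp, N, t);
    /* In the reworked sp_384_proj_point_add_12(), t6 backs X3 while t1 and
     * t2 hold Y3 and Z3, so all six slots are live at once. */
    free(tmp);
    return 0;
}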
@@ -31764,7 +31754,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else - sp_digit tmp[2 * 12 * 5]; + sp_digit tmp[2 * 12 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -31778,7 +31768,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 12 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -33407,7 +33397,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_17(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -33776,6 +33766,7 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a, const ); } +#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -34023,7 +34014,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -34049,7 +34040,8 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*17; @@ -34096,7 +34088,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, sp_ /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_17(y, y, x, p521_mod); + sp_521_mont_sub_lower_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -34139,6 +34131,7 @@ typedef struct sp_521_proj_point_add_17_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -34167,6 +34160,10 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*17; ctx->t4 = t + 6*17; ctx->t5 = t + 8*17; + ctx->t6 = t + 10*17; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -34191,29 +34188,6 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -34227,16 +34201,16 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_17(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 
7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_17(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -34245,7 +34219,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -34264,29 +34238,29 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_17(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -34294,24 +34268,24 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_17(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -34319,9 +34293,30 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= 
p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -34333,24 +34328,13 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -34360,60 +34344,61 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_17(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_17(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, x, t5, p521_mod); - sp_521_mont_dbl_17(t1, y, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_17(y, y, x, p521_mod); - sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_17(t3, y, p521_mod); + sp_521_mont_sub_17(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_mul_17(y, y, 
t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -34562,7 +34547,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con sp_digit* tmp = NULL; #else sp_point_521 t[16 + 1]; - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; #endif sp_point_521* rt = NULL; #ifndef WC_NO_CACHE_RESISTANT @@ -34596,7 +34581,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #endif if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -34701,7 +34686,7 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 17 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -34731,6 +34716,8 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC +#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 +#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
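The hunks that follow restructure the repeated doubling: instead of computing Y = 2*A*(B - X) - Y^4 as subtract, multiply by A, double, subtract, the new code forms b = 2*(B - X) up front (via the _lower variants, which these files simply #define to the full-width routines) and then needs only Y = A*b - Y^4, with Y^4 now kept in t1 rather than t2. Below is a toy integer check of the underlying identity, not the patch's Montgomery arithmetic; sub_mod, mul_mod and dbl_mod are illustrative helpers.

#include <stdio.h>
#include <stdint.h>

/* Identity behind the reordering: 2*A*(B - X) - Y^4 == A*(2*(B - X)) - Y^4.
 * Checked here with small integers modulo a prime stand-in. */
static uint64_t sub_mod(uint64_t a, uint64_t b, uint64_t m) { return (a + m - b) % m; }
static uint64_t mul_mod(uint64_t a, uint64_t b, uint64_t m) { return (a * b) % m; }
static uint64_t dbl_mod(uint64_t a, uint64_t m)             { return (2 * a) % m; }

int main(void)
{
    const uint64_t m = 1000003;               /* small prime, not a curve modulus */
    uint64_t a = 123456, b = 654321, x = 97531, y4 = 24680;

    /* old ordering: y = 2*A*(B - X) - Y^4 (sub, mul, dbl, sub) */
    uint64_t y_old = sub_mod(b, x, m);
    y_old = mul_mod(y_old, a, m);
    y_old = dbl_mod(y_old, m);
    y_old = sub_mod(y_old, y4, m);

    /* new ordering: t = 2*(B - X); y = A*t - Y^4 (one mul, one sub) */
    uint64_t t = dbl_mod(sub_mod(b, x, m), m);
    uint64_t y_new = sub_mod(mul_mod(a, t, m), y4, m);

    printf("%llu %llu\n", (unsigned long long)y_old, (unsigned long long)y_new);
    return y_old != y_new;   /* 0: both orderings agree */
}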
@@ -34769,7 +34756,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -34777,9 +34764,12 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -34789,16 +34779,14 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_mul_17(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_17(a, t1, p521_mod); + sp_521_mont_tpl_lower_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -34806,14 +34794,15 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int n, sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_17(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_17(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_17(y, b, x, p521_mod); - sp_521_mont_mul_17(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_17(y, y, p521_mod); + sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -34863,17 +34852,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*17; sp_digit* t3 = t + 4*17; sp_digit* t4 = t + 6*17; sp_digit* t5 = t + 8*17; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*17; /* Check double */ (void)sp_521_sub_17(t1, p521_mod, q->y); @@ -34883,53 +34867,54 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_17(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<17; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<17; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<17; i++) { - r->z[i] = 
ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_17(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_17(t2, t2, x, p521_mod); + sp_521_mont_sub_17(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_17(t4, t4, y, p521_mod); + sp_521_mont_sub_17(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_17(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_17(x, t1, t5, p521_mod); sp_521_mont_dbl_17(t1, t3, p521_mod); sp_521_mont_sub_17(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_17(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -35137,7 +35122,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -35158,7 +35143,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -35577,7 +35562,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 17 * 5]; + sp_digit t[2 * 17 * 6]; #endif sp_point_521* p = NULL; int i; @@ -35598,7 +35583,7 @@ static int sp_521_ecc_mulmod_stripe_17(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -35889,7 +35874,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = 
NULL; sp_digit* tmp = NULL; @@ -35902,7 +35887,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), heap, + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -37982,7 +37967,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[17 + 17 * 2 * 5]; + sp_digit k[17 + 17 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -37995,7 +37980,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (17 + 17 * 2 * 5), + sizeof(sp_digit) * (17 + 17 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -40180,7 +40165,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*17]; sp_digit u2[2*17]; sp_digit s[2*17]; - sp_digit tmp[2*17 * 5]; + sp_digit tmp[2*17 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -40329,7 +40314,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 17]; + sp_digit u1[18 * 17]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -40348,7 +40333,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 17, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 17, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -40655,7 +40640,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 17 * 5]; + sp_digit tmp[2 * 17 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -40669,7 +40654,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 17 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -42771,7 +42756,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_32(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -43570,6 +43555,7 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, cons ); } +#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
* @@ -43877,7 +43863,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -43903,7 +43889,8 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*32; @@ -43950,7 +43937,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -44141,6 +44128,7 @@ typedef struct sp_1024_proj_point_add_32_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -44169,6 +44157,10 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*32; ctx->t4 = t + 6*32; ctx->t5 = t + 8*32; + ctx->t6 = t + 10*32; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -44193,29 +44185,6 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -44229,16 +44198,16 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_32(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_32(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -44247,7 +44216,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -44266,29 +44235,29 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, 
p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_32(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; case 17: - sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -44296,24 +44265,24 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_32(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -44321,9 +44290,30 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -44335,24 +44325,13 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -44362,60 +44341,61 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_32(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_32(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, x, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, y, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_32(y, y, x, p1024_mod); - sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_32(t3, y, p1024_mod); + sp_1024_mont_sub_32(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } 
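The rewritten point additions above replace the old rp[]/ap[] pointer tables, which indexed memory by the infinity flags, with a mask blend: maskp keeps p when q is the point at infinity, maskq keeps q when p is, and maskt keeps the freshly computed (x, y, z) otherwise, so the sequence of loads and stores no longer depends on whether either input is infinite. The stand-alone sketch below shows the same blend in isolation; point, sp_digit, WORDS and select_result are illustrative names with a reduced word count, not the wolfSSL types or routines.

    #include <stdint.h>

    #define WORDS 4                      /* 17 or 32 in the hunks above */
    typedef uint64_t sp_digit;

    typedef struct {
        sp_digit x[WORDS];
        sp_digit y[WORDS];
        sp_digit z[WORDS];
        int infinity;                    /* 0 or 1 */
    } point;

    /* Constant-time result selection: r = p if q is infinity, r = q if p is
     * infinity, otherwise r = the computed (x, y, z). */
    void select_result(point* r, const point* p, const point* q,
                       const sp_digit* x, const sp_digit* y, const sp_digit* z)
    {
        sp_digit maskp = 0 - (sp_digit)(q->infinity & (!p->infinity));
        sp_digit maskq = 0 - (sp_digit)(p->infinity & (!q->infinity));
        sp_digit maskt = ~(maskp | maskq);
        int i;

        for (i = 0; i < WORDS; i++) {
            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
        }
        /* Both inputs infinite: maskp and maskq are zero, so maskt picks the
         * (meaningless) computed words; force Z nonzero and set the flag, as
         * the last two lines of each hunk do. */
        r->z[0] |= (sp_digit)(p->infinity & q->infinity);
        r->infinity = p->infinity & q->infinity;
    }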
@@ -44444,7 +44424,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, sp_digit* tmp = NULL; #else sp_point_1024 t[16 + 1]; - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; #endif sp_point_1024* rt = NULL; sp_digit n; @@ -44463,7 +44443,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (t == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -44544,7 +44524,7 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 32 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -44563,6 +44543,8 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) +#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 +#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -44601,7 +44583,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -44609,9 +44591,12 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -44621,16 +44606,14 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_mul_32(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_32(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -44638,14 +44621,15 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int n, sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_32(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B 
- X) - Y^4 */ - sp_1024_mont_sub_32(y, b, x, p1024_mod); - sp_1024_mont_mul_32(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_32(y, y, p1024_mod); + sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -44695,17 +44679,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*32; sp_digit* t3 = t + 4*32; sp_digit* t4 = t + 6*32; sp_digit* t5 = t + 8*32; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*32; /* Check double */ (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod); @@ -44715,53 +44694,54 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_32(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<32; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<32; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<32; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_32(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_32(t2, t2, x, p1024_mod); + sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_32(t4, t4, y, p1024_mod); + sp_1024_mont_sub_32(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_32(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(x, t1, t5, p1024_mod); sp_1024_mont_dbl_32(t1, t3, p1024_mod); sp_1024_mont_sub_32(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & 
maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -44883,7 +44863,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -44904,7 +44884,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -45069,7 +45049,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -45222,7 +45202,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 32 * 5]; + sp_digit t[2 * 32 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -45243,7 +45223,7 @@ static int sp_1024_ecc_mulmod_stripe_32(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 32 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -45408,7 +45388,7 @@ static int sp_1024_ecc_mulmod_32(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_fast_32(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 32 * 5]; + sp_digit tmp[2 * 32 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -49156,7 +49136,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[32 + 32 * 2 * 5]; + sp_digit k[32 + 32 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -49169,7 +49149,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (32 + 32 * 2 * 5), + sizeof(sp_digit) * (32 + 32 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 57c4f9fff..8eb40ef6e 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -222,7 +222,6 @@ extern void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a, const sp_digit* extern void sp_2048_sqr_16(sp_digit* r, const sp_digit* a); extern void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a); -extern sp_digit sp_2048_dbl_16(sp_digit* r, const sp_digit* a); extern void sp_2048_sqr_32(sp_digit* r, const sp_digit* a); extern void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a); @@ -456,6 +455,8 @@ static WC_INLINE int sp_2048_mod_16(sp_digit* r, const sp_digit* a, return sp_2048_div_16(a, m, NULL, r); } +extern void sp_2048_get_from_table_16(sp_digit* r, sp_digit** table, int idx); + /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
@@ -507,15 +508,9 @@ static int sp_2048_mod_exp_16(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<32; i++) t[i] = td + i * 32; rt = td + 1024; -#else - for (i=0; i<32; i++) - t[i] = &td[i * 32]; - rt = &td[1024]; -#endif sp_2048_mont_setup(m, &mp); sp_2048_mont_norm_16(norm, m); @@ -595,7 +590,11 @@ static int sp_2048_mod_exp_16(sp_digit* r, const sp_digit* a, const sp_digit* e, y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_16(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 16); + #endif for (; i>=0 || c>=5; ) { if (c >= 5) { y = (byte)((n >> 59) & 0x1f); @@ -628,7 +627,12 @@ static int sp_2048_mod_exp_16(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_sqr_16(rt, r); sp_2048_mont_reduce_16(rt, m, mp); - sp_2048_mul_16(r, rt, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_16(r, t, y); + sp_2048_mul_16(r, rt, r); + #else + sp_2048_mul_16(r, rt, t[y]); + #endif sp_2048_mont_reduce_16(r, m, mp); } @@ -682,6 +686,8 @@ SP_NOINLINE static void sp_2048_mont_sqr_avx2_16(sp_digit* r, const sp_digit* a, } #endif /* HAVE_INTEL_AVX2 */ +extern void sp_2048_get_from_table_avx2_16(sp_digit* r, sp_digit** table, int idx); + #ifdef HAVE_INTEL_AVX2 /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -734,15 +740,9 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<32; i++) t[i] = td + i * 32; rt = td + 1024; -#else - for (i=0; i<32; i++) - t[i] = &td[i * 32]; - rt = &td[1024]; -#endif sp_2048_mont_setup(m, &mp); sp_2048_mont_norm_16(norm, m); @@ -822,7 +822,11 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, const sp_digit* a, const sp_digi y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_avx2_16(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 16); + #endif for (; i>=0 || c>=5; ) { if (c >= 5) { y = (byte)((n >> 59) & 0x1f); @@ -855,7 +859,12 @@ static int sp_2048_mod_exp_avx2_16(sp_digit* r, const sp_digit* a, const sp_digi sp_2048_sqr_avx2_16(rt, r); sp_2048_mont_reduce_avx2_16(rt, m, mp); - sp_2048_mul_avx2_16(r, rt, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_avx2_16(r, t, y); + sp_2048_mul_avx2_16(r, rt, r); + #else + sp_2048_mul_avx2_16(r, rt, t[y]); + #endif sp_2048_mont_reduce_avx2_16(r, m, mp); } @@ -1169,6 +1178,8 @@ static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ +extern void sp_2048_get_from_table_32(sp_digit* r, sp_digit** table, int idx); + /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
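In the exponentiation loops above, the cache-resistant build (WC_NO_CACHE_RESISTANT not defined) now fetches the window entry through sp_2048_get_from_table_16 / _avx2_16 instead of reading t[y] directly, both for the initial window and at each multiply step; the non-resistant build keeps the direct XMEMCPY / t[y] path. The table-read routines are only declared extern here, so their implementations live elsewhere (presumably in the companion assembly). The sketch below shows the usual shape of such a gather, touching every entry and masking out all but the requested index; get_from_table_ct and its parameters are illustrative, not the wolfSSL functions.

    #include <stdint.h>

    typedef uint64_t sp_digit;

    /* Copy table[idx] (words digits long) into r while reading every entry,
     * so the cache access pattern does not reveal idx. */
    void get_from_table_ct(sp_digit* r, sp_digit** table, int idx,
                           int entries, int words)
    {
        int i;
        int j;

        for (j = 0; j < words; j++) {
            r[j] = 0;
        }
        for (i = 0; i < entries; i++) {
            /* All-ones only for the wanted entry. */
            sp_digit mask = 0 - (sp_digit)(i == idx);
            for (j = 0; j < words; j++) {
                r[j] |= table[i][j] & mask;
            }
        }
    }

A call such as get_from_table_ct(r, t, y, 32, 16) would correspond to the sp_2048_get_from_table_16(r, t, y) sites above (32 entries of 16 words each); keeping the mask derivation itself free of data-dependent branches is easier to guarantee in assembly, which may be one reason the real routines are provided out of line.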
@@ -1186,9 +1197,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(17 * 64) + 64]; + sp_digit td[(65 * 64) + 64]; #endif - sp_digit* t[16]; + sp_digit* t[64]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -1210,7 +1221,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 64) + 64, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (65 * 64) + 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -1220,15 +1231,9 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<16; i++) + for (i=0; i<64; i++) t[i] = td + i * 64; - rt = td + 1024; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 64]; - rt = &td[1024]; -#endif + rt = td + 4096; sp_2048_mont_setup(m, &mp); sp_2048_mont_norm_32(norm, m); @@ -1260,6 +1265,54 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_mul_32(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_32(t[14], t[ 7], m, mp); sp_2048_mont_mul_32(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_32(t[16], t[ 8], m, mp); + sp_2048_mont_mul_32(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_32(t[18], t[ 9], m, mp); + sp_2048_mont_mul_32(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_32(t[20], t[10], m, mp); + sp_2048_mont_mul_32(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_32(t[22], t[11], m, mp); + sp_2048_mont_mul_32(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_32(t[24], t[12], m, mp); + sp_2048_mont_mul_32(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_32(t[26], t[13], m, mp); + sp_2048_mont_mul_32(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_32(t[28], t[14], m, mp); + sp_2048_mont_mul_32(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_32(t[30], t[15], m, mp); + sp_2048_mont_mul_32(t[31], t[16], t[15], m, mp); + sp_2048_mont_sqr_32(t[32], t[16], m, mp); + sp_2048_mont_mul_32(t[33], t[17], t[16], m, mp); + sp_2048_mont_sqr_32(t[34], t[17], m, mp); + sp_2048_mont_mul_32(t[35], t[18], t[17], m, mp); + sp_2048_mont_sqr_32(t[36], t[18], m, mp); + sp_2048_mont_mul_32(t[37], t[19], t[18], m, mp); + sp_2048_mont_sqr_32(t[38], t[19], m, mp); + sp_2048_mont_mul_32(t[39], t[20], t[19], m, mp); + sp_2048_mont_sqr_32(t[40], t[20], m, mp); + sp_2048_mont_mul_32(t[41], t[21], t[20], m, mp); + sp_2048_mont_sqr_32(t[42], t[21], m, mp); + sp_2048_mont_mul_32(t[43], t[22], t[21], m, mp); + sp_2048_mont_sqr_32(t[44], t[22], m, mp); + sp_2048_mont_mul_32(t[45], t[23], t[22], m, mp); + sp_2048_mont_sqr_32(t[46], t[23], m, mp); + sp_2048_mont_mul_32(t[47], t[24], t[23], m, mp); + sp_2048_mont_sqr_32(t[48], t[24], m, mp); + sp_2048_mont_mul_32(t[49], t[25], t[24], m, mp); + sp_2048_mont_sqr_32(t[50], t[25], m, mp); + sp_2048_mont_mul_32(t[51], t[26], t[25], m, mp); + sp_2048_mont_sqr_32(t[52], t[26], m, mp); + sp_2048_mont_mul_32(t[53], t[27], t[26], m, mp); + sp_2048_mont_sqr_32(t[54], t[27], m, mp); + sp_2048_mont_mul_32(t[55], t[28], t[27], m, mp); + sp_2048_mont_sqr_32(t[56], t[28], m, mp); + sp_2048_mont_mul_32(t[57], t[29], t[28], m, mp); + sp_2048_mont_sqr_32(t[58], t[29], m, mp); + sp_2048_mont_mul_32(t[59], t[30], t[29], m, mp); + sp_2048_mont_sqr_32(t[60], 
t[30], m, mp); + sp_2048_mont_mul_32(t[61], t[31], t[30], m, mp); + sp_2048_mont_sqr_32(t[62], t[31], m, mp); + sp_2048_mont_mul_32(t[63], t[32], t[31], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -1268,12 +1321,12 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 4. */ - if ((bits % 4) == 0) { - c -= 4; + /* Minus the number of top bits to use so rest is a multiple of 6. */ + if ((bits % 6) == 0) { + c -= 6; } else { - c -= bits % 4; + c -= bits % 6; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -1292,23 +1345,27 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_32(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=4; ) { - if (c >= 4) { - y = (byte)((n >> 60) & 0xf); - n <<= 4; - c -= 4; + #endif + for (; i>=0 || c>=6; ) { + if (c >= 6) { + y = (byte)((n >> 58) & 0x3f); + n <<= 6; + c -= 6; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 60); - n <<= 4; - c = 60; + y = (byte)(n >> 58); + n <<= 6; + c = 58; } else { - y = (byte)(n >> 60); + y = (byte)(n >> 58); n = e[i--]; - c = 4 - c; + c = 6 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -1322,8 +1379,16 @@ static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_2048_mont_reduce_32(rt, m, mp); sp_2048_sqr_32(r, rt); sp_2048_mont_reduce_32(r, m, mp); - - sp_2048_mul_32(r, r, t[y]); + sp_2048_sqr_32(rt, r); + sp_2048_mont_reduce_32(rt, m, mp); + sp_2048_sqr_32(r, rt); + sp_2048_mont_reduce_32(r, m, mp); + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_32(rt, t, y); + sp_2048_mul_32(r, r, rt); + #else + sp_2048_mul_32(r, r, t[y]); + #endif sp_2048_mont_reduce_32(r, m, mp); } @@ -1379,6 +1444,8 @@ SP_NOINLINE static void sp_2048_mont_sqr_avx2_32(sp_digit* r, const sp_digit* a, #endif /* HAVE_INTEL_AVX2 */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern void sp_2048_get_from_table_avx2_32(sp_digit* r, sp_digit** table, int idx); + #ifdef HAVE_INTEL_AVX2 /* Modular exponentiate a to the e mod m. 
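For the 32-word (2048-bit) operands, the window grows from 4 to 6 bits: the added sqr/mul ladder extends the precomputed table from t[15] to t[63], the exponent scan peels 6 bits at a time (the 0x3f masks and the shifts by 58), and two extra square-and-reduce pairs are inserted so each window digit is preceded by six squarings before the single table multiply. The sketch below shows that loop shape on a single 64-bit word with plain arithmetic; modexp_win6 and modmul are illustrative, the modulus must fit in 32 bits so products fit in 64 bits, and the table fill is simplified to repeated multiplication rather than the sqr/mul ladder used above.

    #include <stdint.h>
    #include <stdio.h>

    /* m must fit in 32 bits so the product fits in 64 bits. */
    static uint64_t modmul(uint64_t a, uint64_t b, uint64_t m)
    {
        return ((a % m) * (b % m)) % m;
    }

    /* Left-to-right fixed-window exponentiation, 6-bit window. */
    static uint64_t modexp_win6(uint64_t a, uint64_t e, uint64_t m)
    {
        uint64_t t[64];
        uint64_t r;
        int bits;
        int i;
        int j;

        /* t[i] = a^i mod m for i = 0..63. */
        t[0] = 1 % m;
        for (i = 1; i < 64; i++) {
            t[i] = modmul(t[i - 1], a, m);
        }

        /* Start from the 6-bit aligned window holding the top set bit. */
        for (bits = 63; bits > 0 && ((e >> bits) & 1) == 0; bits--) { }
        i = (bits / 6) * 6;
        r = t[(e >> i) & 0x3f];

        /* Remaining windows: six squarings, then one multiply. A zero window
         * still multiplies (by t[0] = 1), keeping the sequence of operations
         * independent of the exponent bits. */
        for (i -= 6; i >= 0; i -= 6) {
            for (j = 0; j < 6; j++) {
                r = modmul(r, r, m);
            }
            r = modmul(r, t[(e >> i) & 0x3f], m);
        }
        return r;
    }

    int main(void)
    {
        /* 561 is a Carmichael number, so 7^560 mod 561 should print 1. */
        printf("%llu\n", (unsigned long long)modexp_win6(7, 560, 561));
        return 0;
    }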
(r = a^e mod m) * @@ -1397,9 +1464,9 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) sp_digit* td = NULL; #else - sp_digit td[(17 * 64) + 64]; + sp_digit td[(65 * 64) + 64]; #endif - sp_digit* t[16]; + sp_digit* t[64]; sp_digit* rt = NULL; sp_digit* norm; sp_digit mp = 1; @@ -1421,7 +1488,7 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) if (err == MP_OKAY) { - td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (17 * 64) + 64, NULL, + td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (65 * 64) + 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (td == NULL) { err = MEMORY_E; @@ -1431,15 +1498,9 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) - for (i=0; i<16; i++) + for (i=0; i<64; i++) t[i] = td + i * 64; - rt = td + 1024; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 64]; - rt = &td[1024]; -#endif + rt = td + 4096; sp_2048_mont_setup(m, &mp); sp_2048_mont_norm_32(norm, m); @@ -1471,6 +1532,54 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi sp_2048_mont_mul_avx2_32(t[13], t[ 7], t[ 6], m, mp); sp_2048_mont_sqr_avx2_32(t[14], t[ 7], m, mp); sp_2048_mont_mul_avx2_32(t[15], t[ 8], t[ 7], m, mp); + sp_2048_mont_sqr_avx2_32(t[16], t[ 8], m, mp); + sp_2048_mont_mul_avx2_32(t[17], t[ 9], t[ 8], m, mp); + sp_2048_mont_sqr_avx2_32(t[18], t[ 9], m, mp); + sp_2048_mont_mul_avx2_32(t[19], t[10], t[ 9], m, mp); + sp_2048_mont_sqr_avx2_32(t[20], t[10], m, mp); + sp_2048_mont_mul_avx2_32(t[21], t[11], t[10], m, mp); + sp_2048_mont_sqr_avx2_32(t[22], t[11], m, mp); + sp_2048_mont_mul_avx2_32(t[23], t[12], t[11], m, mp); + sp_2048_mont_sqr_avx2_32(t[24], t[12], m, mp); + sp_2048_mont_mul_avx2_32(t[25], t[13], t[12], m, mp); + sp_2048_mont_sqr_avx2_32(t[26], t[13], m, mp); + sp_2048_mont_mul_avx2_32(t[27], t[14], t[13], m, mp); + sp_2048_mont_sqr_avx2_32(t[28], t[14], m, mp); + sp_2048_mont_mul_avx2_32(t[29], t[15], t[14], m, mp); + sp_2048_mont_sqr_avx2_32(t[30], t[15], m, mp); + sp_2048_mont_mul_avx2_32(t[31], t[16], t[15], m, mp); + sp_2048_mont_sqr_avx2_32(t[32], t[16], m, mp); + sp_2048_mont_mul_avx2_32(t[33], t[17], t[16], m, mp); + sp_2048_mont_sqr_avx2_32(t[34], t[17], m, mp); + sp_2048_mont_mul_avx2_32(t[35], t[18], t[17], m, mp); + sp_2048_mont_sqr_avx2_32(t[36], t[18], m, mp); + sp_2048_mont_mul_avx2_32(t[37], t[19], t[18], m, mp); + sp_2048_mont_sqr_avx2_32(t[38], t[19], m, mp); + sp_2048_mont_mul_avx2_32(t[39], t[20], t[19], m, mp); + sp_2048_mont_sqr_avx2_32(t[40], t[20], m, mp); + sp_2048_mont_mul_avx2_32(t[41], t[21], t[20], m, mp); + sp_2048_mont_sqr_avx2_32(t[42], t[21], m, mp); + sp_2048_mont_mul_avx2_32(t[43], t[22], t[21], m, mp); + sp_2048_mont_sqr_avx2_32(t[44], t[22], m, mp); + sp_2048_mont_mul_avx2_32(t[45], t[23], t[22], m, mp); + sp_2048_mont_sqr_avx2_32(t[46], t[23], m, mp); + sp_2048_mont_mul_avx2_32(t[47], t[24], t[23], m, mp); + sp_2048_mont_sqr_avx2_32(t[48], t[24], m, mp); + sp_2048_mont_mul_avx2_32(t[49], t[25], t[24], m, mp); + sp_2048_mont_sqr_avx2_32(t[50], t[25], m, mp); + sp_2048_mont_mul_avx2_32(t[51], t[26], t[25], m, mp); + sp_2048_mont_sqr_avx2_32(t[52], t[26], m, mp); + sp_2048_mont_mul_avx2_32(t[53], t[27], t[26], m, mp); + sp_2048_mont_sqr_avx2_32(t[54], t[27], m, mp); + sp_2048_mont_mul_avx2_32(t[55], t[28], 
t[27], m, mp); + sp_2048_mont_sqr_avx2_32(t[56], t[28], m, mp); + sp_2048_mont_mul_avx2_32(t[57], t[29], t[28], m, mp); + sp_2048_mont_sqr_avx2_32(t[58], t[29], m, mp); + sp_2048_mont_mul_avx2_32(t[59], t[30], t[29], m, mp); + sp_2048_mont_sqr_avx2_32(t[60], t[30], m, mp); + sp_2048_mont_mul_avx2_32(t[61], t[31], t[30], m, mp); + sp_2048_mont_sqr_avx2_32(t[62], t[31], m, mp); + sp_2048_mont_mul_avx2_32(t[63], t[32], t[31], m, mp); i = (bits - 1) / 64; n = e[i--]; @@ -1479,12 +1588,12 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi if (c == 0) { c = 64; } - /* Minus the number of top bits to use so rest is a multiple of 4. */ - if ((bits % 4) == 0) { - c -= 4; + /* Minus the number of top bits to use so rest is a multiple of 6. */ + if ((bits % 6) == 0) { + c -= 6; } else { - c -= bits % 4; + c -= bits % 6; } if (c < 0) { /* Number of bits in top word is less than number needed. */ @@ -1503,23 +1612,27 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_avx2_32(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 32); - for (; i>=0 || c>=4; ) { - if (c >= 4) { - y = (byte)((n >> 60) & 0xf); - n <<= 4; - c -= 4; + #endif + for (; i>=0 || c>=6; ) { + if (c >= 6) { + y = (byte)((n >> 58) & 0x3f); + n <<= 6; + c -= 6; } else if (c == 0) { n = e[i--]; - y = (byte)(n >> 60); - n <<= 4; - c = 60; + y = (byte)(n >> 58); + n <<= 6; + c = 58; } else { - y = (byte)(n >> 60); + y = (byte)(n >> 58); n = e[i--]; - c = 4 - c; + c = 6 - c; y |= (byte)(n >> (64 - c)); n <<= c; c = 64 - c; @@ -1533,8 +1646,16 @@ static int sp_2048_mod_exp_avx2_32(sp_digit* r, const sp_digit* a, const sp_digi sp_2048_mont_reduce_avx2_32(rt, m, mp); sp_2048_sqr_avx2_32(r, rt); sp_2048_mont_reduce_avx2_32(r, m, mp); - - sp_2048_mul_avx2_32(r, r, t[y]); + sp_2048_sqr_avx2_32(rt, r); + sp_2048_mont_reduce_avx2_32(rt, m, mp); + sp_2048_sqr_avx2_32(r, rt); + sp_2048_mont_reduce_avx2_32(r, m, mp); + #ifndef WC_NO_CACHE_RESISTANT + sp_2048_get_from_table_avx2_32(rt, t, y); + sp_2048_mul_avx2_32(r, r, rt); + #else + sp_2048_mul_avx2_32(r, r, t[y]); + #endif sp_2048_mont_reduce_avx2_32(r, m, mp); } @@ -2814,12 +2935,10 @@ extern void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a, const sp_digit* extern void sp_3072_sqr_12(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_avx2_12(sp_digit* r, const sp_digit* a); -extern sp_digit sp_3072_dbl_12(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_24(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a); -extern sp_digit sp_3072_dbl_24(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_48(sp_digit* r, const sp_digit* a); extern void sp_3072_sqr_avx2_48(sp_digit* r, const sp_digit* a); @@ -3052,6 +3171,8 @@ static WC_INLINE int sp_3072_mod_24(sp_digit* r, const sp_digit* a, return sp_3072_div_24(a, m, NULL, r); } +extern void sp_3072_get_from_table_24(sp_digit* r, sp_digit** table, int idx); + /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
@@ -3103,15 +3224,9 @@ static int sp_3072_mod_exp_24(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<32; i++) t[i] = td + i * 48; rt = td + 1536; -#else - for (i=0; i<32; i++) - t[i] = &td[i * 48]; - rt = &td[1536]; -#endif sp_3072_mont_setup(m, &mp); sp_3072_mont_norm_24(norm, m); @@ -3191,7 +3306,11 @@ static int sp_3072_mod_exp_24(sp_digit* r, const sp_digit* a, const sp_digit* e, y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_24(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 24); + #endif for (; i>=0 || c>=5; ) { if (c >= 5) { y = (byte)((n >> 59) & 0x1f); @@ -3224,7 +3343,12 @@ static int sp_3072_mod_exp_24(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_sqr_24(rt, r); sp_3072_mont_reduce_24(rt, m, mp); - sp_3072_mul_24(r, rt, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_24(r, t, y); + sp_3072_mul_24(r, rt, r); + #else + sp_3072_mul_24(r, rt, t[y]); + #endif sp_3072_mont_reduce_24(r, m, mp); } @@ -3278,6 +3402,8 @@ SP_NOINLINE static void sp_3072_mont_sqr_avx2_24(sp_digit* r, const sp_digit* a, } #endif /* HAVE_INTEL_AVX2 */ +extern void sp_3072_get_from_table_avx2_24(sp_digit* r, sp_digit** table, int idx); + #ifdef HAVE_INTEL_AVX2 /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -3330,15 +3456,9 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<32; i++) t[i] = td + i * 48; rt = td + 1536; -#else - for (i=0; i<32; i++) - t[i] = &td[i * 48]; - rt = &td[1536]; -#endif sp_3072_mont_setup(m, &mp); sp_3072_mont_norm_24(norm, m); @@ -3418,7 +3538,11 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, const sp_digit* a, const sp_digi y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_avx2_24(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 24); + #endif for (; i>=0 || c>=5; ) { if (c >= 5) { y = (byte)((n >> 59) & 0x1f); @@ -3451,7 +3575,12 @@ static int sp_3072_mod_exp_avx2_24(sp_digit* r, const sp_digit* a, const sp_digi sp_3072_sqr_avx2_24(rt, r); sp_3072_mont_reduce_avx2_24(rt, m, mp); - sp_3072_mul_avx2_24(r, rt, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_avx2_24(r, t, y); + sp_3072_mul_avx2_24(r, rt, r); + #else + sp_3072_mul_avx2_24(r, rt, t[y]); + #endif sp_3072_mont_reduce_avx2_24(r, m, mp); } @@ -3765,6 +3894,8 @@ static WC_INLINE int sp_3072_mod_48(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ +extern void sp_3072_get_from_table_48(sp_digit* r, sp_digit** table, int idx); + /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
@@ -3816,15 +3947,9 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<16; i++) t[i] = td + i * 96; rt = td + 1536; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 96]; - rt = &td[1536]; -#endif sp_3072_mont_setup(m, &mp); sp_3072_mont_norm_48(norm, m); @@ -3888,7 +4013,11 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_48(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 48); + #endif for (; i>=0 || c>=4; ) { if (c >= 4) { y = (byte)((n >> 60) & 0xf); @@ -3918,8 +4047,12 @@ static int sp_3072_mod_exp_48(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_3072_mont_reduce_48(rt, m, mp); sp_3072_sqr_48(r, rt); sp_3072_mont_reduce_48(r, m, mp); - - sp_3072_mul_48(r, r, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_48(rt, t, y); + sp_3072_mul_48(r, r, rt); + #else + sp_3072_mul_48(r, r, t[y]); + #endif sp_3072_mont_reduce_48(r, m, mp); } @@ -3975,6 +4108,8 @@ SP_NOINLINE static void sp_3072_mont_sqr_avx2_48(sp_digit* r, const sp_digit* a, #endif /* HAVE_INTEL_AVX2 */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern void sp_3072_get_from_table_avx2_48(sp_digit* r, sp_digit** table, int idx); + #ifdef HAVE_INTEL_AVX2 /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -4027,15 +4162,9 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<16; i++) t[i] = td + i * 96; rt = td + 1536; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 96]; - rt = &td[1536]; -#endif sp_3072_mont_setup(m, &mp); sp_3072_mont_norm_48(norm, m); @@ -4099,7 +4228,11 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_avx2_48(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 48); + #endif for (; i>=0 || c>=4; ) { if (c >= 4) { y = (byte)((n >> 60) & 0xf); @@ -4129,8 +4262,12 @@ static int sp_3072_mod_exp_avx2_48(sp_digit* r, const sp_digit* a, const sp_digi sp_3072_mont_reduce_avx2_48(rt, m, mp); sp_3072_sqr_avx2_48(r, rt); sp_3072_mont_reduce_avx2_48(r, m, mp); - - sp_3072_mul_avx2_48(r, r, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_3072_get_from_table_avx2_48(rt, t, y); + sp_3072_mul_avx2_48(r, r, rt); + #else + sp_3072_mul_avx2_48(r, r, t[y]); + #endif sp_3072_mont_reduce_avx2_48(r, m, mp); } @@ -5399,7 +5536,6 @@ extern void sp_4096_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b); extern void sp_4096_mul_avx2_64(sp_digit* r, const sp_digit* a, const sp_digit* b); -extern sp_digit sp_2048_dbl_32(sp_digit* r, const sp_digit* a); extern void sp_4096_sqr_64(sp_digit* r, const sp_digit* a); extern void sp_4096_sqr_avx2_64(sp_digit* r, const sp_digit* a); @@ -5718,6 +5854,8 @@ static WC_INLINE int sp_4096_mod_64(sp_digit* r, const sp_digit* a, } #endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ +extern void sp_4096_get_from_table_64(sp_digit* r, sp_digit** table, int idx); + /* Modular exponentiate a to the e mod m. (r = a^e mod m) * * r A single precision number that is the result of the operation. 
@@ -5769,15 +5907,9 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<16; i++) t[i] = td + i * 128; rt = td + 2048; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 128]; - rt = &td[2048]; -#endif sp_4096_mont_setup(m, &mp); sp_4096_mont_norm_64(norm, m); @@ -5841,7 +5973,11 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_4096_get_from_table_64(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + #endif for (; i>=0 || c>=4; ) { if (c >= 4) { y = (byte)((n >> 60) & 0xf); @@ -5871,8 +6007,12 @@ static int sp_4096_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e, sp_4096_mont_reduce_64(rt, m, mp); sp_4096_sqr_64(r, rt); sp_4096_mont_reduce_64(r, m, mp); - - sp_4096_mul_64(r, r, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_4096_get_from_table_64(rt, t, y); + sp_4096_mul_64(r, r, rt); + #else + sp_4096_mul_64(r, r, t[y]); + #endif sp_4096_mont_reduce_64(r, m, mp); } @@ -5928,6 +6068,8 @@ SP_NOINLINE static void sp_4096_mont_sqr_avx2_64(sp_digit* r, const sp_digit* a, #endif /* HAVE_INTEL_AVX2 */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +extern void sp_4096_get_from_table_avx2_64(sp_digit* r, sp_digit** table, int idx); + #ifdef HAVE_INTEL_AVX2 /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -5980,15 +6122,9 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi if (err == MP_OKAY) { norm = td; -#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) for (i=0; i<16; i++) t[i] = td + i * 128; rt = td + 2048; -#else - for (i=0; i<16; i++) - t[i] = &td[i * 128]; - rt = &td[2048]; -#endif sp_4096_mont_setup(m, &mp); sp_4096_mont_norm_64(norm, m); @@ -6052,7 +6188,11 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi y = (byte)(n >> c); n <<= 64 - c; } + #ifndef WC_NO_CACHE_RESISTANT + sp_4096_get_from_table_avx2_64(r, t, y); + #else XMEMCPY(r, t[y], sizeof(sp_digit) * 64); + #endif for (; i>=0 || c>=4; ) { if (c >= 4) { y = (byte)((n >> 60) & 0xf); @@ -6082,8 +6222,12 @@ static int sp_4096_mod_exp_avx2_64(sp_digit* r, const sp_digit* a, const sp_digi sp_4096_mont_reduce_avx2_64(rt, m, mp); sp_4096_sqr_avx2_64(r, rt); sp_4096_mont_reduce_avx2_64(r, m, mp); - - sp_4096_mul_avx2_64(r, r, t[y]); + #ifndef WC_NO_CACHE_RESISTANT + sp_4096_get_from_table_avx2_64(rt, t, y); + sp_4096_mul_avx2_64(r, r, rt); + #else + sp_4096_mul_avx2_64(r, r, t[y]); + #endif sp_4096_mont_reduce_avx2_64(r, m, mp); } @@ -7575,9 +7719,8 @@ extern sp_int64 sp_256_cmp_4(const sp_digit* a, const sp_digit* b); #define sp_256_norm_4(a) extern sp_digit sp_256_cond_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); -#define sp_256_mont_reduce_order_4 sp_256_mont_reduce_4 - extern void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m, sp_digit mp); +extern void sp_256_mont_reduce_order_4(sp_digit* a, const sp_digit* m, sp_digit mp); /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. 
@@ -7616,7 +7759,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_4(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -7625,6 +7768,7 @@ extern void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b, extern void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +extern void sp_256_mont_sub_lower_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. * @@ -7741,7 +7885,7 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -7767,7 +7911,8 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; @@ -7814,13 +7959,16 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_d /* X = X - Y */ sp_256_mont_sub_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_4(y, y, x, p256_mod); + sp_256_mont_sub_lower_4(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ sp_256_mont_sub_4(y, y, t2, p256_mod); } +extern void sp_256_mont_tpl_lower_4(sp_digit* r, const sp_digit* a, const sp_digit* m); +extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
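Unlike the 521-bit and 1024-bit code, which maps the _lower names onto the existing routines with #define, the P-256 path above declares them extern, together with two fused helpers. Judging from the call sites in the hunk that follows and from the dbl/sub pairs they replace, sp_256_mont_sub_dbl_4(r, a, b, m) computes r = a - 2*b mod m (used for X = A^2 - 2*B) and sp_256_mont_dbl_sub_4(r, a, b, m) computes r = 2*(a - b) mod m (used for b = 2*(B - X)); the actual implementations are in the accompanying assembly, not in this diff. The scalar sketch below spells out those two operations; sub_dbl_mod and dbl_sub_mod are illustrative names and the operands are assumed already reduced.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t sub_mod(uint64_t a, uint64_t b, uint64_t m)
    {
        return (a >= b) ? (a - b) : (a + m - b);
    }

    static uint64_t dbl_mod(uint64_t a, uint64_t m)
    {
        uint64_t t = a << 1;             /* assumes a < m <= 2^63 */
        return (t >= m) ? (t - m) : t;
    }

    /* r = a - 2*b mod m, the former mont_dbl + mont_sub pair. */
    static uint64_t sub_dbl_mod(uint64_t a, uint64_t b, uint64_t m)
    {
        return sub_mod(a, dbl_mod(b, m), m);
    }

    /* r = 2*(a - b) mod m, used for b = 2*(B - X). */
    static uint64_t dbl_sub_mod(uint64_t a, uint64_t b, uint64_t m)
    {
        return dbl_mod(sub_mod(a, b, m), m);
    }

    int main(void)
    {
        uint64_t m = 1000003;            /* small prime stand-in for p256 */
        uint64_t a = 123456;
        uint64_t b = 654321;
        printf("a-2b=%llu 2(a-b)=%llu\n",
               (unsigned long long)sub_dbl_mod(a, b, m),
               (unsigned long long)dbl_sub_mod(a, b, m));
        return 0;
    }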
@@ -7835,7 +7983,6 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int n, sp_digit* a = t + 2*4; sp_digit* b = t + 4*4; sp_digit* t1 = t + 6*4; - sp_digit* t2 = t + 8*4; sp_digit* x; sp_digit* y; sp_digit* z; @@ -7859,17 +8006,18 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_4(t2, b, p256_mod); - sp_256_mont_sub_4(x, x, t2, p256_mod); + sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_4(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -7879,31 +8027,28 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int n, sp_256_mont_mul_4(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_4(y, b, x, p256_mod); - sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_4(y, y, p256_mod); + sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_4(t2, b, p256_mod); - sp_256_mont_sub_4(x, x, t2, p256_mod); + sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_4(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_4(y, b, x, p256_mod); - sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_4(y, y, p256_mod); + sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -7942,6 +8087,7 @@ typedef struct sp_256_proj_point_add_4_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -7970,6 +8116,10 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->t3 = t + 4*4; ctx->t4 = t + 6*4; ctx->t5 = t + 8*4; + ctx->t6 = t + 10*4; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -7994,29 +8144,6 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = 
ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -8030,16 +8157,16 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 6; break; case 6: - sp_256_mont_mul_4(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_4(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -8048,7 +8175,7 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -8067,29 +8194,29 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_4(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -8097,24 +8224,24 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 21; break; case 21: - sp_256_mont_dbl_4(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_4(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -8122,9 +8249,30 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt 
= ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -8136,24 +8284,13 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, static void sp_256_proj_point_add_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_4(t1, p256_mod, q->y); @@ -8163,60 +8300,60 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_4(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_4(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, x, t5, p256_mod); - sp_256_mont_dbl_4(t1, y, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_4(y, y, x, 
p256_mod); - sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_sub_dbl_4(x, x, y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -8234,7 +8371,6 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r, sp_digit* a = t + 2*4; sp_digit* b = t + 4*4; sp_digit* t1 = t + 6*4; - sp_digit* t2 = t + 8*4; sp_digit* x = r[2*m].x; sp_digit* y = r[(1<x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, xa, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, za, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, za, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, t3, ya, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ @@ -8345,30 +8480,30 @@ static void sp_256_proj_point_add_sub_4(sp_point_256* ra, sp_256_mont_sub_4(t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_256_mont_mul_4(za, za, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(za, za, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(xa, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(xs, t6, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(ya, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(x, x, t5, p256_mod); + sp_256_mont_sub_4(xa, xa, t5, p256_mod); sp_256_mont_sub_4(xs, xs, t5, p256_mod); - sp_256_mont_dbl_4(t1, y, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); + sp_256_mont_dbl_4(t1, ya, p256_mod); + sp_256_mont_sub_4(xa, xa, t1, p256_mod); sp_256_mont_sub_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - 
S1*H^3 */ - sp_256_mont_sub_4(ys, y, xs, p256_mod); - sp_256_mont_sub_4(y, y, x, p256_mod); - sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_lower_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_lower_4(ya, ya, xa, p256_mod); + sp_256_mont_mul_4(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_4(ys, ys, t6, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(y, y, t5, p256_mod); + sp_256_mont_sub_4(ya, ya, t5, p256_mod); sp_256_mont_sub_4(ys, ys, t5, p256_mod); } @@ -8684,9 +8819,8 @@ static void sp_256_mont_inv_avx2_4(sp_digit* r, const sp_digit* a, sp_digit* td) } extern sp_digit sp_256_cond_sub_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); -#define sp_256_mont_reduce_order_avx2_4 sp_256_mont_reduce_avx2_4 - -extern void sp_256_mont_reduce_avx2_4(sp_digit* a, const sp_digit* m, sp_digit mp); +#define sp_256_mont_reduce_avx2_4 sp_256_mont_reduce_4 +extern void sp_256_mont_reduce_avx2_order_4(sp_digit* a, const sp_digit* m, sp_digit mp); /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -8725,7 +8859,7 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p, (sp_digit)1 : (sp_digit)0)); sp_256_norm_4(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -8734,6 +8868,7 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p, #define sp_256_mont_dbl_avx2_4 sp_256_mont_dbl_4 #define sp_256_mont_tpl_avx2_4 sp_256_mont_tpl_4 #define sp_256_mont_sub_avx2_4 sp_256_mont_sub_4 +#define sp_256_mont_sub_lower_avx2_4 sp_256_mont_sub_lower_4 extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. * @@ -8850,7 +8985,7 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 16: /* Y = Y - X */ - sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -8876,7 +9011,8 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p, sp_digit* t) +static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*4; @@ -8923,13 +9059,16 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_avx2_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_avx2_4(y, y, x, p256_mod); + sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ sp_256_mont_sub_avx2_4(y, y, t2, p256_mod); } +#define sp_256_mont_tpl_lower_avx2_4 sp_256_mont_tpl_lower_4 +#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4 +#define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
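The rewritten point-add routines above select the output with bit masks instead of the old rp[]/ap[] pointer tables, which also removes the need to swap p and q when q aliases r. The standalone sketch below shows the selection pattern on four 64-bit limbs; the limb4 type and select_point4 name are illustrative assumptions, not wolfSSL API.

    #include <stdint.h>
    #include <string.h>

    typedef uint64_t limb4[4];

    /* choose P when Q is infinity, Q when P is infinity, otherwise the
     * computed result T, using all-ones/all-zero masks and no branches */
    static void select_point4(limb4 r, const limb4 p, const limb4 q,
                              const limb4 t, int p_inf, int q_inf)
    {
        uint64_t maskp = 0 - (uint64_t)(q_inf & (!p_inf));
        uint64_t maskq = 0 - (uint64_t)(p_inf & (!q_inf));
        uint64_t maskt = ~(maskp | maskq);
        int i;

        for (i = 0; i < 4; i++) {
            r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
        }
    }

    int main(void)
    {
        limb4 p = {1, 2, 3, 4};
        limb4 q = {5, 6, 7, 8};
        limb4 t = {9, 10, 11, 12};
        limb4 r;

        select_point4(r, p, q, t, 1, 0);   /* P is infinity, so expect Q */
        return memcmp(r, q, sizeof(r)) != 0;
    }

Because every limb of all three candidates is always read and combined, the choice leaks neither through branches nor through the memory access pattern.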
@@ -8944,7 +9083,6 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int n, sp_digit* a = t + 2*4; sp_digit* b = t + 4*4; sp_digit* t1 = t + 6*4; - sp_digit* t2 = t + 8*4; sp_digit* x; sp_digit* y; sp_digit* z; @@ -8968,17 +9106,18 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int n, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_avx2_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_avx2_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_avx2_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_avx2_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_avx2_4(t2, b, p256_mod); - sp_256_mont_sub_avx2_4(x, x, t2, p256_mod); + sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_avx2_4(t1, t1, p256_mod, p256_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -8988,31 +9127,28 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int n, sp_256_mont_mul_avx2_4(w, w, t1, p256_mod, p256_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_avx2_4(y, b, x, p256_mod); - sp_256_mont_mul_avx2_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_avx2_4(y, y, p256_mod); + sp_256_mont_mul_avx2_4(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, y, t1, p256_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_256_mont_sqr_avx2_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_avx2_4(a, t1, p256_mod); + sp_256_mont_tpl_lower_avx2_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_avx2_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_avx2_4(t2, b, p256_mod); - sp_256_mont_sub_avx2_4(x, x, t2, p256_mod); + sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + /* b = 2.(B - X) */ + sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_256_mont_sqr_avx2_4(t1, t1, p256_mod, p256_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_256_mont_sub_avx2_4(y, b, x, p256_mod); - sp_256_mont_mul_avx2_4(y, y, a, p256_mod, p256_mp_mod); - sp_256_mont_dbl_avx2_4(y, y, p256_mod); + sp_256_mont_mul_avx2_4(y, b, a, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, y, t1, p256_mod); #endif /* Y = Y/2 */ @@ -9038,6 +9174,7 @@ typedef struct sp_256_proj_point_add_avx2_4_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -9066,6 +9203,10 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->t3 = t + 4*4; ctx->t4 = t + 6*4; ctx->t5 = t + 8*4; + ctx->t6 = t + 10*4; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -9090,29 +9231,6 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_256)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - 
ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -9126,16 +9244,16 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 6; break; case 6: - sp_256_mont_mul_avx2_4(ctx->t1, ctx->t1, ctx->x, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_avx2_4(ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 8; break; case 8: - sp_256_mont_mul_avx2_4(ctx->t4, ctx->t2, ctx->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod); ctx->state = 9; break; case 9: @@ -9144,7 +9262,7 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 10: /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_avx2_4(ctx->t3, ctx->t3, ctx->y, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod); ctx->state = 11; break; case 11: @@ -9163,29 +9281,29 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_avx2_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_256_mont_sqr_avx2_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 15; break; case 15: - sp_256_mont_mul_avx2_4(ctx->z, ctx->z, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_avx2_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 17; break; case 17: - sp_256_mont_sqr_avx2_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_avx2_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod); ctx->state = 18; break; case 18: - sp_256_mont_mul_avx2_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod); ctx->state = 19; break; case 19: - sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 20; break; case 20: @@ -9193,24 +9311,24 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 21; break; case 21: - sp_256_mont_dbl_avx2_4(ctx->t1, ctx->y, p256_mod); + sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); ctx->state = 22; break; case 22: - sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->t1, p256_mod); + sp_256_mont_dbl_avx2_4(ctx->t3, ctx->y, p256_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->t3, p256_mod); ctx->state = 24; break; case 24: - sp_256_mont_mul_avx2_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 25; break; case 25: - sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod); + 
sp_256_mont_mul_avx2_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod); ctx->state = 26; break; case 26: @@ -9218,9 +9336,30 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -9232,24 +9371,13 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r static void sp_256_proj_point_add_avx2_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_256* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_256_sub_4(t1, p256_mod, q->y); @@ -9259,60 +9387,60 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r, sp_256_proj_point_dbl_avx2_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t1, t1, p->x, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_avx2_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_avx2_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t3, t3, p->y, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod); /* R = S2 - S1 */ sp_256_mont_sub_avx2_4(t4, t4, t3, p256_mod); - /* Z3 = H*Z1*Z2 */ - sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(z, z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(t5, t2, 
p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(y, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_256_mont_mul_avx2_4(z, p->z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(x, x, t5, p256_mod); - sp_256_mont_dbl_avx2_4(t1, y, p256_mod); - sp_256_mont_sub_avx2_4(x, x, t1, p256_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_avx2_4(y, y, x, p256_mod); - sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod); + sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod); + sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, y, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -9330,7 +9458,6 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r, sp_digit* a = t + 2*4; sp_digit* b = t + 4*4; sp_digit* t1 = t + 6*4; - sp_digit* t2 = t + 8*4; sp_digit* x = r[2*m].x; sp_digit* y = r[(1<x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t3, t1, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t1, t1, x, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t1, t1, xa, p256_mod, p256_mp_mod); /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_avx2_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(t2, za, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t4, t2, za, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S1 = Y1*Z2^3 */ - sp_256_mont_mul_avx2_4(t3, t3, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t3, t3, ya, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ @@ -9441,30 +9567,30 @@ static void sp_256_proj_point_add_sub_avx2_4(sp_point_256* ra, sp_256_mont_sub_avx2_4(t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(z, z, t2, p256_mod, p256_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_256_mont_mul_avx2_4(za, za, q->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(za, za, t2, p256_mod, p256_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(xa, t4, 
p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(xs, t6, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(y, t1, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(ya, t1, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_avx2_4(x, x, t5, p256_mod); + sp_256_mont_sub_avx2_4(xa, xa, t5, p256_mod); sp_256_mont_sub_avx2_4(xs, xs, t5, p256_mod); - sp_256_mont_dbl_avx2_4(t1, y, p256_mod); - sp_256_mont_sub_avx2_4(x, x, t1, p256_mod); + sp_256_mont_dbl_avx2_4(t1, ya, p256_mod); + sp_256_mont_sub_avx2_4(xa, xa, t1, p256_mod); sp_256_mont_sub_avx2_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_avx2_4(ys, y, xs, p256_mod); - sp_256_mont_sub_avx2_4(y, y, x, p256_mod); - sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_lower_avx2_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_lower_avx2_4(ya, ya, xa, p256_mod); + sp_256_mont_mul_avx2_4(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_avx2_4(ys, ys, t6, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_avx2_4(y, y, t5, p256_mod); + sp_256_mont_sub_avx2_4(ya, ya, t5, p256_mod); sp_256_mont_sub_avx2_4(ys, ys, t5, p256_mod); } @@ -9634,17 +9760,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; /* Check double */ (void)sp_256_sub_4(t1, p256_mod, q->y); @@ -9654,53 +9775,53 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, sp_256_proj_point_dbl_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_4(t2, t2, x, p256_mod); + sp_256_mont_sub_4(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_4(t4, t4, y, p256_mod); + sp_256_mont_sub_4(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_4(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t3, x, t5, p256_mod, p256_mp_mod); + 
sp_256_mont_mul_4(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, t1, t5, p256_mod); - sp_256_mont_dbl_4(t1, t3, p256_mod); - sp_256_mont_sub_4(x, x, t1, p256_mod); + sp_256_mont_sub_dbl_4(x, x, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_4(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_4(t3, t3, x, p256_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -9846,7 +9967,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 5]; + sp_digit t[2 * 4 * 6]; #endif sp_point_256* p = NULL; int i; @@ -9867,7 +9988,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -10048,7 +10169,7 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifndef FP_ECC return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -10098,17 +10219,12 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - const sp_point_256* ap[2]; - sp_point_256* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*4; sp_digit* t3 = t + 4*4; sp_digit* t4 = t + 6*4; sp_digit* t5 = t + 8*4; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*4; /* Check double */ (void)sp_256_sub_4(t1, p256_mod, q->y); @@ -10118,53 +10234,53 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256 sp_256_proj_point_dbl_avx2_4(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_256*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_256)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<4; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<4; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<4; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_256_mont_sqr_avx2_4(t2, z, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t4, t2, z, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t4, t2, p->z, 
p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod); /* S2 = Y2*Z1^3 */ sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - X1 */ - sp_256_mont_sub_avx2_4(t2, t2, x, p256_mod); + sp_256_mont_sub_avx2_4(t2, t2, p->x, p256_mod); /* R = S2 - Y1 */ - sp_256_mont_sub_avx2_4(t4, t4, y, p256_mod); + sp_256_mont_sub_avx2_4(t4, t4, p->y, p256_mod); /* Z3 = H*Z1 */ - sp_256_mont_mul_avx2_4(z, z, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_256_mont_sqr_avx2_4(t1, t4, p256_mod, p256_mp_mod); sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t3, x, t5, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t3, p->x, t5, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(x, t1, t5, p256_mod); - sp_256_mont_dbl_avx2_4(t1, t3, p256_mod); - sp_256_mont_sub_avx2_4(x, x, t1, p256_mod); + sp_256_mont_sub_dbl_avx2_4(x, x, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod); + sp_256_mont_sub_lower_avx2_4(t3, t3, x, p256_mod); sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t5, t5, y, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t5, t5, p->y, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, t3, t5, p256_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -10308,7 +10424,7 @@ static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point_256* r, const sp_point_256* sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 5]; + sp_digit t[2 * 4 * 6]; #endif sp_point_256* p = NULL; int i; @@ -10329,7 +10445,7 @@ static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point_256* r, const sp_point_256* if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -10417,7 +10533,7 @@ static int sp_256_ecc_mulmod_avx2_4(sp_point_256* r, const sp_point_256* g, cons #ifndef FP_ECC return sp_256_ecc_mulmod_win_add_sub_avx2_4(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; sp_cache_256_t* cache; int err = MP_OKAY; @@ -10536,7 +10652,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[4 + 4 * 2 * 5]; + sp_digit k[4 + 4 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -10552,7 +10668,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (4 + 4 * 2 * 5), heap, + sizeof(sp_digit) * (4 + 4 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -23039,7 +23155,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; #endif sp_point_256* p = NULL; 
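The scratch buffers in the surrounding hunks grow from 2 * 4 * 5 to 2 * 4 * 6 digits because point addition now carves a sixth double-width temporary (t6 = t + 10*4) out of the same area. A minimal layout sketch, with assumed macro names, is:

    #include <stdint.h>

    #define SP_256_LIMBS  4                         /* a 256-bit value in 64-bit limbs  */
    #define SP_256_TMP    (2 * SP_256_LIMBS * 6)    /* six double-width temporaries     */

    int main(void)
    {
        uint64_t tmp[SP_256_TMP];
        uint64_t* t6 = tmp + 10 * SP_256_LIMBS;     /* the new t6 slot used in the code */

        /* t6 plus its 2*4 digits must still end inside the buffer, which is
         * why every 2 * 4 * 5 scratch area in these hunks grew to 2 * 4 * 6 */
        return !(t6 + 2 * SP_256_LIMBS == tmp + SP_256_TMP);
    }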
sp_digit* negy = NULL; @@ -23058,7 +23174,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -23117,7 +23233,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -23173,7 +23289,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; #endif sp_point_256* p = NULL; sp_digit* negy = NULL; @@ -23192,7 +23308,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -23251,7 +23367,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -23362,7 +23478,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_256 point[2]; - sp_digit k[4 + 4 * 2 * 5]; + sp_digit k[4 + 4 * 2 * 6]; #endif sp_point_256* addP = NULL; sp_digit* tmp = NULL; @@ -23378,7 +23494,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (4 + 4 * 2 * 5), + sizeof(sp_digit) * (4 + 4 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -23990,6 +24106,7 @@ static void sp_256_mont_inv_order_4(sp_digit* r, const sp_digit* a, sp_digit* t = td; sp_digit* t2 = td + 2 * 4; sp_digit* t3 = td + 4 * 4; + sp_digit* t4 = td + 6 * 4; int i; ASSERT_SAVED_VECTOR_REGISTERS(); @@ -23998,10 +24115,10 @@ static void sp_256_mont_inv_order_4(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_order_4(t, a); /* t = a^3 = t * a */ sp_256_mont_mul_order_4(t, t, a); - /* t2= a^c = t ^ 2 ^ 2 */ - sp_256_mont_sqr_n_order_4(t2, t, 2); - /* t3= a^f = t2 * t */ - sp_256_mont_mul_order_4(t3, t2, t); + /* t4= a^c = t ^ 2 ^ 2 */ + sp_256_mont_sqr_n_order_4(t4, t, 2); + /* t3= a^f = t4 * t */ + sp_256_mont_mul_order_4(t3, t4, t); /* t2= a^f0 = t3 ^ 2 ^ 4 */ sp_256_mont_sqr_n_order_4(t2, t3, 4); /* t = a^ff = t2 * t3 */ @@ -24022,8 +24139,18 @@ static void sp_256_mont_inv_order_4(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_4(t2, t2, 32); /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_4(t2, t2, t); + /* t2= a^ffffffff00000000ffffffffffffffffb */ + for (i=127; i>=124; i--) { + sp_256_mont_sqr_order_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbc */ + sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_mul_order_4(t2, t2, t4); /* t2= 
a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + for (i=119; i>=112; i--) { sp_256_mont_sqr_order_4(t2, t2); if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { sp_256_mont_mul_order_4(t2, t2, a); @@ -24042,8 +24169,28 @@ static void sp_256_mont_inv_order_4(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f */ sp_256_mont_sqr_n_order_4(t2, t2, 4); sp_256_mont_mul_order_4(t2, t2, t3); + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9 */ + for (i=59; i>=48; i--) { + sp_256_mont_sqr_order_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9c */ + sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_mul_order_4(t2, t2, t4); + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9ca */ + for (i=43; i>=40; i--) { + sp_256_mont_sqr_order_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac */ + sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_mul_order_4(t2, t2, t4); /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2 */ - for (i=59; i>=32; i--) { + for (i=35; i>=32; i--) { sp_256_mont_sqr_order_4(t2, t2); if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { sp_256_mont_mul_order_4(t2, t2, a); @@ -24170,6 +24317,7 @@ static void sp_256_mont_inv_order_avx2_4(sp_digit* r, const sp_digit* a, sp_digit* t = td; sp_digit* t2 = td + 2 * 4; sp_digit* t3 = td + 4 * 4; + sp_digit* t4 = td + 6 * 4; int i; ASSERT_SAVED_VECTOR_REGISTERS(); @@ -24178,10 +24326,10 @@ static void sp_256_mont_inv_order_avx2_4(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_order_avx2_4(t, a); /* t = a^3 = t * a */ sp_256_mont_mul_order_avx2_4(t, t, a); - /* t2= a^c = t ^ 2 ^ 2 */ - sp_256_mont_sqr_n_order_avx2_4(t2, t, 2); - /* t3= a^f = t2 * t */ - sp_256_mont_mul_order_avx2_4(t3, t2, t); + /* t4= a^c = t ^ 2 ^ 2 */ + sp_256_mont_sqr_n_order_avx2_4(t4, t, 2); + /* t3= a^f = t4 * t */ + sp_256_mont_mul_order_avx2_4(t3, t4, t); /* t2= a^f0 = t3 ^ 2 ^ 4 */ sp_256_mont_sqr_n_order_avx2_4(t2, t3, 4); /* t = a^ff = t2 * t3 */ @@ -24202,8 +24350,18 @@ static void sp_256_mont_inv_order_avx2_4(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_avx2_4(t2, t2, 32); /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_avx2_4(t2, t2, t); + /* t2= a^ffffffff00000000ffffffffffffffffb */ + for (i=127; i>=124; i--) { + sp_256_mont_sqr_order_avx2_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_avx2_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbc */ + sp_256_mont_sqr_n_order_avx2_4(t2, t2, 4); + sp_256_mont_mul_order_avx2_4(t2, t2, t4); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + for (i=119; i>=112; i--) { sp_256_mont_sqr_order_avx2_4(t2, t2); if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { sp_256_mont_mul_order_avx2_4(t2, t2, a); @@ -24222,8 +24380,28 @@ static void sp_256_mont_inv_order_avx2_4(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f */ sp_256_mont_sqr_n_order_avx2_4(t2, t2, 4); sp_256_mont_mul_order_avx2_4(t2, t2, t3); + /* t2= 
a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9 */ + for (i=59; i>=48; i--) { + sp_256_mont_sqr_order_avx2_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_avx2_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9c */ + sp_256_mont_sqr_n_order_avx2_4(t2, t2, 4); + sp_256_mont_mul_order_avx2_4(t2, t2, t4); + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9ca */ + for (i=43; i>=40; i--) { + sp_256_mont_sqr_order_avx2_4(t2, t2); + if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { + sp_256_mont_mul_order_avx2_4(t2, t2, a); + } + } + /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac */ + sp_256_mont_sqr_n_order_avx2_4(t2, t2, 4); + sp_256_mont_mul_order_avx2_4(t2, t2, t4); /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2 */ - for (i=59; i>=32; i--) { + for (i=35; i>=32; i--) { sp_256_mont_sqr_order_avx2_4(t2, t2); if (((sp_digit)p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { sp_256_mont_mul_order_avx2_4(t2, t2, a); @@ -24356,7 +24534,7 @@ typedef struct sp_ecc_sign_256_ctx { sp_digit x[2*4]; sp_digit k[2*4]; sp_digit r[2*4]; - sp_digit tmp[3 * 2*4]; + sp_digit tmp[4 * 2*4]; sp_point_256 point; sp_digit* s; sp_digit* kInv; @@ -24503,7 +24681,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W XMEMSET(ctx->x, 0, sizeof(sp_digit) * 2U * 4U); XMEMSET(ctx->k, 0, sizeof(sp_digit) * 2U * 4U); XMEMSET(ctx->r, 0, sizeof(sp_digit) * 2U * 4U); - XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 3U * 2U * 4U); + XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 4U * 2U * 4U); } return err; @@ -24517,7 +24695,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* e = NULL; sp_point_256* point = NULL; #else - sp_digit e[7 * 2 * 4]; + sp_digit e[8 * 2 * 4]; sp_point_256 point[1]; #endif sp_digit* x = NULL; @@ -24542,7 +24720,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, err = MEMORY_E; } if (err == MP_OKAY) { - e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 7 * 2 * 4, heap, + e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 8 * 2 * 4, heap, DYNAMIC_TYPE_ECC); if (e == NULL) err = MEMORY_E; @@ -24618,7 +24796,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, if (e != NULL) #endif { - ForceZero(e, sizeof(sp_digit) * 7 * 2 * 4); + ForceZero(e, sizeof(sp_digit) * 8 * 2 * 4); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(e, heap, DYNAMIC_TYPE_ECC); #endif @@ -24817,7 +24995,7 @@ typedef struct sp_ecc_verify_256_ctx { sp_digit u1[2*4]; sp_digit u2[2*4]; sp_digit s[2*4]; - sp_digit tmp[2*4 * 5]; + sp_digit tmp[2*4 * 6]; sp_point_256 p1; sp_point_256 p2; } sp_ecc_verify_256_ctx; @@ -24963,7 +25141,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_256* p1 = NULL; #else - sp_digit u1[16 * 4]; + sp_digit u1[18 * 4]; sp_point_256 p1[2]; #endif sp_digit* u2 = NULL; @@ -24985,7 +25163,7 @@ int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 4, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 4, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -25318,7 +25496,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_256* p = NULL; #else - sp_digit tmp[2 * 4 * 5]; + sp_digit tmp[2 * 4 * 6]; 
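The extended chain above still computes a^(n-2) mod n for the curve order n; it appears to reuse t4 = a^0xc for the 0xc nibbles of the exponent so those cost one multiplication after four squarings instead of a bit-by-bit walk. For reference, a plain left-to-right square-and-multiply baseline for the same Fermat inversion looks like the toy below; it uses one 64-bit word and a compiler 128-bit product rather than the 4-limb Montgomery arithmetic of sp_256_mont_inv_order_4, and the small prime stands in for the real group order.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t n)
    {
        return (uint64_t)(((unsigned __int128)a * b) % n);  /* GCC/Clang extension */
    }

    static uint64_t invmod_prime(uint64_t a, uint64_t n)    /* n must be prime */
    {
        uint64_t e = n - 2;
        uint64_t r = 1;
        int i;
        for (i = 63; i >= 0; i--) {
            r = mulmod(r, r, n);                 /* square for every exponent bit */
            if ((e >> i) & 1) {
                r = mulmod(r, a, n);             /* multiply where the bit is set */
            }
        }
        return r;
    }

    int main(void)
    {
        uint64_t n = (1ULL << 61) - 1;           /* small prime standing in for the order */
        uint64_t a = 123456789;
        printf("%d\n", mulmod(a, invmod_prime(a, n), n) == 1);
        return 0;
    }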
sp_point_256 p[2]; #endif sp_point_256* q = NULL; @@ -25335,7 +25513,7 @@ int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -26303,83 +26481,16 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_6(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } -/* Add two Montgomery form numbers (r = a + b % m). - * - * r Result of addition. - * a First number to add in Montgomery form. - * b Second number to add in Montgomery form. - * m Modulus (prime). - */ -static void sp_384_mont_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - sp_digit o; - - ASSERT_SAVED_VECTOR_REGISTERS(); - - o = sp_384_add_6(r, a, b); - sp_384_cond_sub_6(r, r, m, 0 - o); -} - -extern sp_digit sp_384_dbl_6(sp_digit* r, const sp_digit* a); -/* Double a Montgomery form number (r = a + a % m). - * - * r Result of doubling. - * a Number to double in Montgomery form. - * m Modulus (prime). - */ -static void sp_384_mont_dbl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - sp_digit o; - - ASSERT_SAVED_VECTOR_REGISTERS(); - - o = sp_384_dbl_6(r, a); - sp_384_cond_sub_6(r, r, m, 0 - o); -} - -/* Triple a Montgomery form number (r = a + a + a % m). - * - * r Result of Tripling. - * a Number to triple in Montgomery form. - * m Modulus (prime). - */ -static void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, const sp_digit* m) -{ - sp_digit o; - - ASSERT_SAVED_VECTOR_REGISTERS(); - - o = sp_384_dbl_6(r, a); - sp_384_cond_sub_6(r, r, m, 0 - o); - o = sp_384_add_6(r, r, a); - sp_384_cond_sub_6(r, r, m, 0 - o); -} - -extern sp_digit sp_384_cond_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -static void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - sp_digit o; - - ASSERT_SAVED_VECTOR_REGISTERS(); - - o = sp_384_sub_6(r, a, b); - sp_384_cond_add_6(r, r, m, o); -} - +extern void sp_384_mont_add_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +extern void sp_384_mont_dbl_6(sp_digit* r, const sp_digit* a, const sp_digit* m); +extern void sp_384_mont_tpl_6(sp_digit* r, const sp_digit* a, const sp_digit* m); +extern void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +extern void sp_384_mont_sub_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); extern void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. 
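Above, the C fallbacks for sp_384_mont_add_6, _dbl_6, _tpl_6 and _sub_6 are dropped in favour of extern assembly versions, but the pattern they implement is the usual add (or subtract) followed by a mask-selected conditional reduction. A single-word toy of that pattern, with an assumed 61-bit modulus, is:

    #include <stdint.h>

    static uint64_t mont_add_1(uint64_t a, uint64_t b, uint64_t m)
    {
        uint64_t r = a + b;                       /* a, b < m < 2^63 in this toy   */
        uint64_t over = 0 - (uint64_t)(r >= m);   /* all-ones mask when r >= m     */
        return r - (m & over);                    /* conditional subtraction       */
    }

    static uint64_t mont_sub_1(uint64_t a, uint64_t b, uint64_t m)
    {
        uint64_t r = a - b;                       /* may wrap below zero           */
        uint64_t under = 0 - (uint64_t)(a < b);   /* all-ones mask on borrow       */
        return r + (m & under);                   /* conditional addition          */
    }

    int main(void)
    {
        uint64_t m = (1ULL << 61) - 1;
        int ok = (mont_add_1(m - 1, 5, m) == 4) && (mont_sub_1(3, 7, m) == m - 4);
        return !ok;
    }

The multi-limb originals do the same thing, except the condition comes from the carry or borrow returned by sp_384_add_6 or sp_384_sub_6 rather than from a comparison.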
* @@ -26496,7 +26607,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -26522,7 +26633,8 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; @@ -26569,13 +26681,15 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, sp_d /* X = X - Y */ sp_384_mont_sub_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_6(y, y, x, p384_mod); + sp_384_mont_sub_lower_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ sp_384_mont_sub_6(y, y, t2, p384_mod); } +extern void sp_384_mont_dbl_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* m); +extern void sp_384_mont_tpl_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -26614,7 +26728,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -26622,9 +26736,12 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -26634,16 +26751,14 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_mul_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -26651,14 +26766,15 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int n, sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, 
p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -26697,6 +26813,7 @@ typedef struct sp_384_proj_point_add_6_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -26725,6 +26842,10 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->t3 = t + 4*6; ctx->t4 = t + 6*6; ctx->t5 = t + 8*6; + ctx->t6 = t + 10*6; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -26749,29 +26870,6 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -26785,16 +26883,16 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 6; break; case 6: - sp_384_mont_mul_6(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_6(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -26803,7 +26901,7 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_6(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -26822,29 +26920,29 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_6(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ 
-26852,24 +26950,24 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 21; break; case 21: - sp_384_mont_dbl_6(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 26; break; case 26: @@ -26877,9 +26975,30 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -26891,24 +27010,13 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, static void sp_384_proj_point_add_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -26918,60 +27026,61 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_6(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_6(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(x, x, t5, p384_mod); - sp_384_mont_dbl_6(t1, y, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_6(y, y, x, p384_mod); - sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_6(t3, y, p384_mod); + sp_384_mont_sub_6(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -27018,29 +27127,30 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); 
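The rewritten sp_384_proj_point_add_6 above no longer routes the infinity handling through the old rp[]/ap[] pointer tables; it always computes the sum into the t1/t2/t6 scratch words and then combines the three candidates (p, q, or the computed point) with arithmetic masks, so neither the branch pattern nor the memory access pattern depends on whether an input is the point at infinity. Below is a minimal stand-alone sketch of just that selection step; the name select_point_6 and the uint64_t stand-in for sp_digit are illustrative, not part of the patch.

#include <stdint.h>

/* Constant-time combine of three candidate 6-limb coordinates, mirroring
 * the maskp/maskq/maskt pattern used above.
 *   maskp: all ones when q is infinity and p is not  -> result is p
 *   maskq: all ones when p is infinity and q is not  -> result is q
 *   maskt: all ones otherwise                        -> result is the sum t
 * p_inf and q_inf must be 0 or 1. */
static void select_point_6(uint64_t r[6], const uint64_t p[6],
                           const uint64_t q[6], const uint64_t t[6],
                           int p_inf, int q_inf)
{
    uint64_t maskp = 0 - (uint64_t)(q_inf & (!p_inf));
    uint64_t maskq = 0 - (uint64_t)(p_inf & (!q_inf));
    uint64_t maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < 6; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

When both inputs are at infinity, both per-point masks are zero and maskt is all ones, so the combined limbs come from the scratch computation; the routine then forces r->infinity (and ORs the flag into r->z[0]) so the result still represents infinity.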
sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_6(a, t1, p384_mod); /* B = X*Y^2 */ - sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(b, t2, x, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(t1, b, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_dbl_6(t2, b, p384_mod); + sp_384_mont_sub_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_384_mont_sqr_6(t2, t2, p384_mod, p384_mp_mod); + /* t1 = Y^4 */ + sp_384_mont_sqr_6(t1, t1, p384_mod, p384_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_384_mont_mul_6(w, w, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_6(y, b, x, p384_mod); - sp_384_mont_mul_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_6(y, y, p384_mod); - sp_384_mont_sub_6(y, y, t2, p384_mod); + sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(y, y, t1, p384_mod); /* Y = Y/2 */ sp_384_div2_6(r[j].y, y, p384_mod); @@ -27066,30 +27176,30 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; sp_digit* t6 = t + 10*6; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, xa, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, za, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, za, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, t3, ya, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ @@ -27100,30 +27210,30 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_384_mont_sub_6(t4, t4, t3, p384_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_384_mont_mul_6(za, za, q->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(za, za, t2, p384_mod, p384_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(xa, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(xs, t6, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(y, t1, t5, p384_mod, 
p384_mp_mod); + sp_384_mont_mul_6(ya, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(x, x, t5, p384_mod); + sp_384_mont_sub_6(xa, xa, t5, p384_mod); sp_384_mont_sub_6(xs, xs, t5, p384_mod); - sp_384_mont_dbl_6(t1, y, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_dbl_6(t1, ya, p384_mod); + sp_384_mont_sub_6(xa, xa, t1, p384_mod); sp_384_mont_sub_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_6(ys, y, xs, p384_mod); - sp_384_mont_sub_6(y, y, x, p384_mod); - sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_lower_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_lower_6(ya, ya, xa, p384_mod); + sp_384_mont_mul_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_6(ys, ys, t6, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(y, y, t5, p384_mod); + sp_384_mont_sub_6(ya, ya, t5, p384_mod); sp_384_mont_sub_6(ys, ys, t5, p384_mod); } @@ -27527,7 +27637,7 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p, (sp_digit)1 : (sp_digit)0)); sp_384_norm_6(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -27536,6 +27646,7 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p, #define sp_384_mont_dbl_avx2_6 sp_384_mont_dbl_6 #define sp_384_mont_tpl_avx2_6 sp_384_mont_tpl_6 #define sp_384_mont_sub_avx2_6 sp_384_mont_sub_6 +#define sp_384_mont_sub_lower_avx2_6 sp_384_mont_sub_lower_6 extern void sp_384_div2_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. * @@ -27652,7 +27763,7 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 16: /* Y = Y - X */ - sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -27678,7 +27789,8 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p, sp_digit* t) +static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*6; @@ -27725,13 +27837,15 @@ static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_avx2_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_avx2_6(y, y, x, p384_mod); + sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_avx2_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ sp_384_mont_sub_avx2_6(y, y, t2, p384_mod); } +#define sp_384_mont_dbl_lower_avx2_6 sp_384_mont_dbl_lower_6 +#define sp_384_mont_tpl_lower_avx2_6 sp_384_mont_tpl_lower_6 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
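Several hunks above swap sp_384_mont_sub_6 / sp_384_mont_dbl_6 / sp_384_mont_tpl_6 for *_lower variants, which on this target are plain #define aliases of the existing assembly routines; the distinct names appear to mark call sites whose operands are already fully reduced, where a cheaper final reduction would be acceptable on platforms that provide one. For readers unfamiliar with how such a modular subtraction stays constant time, here is a plain-C sketch; it is illustrative only (the real sp_384_mont_sub_6 is hand-written assembly whose exact contract is not visible in this diff), assumes a and b are both below the modulus m, and relies on the GCC/Clang unsigned __int128 extension.

#include <stdint.h>

/* r = a - b (mod m) over 6 64-bit limbs, with no data-dependent branch:
 * subtract with borrow, then add m back under a mask derived from the
 * final borrow. */
static void mod_sub_6(uint64_t r[6], const uint64_t a[6],
                      const uint64_t b[6], const uint64_t m[6])
{
    unsigned __int128 acc;
    uint64_t borrow = 0;
    uint64_t carry = 0;
    uint64_t mask;
    int i;

    for (i = 0; i < 6; i++) {
        acc = (unsigned __int128)a[i] - b[i] - borrow;
        r[i] = (uint64_t)acc;
        borrow = (uint64_t)(acc >> 64) & 1;      /* 1 if the limb wrapped */
    }
    mask = 0 - borrow;                           /* all ones when a < b */
    for (i = 0; i < 6; i++) {
        acc = (unsigned __int128)r[i] + (m[i] & mask) + carry;
        r[i] = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
}

The masked add of m replaces a data-dependent "if (borrow) add m" correction, which is the property the mont_sub family needs for timing resistance.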
@@ -27770,7 +27884,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int n, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); @@ -27778,9 +27892,12 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int n, sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_avx2_6(t2, b, p384_mod); sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_avx2_6(t1, t1, p384_mod, p384_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -27790,16 +27907,14 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int n, sp_384_mont_mul_avx2_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_avx2_6(y, b, x, p384_mod); - sp_384_mont_mul_avx2_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_avx2_6(y, y, p384_mod); + sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(y, y, t1, p384_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); @@ -27807,14 +27922,15 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int n, sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod); sp_384_mont_dbl_avx2_6(t2, b, p384_mod); sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(z, z, y, p384_mod, p384_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_384_mont_sqr_avx2_6(t1, t1, p384_mod, p384_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_avx2_6(y, b, x, p384_mod); - sp_384_mont_mul_avx2_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_avx2_6(y, y, p384_mod); + sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(y, y, t1, p384_mod); #endif /* Y = Y/2 */ @@ -27840,6 +27956,7 @@ typedef struct sp_384_proj_point_add_avx2_6_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -27868,6 +27985,10 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r ctx->t3 = t + 4*6; ctx->t4 = t + 6*6; ctx->t5 = t + 8*6; + ctx->t6 = t + 10*6; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -27892,29 +28013,6 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_384)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = 
ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -27928,16 +28026,16 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r ctx->state = 6; break; case 6: - sp_384_mont_mul_avx2_6(ctx->t1, ctx->t1, ctx->x, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_avx2_6(ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 8; break; case 8: - sp_384_mont_mul_avx2_6(ctx->t4, ctx->t2, ctx->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod); ctx->state = 9; break; case 9: @@ -27946,7 +28044,7 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 10: /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_avx2_6(ctx->t3, ctx->t3, ctx->y, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod); ctx->state = 11; break; case 11: @@ -27965,29 +28063,29 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_avx2_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_384_mont_sqr_avx2_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 15; break; case 15: - sp_384_mont_mul_avx2_6(ctx->z, ctx->z, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_avx2_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 17; break; case 17: - sp_384_mont_sqr_avx2_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_avx2_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod); ctx->state = 18; break; case 18: - sp_384_mont_mul_avx2_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod); ctx->state = 19; break; case 19: - sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod); ctx->state = 20; break; case 20: @@ -27995,24 +28093,24 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r ctx->state = 21; break; case 21: - sp_384_mont_dbl_avx2_6(ctx->t1, ctx->y, p384_mod); + sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); ctx->state = 22; break; case 22: - sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t1, p384_mod); + sp_384_mont_dbl_avx2_6(ctx->t3, ctx->y, p384_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t3, p384_mod); ctx->state = 24; break; case 24: - sp_384_mont_mul_avx2_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 25; break; case 25: - sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod); 
ctx->state = 26; break; case 26: @@ -28020,9 +28118,30 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -28034,24 +28153,13 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r static void sp_384_proj_point_add_avx2_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_384* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -28061,60 +28169,61 @@ static void sp_384_proj_point_add_avx2_6(sp_point_384* r, sp_384_proj_point_dbl_avx2_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_avx2_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t1, t1, p->x, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_avx2_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_avx2_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t3, t3, p->y, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ sp_384_mont_sub_avx2_6(t2, t2, t1, p384_mod); /* R = S2 - S1 */ sp_384_mont_sub_avx2_6(t4, t4, t3, p384_mod); - /* Z3 = H*Z1*Z2 */ - sp_384_mont_mul_avx2_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(z, z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_avx2_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(y, t1, t5, p384_mod, 
p384_mp_mod); sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_384_mont_mul_avx2_6(z, p->z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(z, z, q->z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(x, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(x, x, t5, p384_mod); - sp_384_mont_dbl_avx2_6(t1, y, p384_mod); - sp_384_mont_sub_avx2_6(x, x, t1, p384_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_avx2_6(y, y, x, p384_mod); - sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t5, t5, t3, p384_mod, p384_mp_mod); + sp_384_mont_dbl_avx2_6(t3, y, p384_mod); + sp_384_mont_sub_avx2_6(x, x, t3, p384_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod); + sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(y, y, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -28161,29 +28270,30 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ - sp_384_mont_sqr_avx2_6(t2, y, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(b, t2, x, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_avx2_6(t1, b, p384_mod); - sp_384_mont_sub_avx2_6(x, x, t1, p384_mod); + sp_384_mont_dbl_avx2_6(t2, b, p384_mod); + sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); + /* b = 2.(B - X) */ + sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_384_mont_sqr_avx2_6(t2, t2, p384_mod, p384_mp_mod); + /* t1 = Y^4 */ + sp_384_mont_sqr_avx2_6(t1, t1, p384_mod, p384_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_384_mont_mul_avx2_6(w, w, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(w, w, t1, p384_mod, p384_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_384_mont_sub_avx2_6(y, b, x, p384_mod); - sp_384_mont_mul_avx2_6(y, y, a, p384_mod, p384_mp_mod); - sp_384_mont_dbl_avx2_6(y, y, p384_mod); - sp_384_mont_sub_avx2_6(y, y, t2, p384_mod); + sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod); + sp_384_mont_sub_avx2_6(y, y, t1, p384_mod); /* Y = Y/2 */ sp_384_div2_avx2_6(r[j].y, y, p384_mod); @@ -28209,30 +28319,30 @@ static void sp_384_proj_point_add_sub_avx2_6(sp_point_384* ra, sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; sp_digit* t6 = t + 10*6; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, 
p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_avx2_6(t1, q->z, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t3, t1, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t1, t1, x, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t1, t1, xa, p384_mod, p384_mp_mod); /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_avx2_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(t2, za, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t4, t2, za, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S1 = Y1*Z2^3 */ - sp_384_mont_mul_avx2_6(t3, t3, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t3, t3, ya, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - U1 */ @@ -28243,30 +28353,30 @@ static void sp_384_proj_point_add_sub_avx2_6(sp_point_384* ra, sp_384_mont_sub_avx2_6(t4, t4, t3, p384_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_384_mont_mul_avx2_6(z, z, q->z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(z, z, t2, p384_mod, p384_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_384_mont_mul_avx2_6(za, za, q->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(za, za, t2, p384_mod, p384_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_384_mont_sqr_avx2_6(x, t4, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(xa, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_avx2_6(xs, t6, p384_mod, p384_mp_mod); sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(y, t1, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(ya, t1, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_avx2_6(x, x, t5, p384_mod); + sp_384_mont_sub_avx2_6(xa, xa, t5, p384_mod); sp_384_mont_sub_avx2_6(xs, xs, t5, p384_mod); - sp_384_mont_dbl_avx2_6(t1, y, p384_mod); - sp_384_mont_sub_avx2_6(x, x, t1, p384_mod); + sp_384_mont_dbl_avx2_6(t1, ya, p384_mod); + sp_384_mont_sub_avx2_6(xa, xa, t1, p384_mod); sp_384_mont_sub_avx2_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_avx2_6(ys, y, xs, p384_mod); - sp_384_mont_sub_avx2_6(y, y, x, p384_mod); - sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_lower_avx2_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_lower_avx2_6(ya, ya, xa, p384_mod); + sp_384_mont_mul_avx2_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_avx2_6(ys, ys, t6, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t5, t5, t3, p384_mod, p384_mp_mod); - sp_384_mont_sub_avx2_6(y, y, t5, p384_mod); + sp_384_mont_sub_avx2_6(ya, ya, t5, p384_mod); sp_384_mont_sub_avx2_6(ys, ys, t5, p384_mod); } @@ -28436,17 +28546,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -28456,53 +28561,54 @@ static void 
sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, sp_384_proj_point_dbl_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_6(t2, t2, x, p384_mod); + sp_384_mont_sub_6(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_6(t4, t4, y, p384_mod); + sp_384_mont_sub_6(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_6(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_6(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_6(x, t1, t5, p384_mod); sp_384_mont_dbl_6(t1, t3, p384_mod); sp_384_mont_sub_6(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_6(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_6(t3, t3, x, p384_mod); sp_384_mont_mul_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -28900,17 +29006,12 @@ static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - const sp_point_384* ap[2]; - sp_point_384* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*6; sp_digit* t3 = t + 4*6; sp_digit* t4 = t + 6*6; sp_digit* t5 = t + 8*6; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*6; /* Check double */ (void)sp_384_sub_6(t1, p384_mod, q->y); @@ -28920,53 +29021,54 @@ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384 sp_384_proj_point_dbl_avx2_6(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_384*)t; /*lint !e9087 !e740*/ - 
XMEMSET(rp[1], 0, sizeof(sp_point_384)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<6; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<6; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<6; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_384_mont_sqr_avx2_6(t2, z, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t4, t2, z, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod); /* S2 = Y2*Z1^3 */ sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod); /* H = U2 - X1 */ - sp_384_mont_sub_avx2_6(t2, t2, x, p384_mod); + sp_384_mont_sub_avx2_6(t2, t2, p->x, p384_mod); /* R = S2 - Y1 */ - sp_384_mont_sub_avx2_6(t4, t4, y, p384_mod); + sp_384_mont_sub_avx2_6(t4, t4, p->y, p384_mod); /* Z3 = H*Z1 */ - sp_384_mont_mul_avx2_6(z, z, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_384_mont_sqr_avx2_6(t1, t4, p384_mod, p384_mp_mod); sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t3, x, t5, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t3, p->x, t5, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(x, t1, t5, p384_mod); sp_384_mont_dbl_avx2_6(t1, t3, p384_mod); sp_384_mont_sub_avx2_6(x, x, t1, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_avx2_6(t3, t3, x, p384_mod); + sp_384_mont_sub_lower_avx2_6(t3, t3, x, p384_mod); sp_384_mont_mul_avx2_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t5, t5, y, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t5, t5, p->y, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(y, t3, t5, p384_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -49443,7 +49545,7 @@ typedef struct sp_ecc_verify_384_ctx { sp_digit u1[2*6]; sp_digit u2[2*6]; sp_digit s[2*6]; - sp_digit tmp[2*6 * 5]; + sp_digit tmp[2*6 * 6]; sp_point_384 p1; sp_point_384 p2; } sp_ecc_verify_384_ctx; @@ -49589,7 +49691,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_384* p1 = NULL; #else - sp_digit u1[16 * 6]; + sp_digit u1[18 * 6]; sp_point_384 p1[2]; #endif sp_digit* u2 = NULL; @@ -49611,7 +49713,7 @@ int sp_ecc_verify_384(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 6, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 6, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -49944,7 +50046,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_384* p = NULL; #else 
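The *_qz1_* routines shown above handle the case where the second point comes from a precomputed table and is stored with Z2 = 1: the general formulas then collapse to U1 = X1, S1 = Y1 and Z3 = H*Z1, so the corresponding squarings and multiplications are skipped entirely. The sketch below writes those specialized formulas in plain C over a toy 64-bit prime purely to make the algebra easy to follow; mod_add/mod_sub/mod_mul and the prime are illustrative helpers, not wolfSSL APIs, and there is no Montgomery form, no constant-time handling and no doubling/infinity check here.

#include <stdint.h>

#define P 0xffffffff00000001ULL            /* toy prime, illustration only */

static uint64_t mod_add(uint64_t a, uint64_t b) {
    return (uint64_t)(((unsigned __int128)a + b) % P);
}
static uint64_t mod_sub(uint64_t a, uint64_t b) {
    return (uint64_t)(((unsigned __int128)P + a - b) % P);
}
static uint64_t mod_mul(uint64_t a, uint64_t b) {
    return (uint64_t)(((unsigned __int128)a * b) % P);
}

/* Jacobian P1 + Q with Q affine (Z2 == 1), following the qz1 code path:
 * H = U2 - X1, R = S2 - Y1, Z3 = H*Z1,
 * X3 = R^2 - H^3 - 2*X1*H^2, Y3 = R*(X1*H^2 - X3) - Y1*H^3. */
static void jacobian_add_affine(uint64_t r[3],
                                const uint64_t p1[3],  /* X1, Y1, Z1 */
                                const uint64_t q[2])   /* X2, Y2 */
{
    uint64_t z1z1 = mod_mul(p1[2], p1[2]);                /* Z1^2          */
    uint64_t u2   = mod_mul(q[0], z1z1);                  /* U2 = X2*Z1^2  */
    uint64_t s2   = mod_mul(q[1], mod_mul(z1z1, p1[2]));  /* S2 = Y2*Z1^3  */
    uint64_t h    = mod_sub(u2, p1[0]);                   /* U1 == X1      */
    uint64_t rr   = mod_sub(s2, p1[1]);                   /* S1 == Y1      */
    uint64_t hh   = mod_mul(h, h);
    uint64_t hhh  = mod_mul(hh, h);
    uint64_t v    = mod_mul(p1[0], hh);                   /* X1*H^2        */

    r[0] = mod_sub(mod_sub(mod_mul(rr, rr), hhh), mod_add(v, v));
    r[1] = mod_sub(mod_mul(rr, mod_sub(v, r[0])), mod_mul(p1[1], hhh));
    r[2] = mod_mul(h, p1[2]);
}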
- sp_digit tmp[2 * 6 * 5]; + sp_digit tmp[2 * 6 * 6]; sp_point_384 p[2]; #endif sp_point_384* q = NULL; @@ -49961,7 +50063,7 @@ int sp_ecc_proj_add_point_384(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 6 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 6 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -50829,9 +50931,8 @@ extern sp_int64 sp_521_cmp_9(const sp_digit* a, const sp_digit* b); #define sp_521_norm_9(a) extern sp_digit sp_521_cond_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); -#define sp_521_mont_reduce_order_9 sp_521_mont_reduce_9 - extern void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m, sp_digit mp); +extern void sp_521_mont_reduce_order_9(sp_digit* a, const sp_digit* m, sp_digit mp); /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -50870,7 +50971,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_9(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -50879,6 +50980,7 @@ extern void sp_521_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, extern void sp_521_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 extern void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. * @@ -50995,7 +51097,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -51021,7 +51123,8 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*9; @@ -51068,13 +51171,15 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, sp_d /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_9(y, y, x, p521_mod); + sp_521_mont_sub_lower_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ sp_521_mont_sub_9(y, y, t2, p521_mod); } +#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 +#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
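The size bumps above (tmp[2*6*5] -> tmp[2*6*6], u1[16*6] -> u1[18*6] and the matching XMALLOC) follow from the extra t6 temporary the point-add routines now carve out of t: with six temporaries of 2*6 digits each, the scratch area runs t1 = t, t2 = t + 2*6, ..., t6 = t + 10*6, i.e. 12*6 digits in total. The 18*6 figure is consistent with u1 sharing one allocation with u2, s and that tmp area (3 * 2*6 + 2*6*6 = 18*6), although the exact layout is not visible in this excerpt. A small compile-time check like the following (hypothetical, not part of the patch) makes the arithmetic explicit.

/* Illustrative only: mirrors the t1..t6 carving in the point-add code. */
#define SP_384_WORDS    6
#define SP_TEMP_DIGITS  (2 * SP_384_WORDS)                 /* each tN: 2*6  */
#define SP_ADD_TEMPS    6                                  /* t1..t6        */
#define SP_ADD_SCRATCH  (SP_TEMP_DIGITS * SP_ADD_TEMPS)    /* == 2*6*6      */

/* Fails to compile if t6 (placed at t + 10*6) would overrun the scratch. */
typedef char sp_add_scratch_check[
    (10 * SP_384_WORDS + SP_TEMP_DIGITS <= SP_ADD_SCRATCH) ? 1 : -1];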
@@ -51113,7 +51218,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -51121,9 +51226,12 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -51133,16 +51241,14 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -51150,14 +51256,15 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int n, sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -51197,6 +51304,7 @@ typedef struct sp_521_proj_point_add_9_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -51225,6 +51333,10 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->t3 = t + 4*9; ctx->t4 = t + 6*9; ctx->t5 = t + 8*9; + ctx->t6 = t + 10*9; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -51249,29 +51361,6 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - 
ctx->state = 4; break; } @@ -51285,16 +51374,16 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 6; break; case 6: - sp_521_mont_mul_9(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_9(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -51303,7 +51392,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -51322,29 +51411,29 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_9(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -51352,24 +51441,24 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 21; break; case 21: - sp_521_mont_dbl_9(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -51377,9 +51466,30 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & 
maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -51391,24 +51501,13 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; - /* Ensure only the first point is the same as the result. */ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -51418,60 +51517,61 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_9(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, x, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); 
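The *_nb variants above are the WOLFSSL_SP_NONBLOCK form of the same addition: each call performs one field operation, records where it stopped in ctx->state, and returns FP_WOULDBLOCK until the last state is reached, which is why the reshuffled formula appears as renumbered case labels rather than a straight-line rewrite. The fragment below is a generic sketch of that resumption pattern with invented names (step_ctx, do_work, WOULD_BLOCK); it is not wolfSSL API, just the shape of the technique.

#include <stdio.h>

#define WOULD_BLOCK  1        /* stand-in for FP_WOULDBLOCK */
#define DONE         0        /* stand-in for MP_OKAY       */

typedef struct step_ctx {
    int state;                /* next case label to run     */
    int acc;                  /* running result             */
} step_ctx;

/* One small step per call; the caller keeps invoking until DONE. */
static int do_work(step_ctx *ctx, int input)
{
    switch (ctx->state) {
    case 0:
        ctx->acc = input;
        ctx->state = 1;
        break;
    case 1:
        ctx->acc *= 2;        /* stands in for one modular operation */
        ctx->state = 2;
        break;
    case 2:
        ctx->acc += 1;        /* final step */
        ctx->state = 3;
        break;
    }
    return (ctx->state == 3) ? DONE : WOULD_BLOCK;
}

int main(void)
{
    step_ctx ctx = { 0, 0 };
    while (do_work(&ctx, 20) == WOULD_BLOCK) {
        /* the caller may service other work between steps */
    }
    printf("%d\n", ctx.acc);  /* prints 41 */
    return 0;
}

In the real code the last state also performs the constant-time mask combine and sets err = MP_OKAY, matching the blocking version of the routine.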
sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_9(t3, y, p521_mod); + sp_521_mont_sub_9(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -51518,29 +51618,30 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_9(a, t1, p521_mod); /* B = X*Y^2 */ - sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(b, t2, x, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(t1, b, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t2, b, p521_mod); + sp_521_mont_sub_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_521_mont_sqr_9(t2, t2, p521_mod, p521_mp_mod); + /* t1 = Y^4 */ + sp_521_mont_sqr_9(t1, t1, p521_mod, p521_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_521_mont_mul_9(w, w, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_9(y, b, x, p521_mod); - sp_521_mont_mul_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_9(y, y, p521_mod); - sp_521_mont_sub_9(y, y, t2, p521_mod); + sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, y, t1, p521_mod); /* Y = Y/2 */ sp_521_div2_9(r[j].y, y, p521_mod); @@ -51566,30 +51667,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; sp_digit* t6 = t + 10*9; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, xa, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, za, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, za, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ 
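The repeated-doubling hunks above now compute b = 2*(B - X) first (via the *_lower sub and dbl, which are aliases on this target) and only then multiply by A, instead of forming (B - X)*A and doubling the product; the result is unchanged because 2*((B - X)*A) = (2*(B - X))*A. The snippet below merely checks that identity with toy modular arithmetic; the helper names and the small prime are illustrative, not taken from the code.

#include <assert.h>
#include <stdint.h>

#define M 1000003ULL   /* small prime, illustration only */

static uint64_t mul_m(uint64_t a, uint64_t b) { return (a * b) % M; }
static uint64_t sub_m(uint64_t a, uint64_t b) { return (a + M - b) % M; }
static uint64_t dbl_m(uint64_t a)             { return (a + a) % M; }

int main(void)
{
    uint64_t A = 123456, B = 654321, X = 777, Y4 = 424242;

    /* previous order: y = 2*((B - X) * A) - Y^4 */
    uint64_t y_old = sub_m(dbl_m(mul_m(sub_m(B, X), A)), Y4);

    /* patched order:  b' = 2*(B - X); y = b'*A - Y^4 */
    uint64_t y_new = sub_m(mul_m(dbl_m(sub_m(B, X)), A), Y4);

    assert(y_old == y_new);
    return 0;
}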
- sp_521_mont_mul_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, t3, ya, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ @@ -51600,30 +51701,30 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(t4, t4, t3, p521_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(z, z, t2, p521_mod, p521_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_521_mont_mul_9(za, za, q->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(za, za, t2, p521_mod, p521_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(xa, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(xs, t6, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(ya, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, x, t5, p521_mod); + sp_521_mont_sub_9(xa, xa, t5, p521_mod); sp_521_mont_sub_9(xs, xs, t5, p521_mod); - sp_521_mont_dbl_9(t1, y, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_dbl_9(t1, ya, p521_mod); + sp_521_mont_sub_9(xa, xa, t1, p521_mod); sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_9(ys, y, xs, p521_mod); - sp_521_mont_sub_9(y, y, x, p521_mod); - sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, y, t5, p521_mod); + sp_521_mont_sub_9(ya, ya, t5, p521_mod); sp_521_mont_sub_9(ys, ys, t5, p521_mod); } @@ -51951,9 +52052,9 @@ static void sp_521_mont_inv_avx2_9(sp_digit* r, const sp_digit* a, sp_digit* td) } extern sp_digit sp_521_cond_sub_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m); -#define sp_521_mont_reduce_order_avx2_9 sp_521_mont_reduce_avx2_9 +#define sp_521_mont_reduce_avx2_9 sp_521_mont_reduce_9 -extern void sp_521_mont_reduce_avx2_9(sp_digit* a, const sp_digit* m, sp_digit mp); +extern void sp_521_mont_reduce_order_avx2_9(sp_digit* a, const sp_digit* m, sp_digit mp); /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -51992,7 +52093,7 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p, (sp_digit)1 : (sp_digit)0)); sp_521_norm_9(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -52001,6 +52102,7 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p, #define sp_521_mont_dbl_avx2_9 sp_521_mont_dbl_9 #define sp_521_mont_tpl_avx2_9 sp_521_mont_tpl_9 #define sp_521_mont_sub_avx2_9 sp_521_mont_sub_9 +#define sp_521_mont_sub_lower_avx2_9 sp_521_mont_sub_avx2_9 extern void sp_521_div2_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. 
* @@ -52117,7 +52219,7 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 16: /* Y = Y - X */ - sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -52143,7 +52245,8 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p, sp_digit* t) +static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*9; @@ -52190,13 +52293,15 @@ static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_avx2_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_avx2_9(y, y, x, p521_mod); + sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_avx2_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ sp_521_mont_sub_avx2_9(y, y, t2, p521_mod); } +#define sp_521_mont_dbl_lower_avx2_9 sp_521_mont_dbl_avx2_9 +#define sp_521_mont_tpl_lower_avx2_9 sp_521_mont_tpl_avx2_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -52235,7 +52340,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int n, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); @@ -52243,9 +52348,12 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int n, sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_avx2_9(t2, b, p521_mod); sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_521_mont_sqr_avx2_9(t1, t1, p521_mod, p521_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -52255,16 +52363,14 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int n, sp_521_mont_mul_avx2_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_avx2_9(y, b, x, p521_mod); - sp_521_mont_mul_avx2_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_avx2_9(y, y, p521_mod); + sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(y, y, t1, p521_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); @@ -52272,14 +52378,15 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int n, sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod); sp_521_mont_dbl_avx2_9(t2, b, p521_mod); sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(z, z, y, p521_mod, p521_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ 
sp_521_mont_sqr_avx2_9(t1, t1, p521_mod, p521_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_avx2_9(y, b, x, p521_mod); - sp_521_mont_mul_avx2_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_avx2_9(y, y, p521_mod); + sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(y, y, t1, p521_mod); #endif /* Y = Y/2 */ @@ -52305,6 +52412,7 @@ typedef struct sp_521_proj_point_add_avx2_9_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -52333,6 +52441,10 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r ctx->t3 = t + 4*9; ctx->t4 = t + 6*9; ctx->t5 = t + 8*9; + ctx->t6 = t + 10*9; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -52357,29 +52469,6 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_521)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -52393,16 +52482,16 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r ctx->state = 6; break; case 6: - sp_521_mont_mul_avx2_9(ctx->t1, ctx->t1, ctx->x, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_avx2_9(ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 8; break; case 8: - sp_521_mont_mul_avx2_9(ctx->t4, ctx->t2, ctx->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod); ctx->state = 9; break; case 9: @@ -52411,7 +52500,7 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 10: /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_avx2_9(ctx->t3, ctx->t3, ctx->y, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod); ctx->state = 11; break; case 11: @@ -52430,29 +52519,29 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_avx2_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_521_mont_sqr_avx2_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 15; break; case 15: - sp_521_mont_mul_avx2_9(ctx->z, ctx->z, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_avx2_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 17; break; case 17: - sp_521_mont_sqr_avx2_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_avx2_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod); ctx->state = 18; break; case 18: - sp_521_mont_mul_avx2_9(ctx->y, ctx->t1, 
ctx->t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod); ctx->state = 19; break; case 19: - sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 20; break; case 20: @@ -52460,24 +52549,24 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r ctx->state = 21; break; case 21: - sp_521_mont_dbl_avx2_9(ctx->t1, ctx->y, p521_mod); + sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); ctx->state = 22; break; case 22: - sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t1, p521_mod); + sp_521_mont_dbl_avx2_9(ctx->t3, ctx->y, p521_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t3, p521_mod); ctx->state = 24; break; case 24: - sp_521_mont_mul_avx2_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 25; break; case 25: - sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod); ctx->state = 26; break; case 26: @@ -52485,9 +52574,30 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -52499,24 +52609,13 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r static void sp_521_proj_point_add_avx2_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_521* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -52526,60 +52625,61 @@ static void sp_521_proj_point_add_avx2_9(sp_point_521* r, sp_521_proj_point_dbl_avx2_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_avx2_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t1, t1, p->x, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_avx2_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_avx2_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t3, t3, p->y, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ sp_521_mont_sub_avx2_9(t2, t2, t1, p521_mod); /* R = S2 - S1 */ sp_521_mont_sub_avx2_9(t4, t4, t3, p521_mod); - /* Z3 = H*Z1*Z2 */ - sp_521_mont_mul_avx2_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(z, z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_avx2_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(y, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_521_mont_mul_avx2_9(z, p->z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(z, z, q->z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(x, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(x, x, t5, p521_mod); - sp_521_mont_dbl_avx2_9(t1, y, p521_mod); - sp_521_mont_sub_avx2_9(x, x, t1, p521_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_avx2_9(y, y, x, p521_mod); - sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t5, t5, t3, p521_mod, p521_mp_mod); + sp_521_mont_dbl_avx2_9(t3, y, p521_mod); + sp_521_mont_sub_avx2_9(x, x, t3, p521_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod); + sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(y, y, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = 
p->infinity & q->infinity; } } @@ -52626,29 +52726,30 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ - sp_521_mont_sqr_avx2_9(t2, y, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(b, t2, x, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_avx2_9(t1, b, p521_mod); - sp_521_mont_sub_avx2_9(x, x, t1, p521_mod); + sp_521_mont_dbl_avx2_9(t2, b, p521_mod); + sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); + /* b = 2.(B - X) */ + sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_521_mont_sqr_avx2_9(t2, t2, p521_mod, p521_mp_mod); + /* t1 = Y^4 */ + sp_521_mont_sqr_avx2_9(t1, t1, p521_mod, p521_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_521_mont_mul_avx2_9(w, w, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(w, w, t1, p521_mod, p521_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_521_mont_sub_avx2_9(y, b, x, p521_mod); - sp_521_mont_mul_avx2_9(y, y, a, p521_mod, p521_mp_mod); - sp_521_mont_dbl_avx2_9(y, y, p521_mod); - sp_521_mont_sub_avx2_9(y, y, t2, p521_mod); + sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod); + sp_521_mont_sub_avx2_9(y, y, t1, p521_mod); /* Y = Y/2 */ sp_521_div2_avx2_9(r[j].y, y, p521_mod); @@ -52674,30 +52775,30 @@ static void sp_521_proj_point_add_sub_avx2_9(sp_point_521* ra, sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; sp_digit* t6 = t + 10*9; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_avx2_9(t1, q->z, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t3, t1, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t1, t1, x, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t1, t1, xa, p521_mod, p521_mp_mod); /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_avx2_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(t2, za, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t4, t2, za, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S1 = Y1*Z2^3 */ - sp_521_mont_mul_avx2_9(t3, t3, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t3, t3, ya, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - U1 */ @@ -52708,30 +52809,30 @@ static void sp_521_proj_point_add_sub_avx2_9(sp_point_521* ra, sp_521_mont_sub_avx2_9(t4, t4, t3, p521_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_521_mont_mul_avx2_9(z, z, q->z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(z, z, t2, p521_mod, p521_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_521_mont_mul_avx2_9(za, za, q->z, p521_mod, 
p521_mp_mod); + sp_521_mont_mul_avx2_9(za, za, t2, p521_mod, p521_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_521_mont_sqr_avx2_9(x, t4, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(xa, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_avx2_9(xs, t6, p521_mod, p521_mp_mod); sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(y, t1, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(ya, t1, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_avx2_9(x, x, t5, p521_mod); + sp_521_mont_sub_avx2_9(xa, xa, t5, p521_mod); sp_521_mont_sub_avx2_9(xs, xs, t5, p521_mod); - sp_521_mont_dbl_avx2_9(t1, y, p521_mod); - sp_521_mont_sub_avx2_9(x, x, t1, p521_mod); + sp_521_mont_dbl_avx2_9(t1, ya, p521_mod); + sp_521_mont_sub_avx2_9(xa, xa, t1, p521_mod); sp_521_mont_sub_avx2_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_avx2_9(ys, y, xs, p521_mod); - sp_521_mont_sub_avx2_9(y, y, x, p521_mod); - sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_lower_avx2_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_lower_avx2_9(ya, ya, xa, p521_mod); + sp_521_mont_mul_avx2_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_avx2_9(ys, ys, t6, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t5, t5, t3, p521_mod, p521_mp_mod); - sp_521_mont_sub_avx2_9(y, y, t5, p521_mod); + sp_521_mont_sub_avx2_9(ya, ya, t5, p521_mod); sp_521_mont_sub_avx2_9(ys, ys, t5, p521_mod); } @@ -52901,17 +53002,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -52921,53 +53017,54 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, sp_521_proj_point_dbl_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_9(t2, t2, x, p521_mod); + sp_521_mont_sub_9(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_9(t4, t4, y, p521_mod); + sp_521_mont_sub_9(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_9(z, z, t2, 
p521_mod, p521_mp_mod); + sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_9(x, t1, t5, p521_mod); sp_521_mont_dbl_9(t1, t3, p521_mod); sp_521_mont_sub_9(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_9(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -53113,7 +53210,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 9 * 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_521* p = NULL; int i; @@ -53134,7 +53231,7 @@ static int sp_521_ecc_mulmod_stripe_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -53365,17 +53462,12 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - const sp_point_521* ap[2]; - sp_point_521* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*9; sp_digit* t3 = t + 4*9; sp_digit* t4 = t + 6*9; sp_digit* t5 = t + 8*9; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*9; /* Check double */ (void)sp_521_sub_9(t1, p521_mod, q->y); @@ -53385,53 +53477,54 @@ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521 sp_521_proj_point_dbl_avx2_9(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_521*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_521)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<9; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<9; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<9; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_521_mont_sqr_avx2_9(t2, z, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t4, t2, z, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t2, t2, q->x, 
p521_mod, p521_mp_mod); /* S2 = Y2*Z1^3 */ sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod); /* H = U2 - X1 */ - sp_521_mont_sub_avx2_9(t2, t2, x, p521_mod); + sp_521_mont_sub_avx2_9(t2, t2, p->x, p521_mod); /* R = S2 - Y1 */ - sp_521_mont_sub_avx2_9(t4, t4, y, p521_mod); + sp_521_mont_sub_avx2_9(t4, t4, p->y, p521_mod); /* Z3 = H*Z1 */ - sp_521_mont_mul_avx2_9(z, z, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_521_mont_sqr_avx2_9(t1, t4, p521_mod, p521_mp_mod); sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t3, x, t5, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t3, p->x, t5, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(x, t1, t5, p521_mod); sp_521_mont_dbl_avx2_9(t1, t3, p521_mod); sp_521_mont_sub_avx2_9(x, x, t1, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_avx2_9(t3, t3, x, p521_mod); + sp_521_mont_sub_lower_avx2_9(t3, t3, x, p521_mod); sp_521_mont_mul_avx2_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t5, t5, y, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t5, t5, p->y, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(y, t3, t5, p521_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -53575,7 +53668,7 @@ static int sp_521_ecc_mulmod_stripe_avx2_9(sp_point_521* r, const sp_point_521* sp_digit* t = NULL; #else sp_point_521 rt[2]; - sp_digit t[2 * 9 * 5]; + sp_digit t[2 * 9 * 6]; #endif sp_point_521* p = NULL; int i; @@ -53596,7 +53689,7 @@ static int sp_521_ecc_mulmod_stripe_avx2_9(sp_point_521* r, const sp_point_521* if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -53803,7 +53896,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -53819,7 +53912,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), heap, + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -88306,7 +88399,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, sp_digit* tmp = NULL; #else sp_point_521 rt[2]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; #endif sp_point_521* p = NULL; sp_digit* negy = NULL; @@ -88325,7 +88418,7 @@ static int sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -88384,7 +88477,7 @@ static int 
sp_521_ecc_mulmod_add_only_9(sp_point_521* r, const sp_point_521* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -88440,7 +88533,7 @@ static int sp_521_ecc_mulmod_add_only_avx2_9(sp_point_521* r, const sp_point_521 sp_digit* tmp = NULL; #else sp_point_521 rt[2]; - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; #endif sp_point_521* p = NULL; sp_digit* negy = NULL; @@ -88459,7 +88552,7 @@ static int sp_521_ecc_mulmod_add_only_avx2_9(sp_point_521* r, const sp_point_521 if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -88518,7 +88611,7 @@ static int sp_521_ecc_mulmod_add_only_avx2_9(sp_point_521* r, const sp_point_521 if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 5); + ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6); #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC) XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -88629,7 +88722,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_521 point[2]; - sp_digit k[9 + 9 * 2 * 5]; + sp_digit k[9 + 9 * 2 * 6]; #endif sp_point_521* addP = NULL; sp_digit* tmp = NULL; @@ -88645,7 +88738,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (9 + 9 * 2 * 5), + sizeof(sp_digit) * (9 + 9 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; @@ -90144,7 +90237,7 @@ typedef struct sp_ecc_verify_521_ctx { sp_digit u1[2*9]; sp_digit u2[2*9]; sp_digit s[2*9]; - sp_digit tmp[2*9 * 5]; + sp_digit tmp[2*9 * 6]; sp_point_521 p1; sp_point_521 p2; } sp_ecc_verify_521_ctx; @@ -90293,7 +90386,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, sp_digit* u1 = NULL; sp_point_521* p1 = NULL; #else - sp_digit u1[16 * 9]; + sp_digit u1[18 * 9]; sp_point_521 p1[2]; #endif sp_digit* u2 = NULL; @@ -90315,7 +90408,7 @@ int sp_ecc_verify_521(const byte* hash, word32 hashLen, const mp_int* pX, err = MEMORY_E; } if (err == MP_OKAY) { - u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 9, heap, + u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 9, heap, DYNAMIC_TYPE_ECC); if (u1 == NULL) err = MEMORY_E; @@ -90652,7 +90745,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, sp_digit* tmp = NULL; sp_point_521* p = NULL; #else - sp_digit tmp[2 * 9 * 5]; + sp_digit tmp[2 * 9 * 6]; sp_point_521 p[2]; #endif sp_point_521* q = NULL; @@ -90669,7 +90762,7 @@ int sp_ecc_proj_add_point_521(mp_int* pX, mp_int* pY, mp_int* pZ, err = MEMORY_E; } if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 5, NULL, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, NULL, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; @@ -91678,7 +91771,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_16(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -91687,6 +91780,7 @@ extern void sp_1024_mont_add_16(sp_digit* r, const sp_digit* a, const sp_digit* extern void sp_1024_mont_dbl_16(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void 
sp_1024_mont_tpl_16(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +#define sp_1024_mont_sub_lower_16 sp_1024_mont_sub_16 extern void sp_1024_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. * @@ -91803,7 +91897,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -91829,7 +91923,8 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; @@ -91876,13 +91971,15 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_16(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ sp_1024_mont_sub_16(y, y, t2, p1024_mod); } +#define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16 +#define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -91921,7 +92018,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -91929,9 +92026,12 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -91941,16 +92041,14 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_mul_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -91958,14 +92056,15 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int n, sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_16(t2, b, 
p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -92009,6 +92108,7 @@ typedef struct sp_1024_proj_point_add_16_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -92037,6 +92137,10 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->t3 = t + 4*16; ctx->t4 = t + 6*16; ctx->t5 = t + 8*16; + ctx->t6 = t + 10*16; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -92061,29 +92165,6 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -92097,16 +92178,16 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 6; break; case 6: - sp_1024_mont_mul_16(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_16(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -92115,7 +92196,7 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -92134,29 +92215,29 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_16(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; 
break; case 17: - sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -92164,24 +92245,24 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 21; break; case 21: - sp_1024_mont_dbl_16(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -92189,9 +92270,30 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -92203,24 +92305,13 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, static void sp_1024_proj_point_add_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod); @@ -92230,60 +92321,61 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_16(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(x, x, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, y, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_16(y, y, x, p1024_mod); - sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_16(t3, y, p1024_mod); + sp_1024_mont_sub_16(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } 
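The hunks in this region all make the same structural change: the old rp[]/ap[] pointer tables indexed by p->infinity | q->infinity are removed, the sum is computed into the t6/t1/t2 scratch slots (which is why a t6 pointer appears and the temporary buffers grow from 2*N*5 to 2*N*6 digits), and the result is merged into r at the end with the maskp/maskq/maskt words. As a standalone sketch of that merge step -- not part of the patch; ct_point_select, sp_digit_t, WORDS and the 64-bit word width are illustrative assumptions, and the both-points-at-infinity handling (r->z[0] |= p->infinity & q->infinity) is left out -- the pattern is:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sp_digit_t;   /* stand-in for sp_digit; width assumed for illustration */
#define WORDS 16               /* one coordinate = 16 words in the sp_1024 functions above */

/* Branch-free merge of a point-add result:
 *   q at infinity (p finite)  -> result is p          (P + O = P)
 *   p at infinity (q finite)  -> result is q          (O + Q = Q)
 *   otherwise                 -> computed words in t
 * Every word of all three sources is read, so the infinity flags never
 * choose a pointer or a branch. */
static void ct_point_select(sp_digit_t* r, const sp_digit_t* p,
                            const sp_digit_t* q, const sp_digit_t* t,
                            int p_inf, int q_inf)
{
    sp_digit_t maskp = (sp_digit_t)0 - (sp_digit_t)(q_inf & (!p_inf));
    sp_digit_t maskq = (sp_digit_t)0 - (sp_digit_t)(p_inf & (!q_inf));
    sp_digit_t maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < WORDS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

int main(void)
{
    sp_digit_t p[WORDS] = {1}, q[WORDS] = {4}, t[WORDS] = {7};
    sp_digit_t r[WORDS];

    ct_point_select(r, p, q, t, 0, 1);              /* q at infinity */
    printf("%llu\n", (unsigned long long)r[0]);     /* prints 1 (p)  */
    ct_point_select(r, p, q, t, 0, 0);              /* both finite   */
    printf("%llu\n", (unsigned long long)r[0]);     /* prints 7 (t)  */
    return 0;
}

Compared with indexing rp[p->infinity | q->infinity], this form touches the same memory for every combination of infinity flags, which is presumably the motivation for the rewrite; dropping the old q == r swap also falls out of it, since r is no longer written until this final merge.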
@@ -92330,29 +92422,30 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); /* B = X*Y^2 */ - sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(b, t2, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(t1, b, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_16(t2, b, p1024_mod); + sp_1024_mont_sub_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_1024_mont_sqr_16(t2, t2, p1024_mod, p1024_mp_mod); + /* t1 = Y^4 */ + sp_1024_mont_sqr_16(t1, t1, p1024_mod, p1024_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_1024_mont_mul_16(w, w, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_16(y, b, x, p1024_mod); - sp_1024_mont_mul_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_16(y, y, p1024_mod); - sp_1024_mont_sub_16(y, y, t2, p1024_mod); + sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(y, y, t1, p1024_mod); /* Y = Y/2 */ sp_1024_div2_16(r[j].y, y, p1024_mod); @@ -92378,30 +92471,30 @@ static void sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; sp_digit* t6 = t + 10*16; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, xa, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, za, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, za, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, t3, ya, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ @@ -92412,30 +92505,30 @@ static void sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_1024_mont_sub_16(t4, t4, t3, p1024_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_1024_mont_mul_16(za, za, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(za, za, t2, p1024_mod, p1024_mp_mod); + XMEMCPY(zs, za, 
sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(xa, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(xs, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(ya, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(x, x, t5, p1024_mod); + sp_1024_mont_sub_16(xa, xa, t5, p1024_mod); sp_1024_mont_sub_16(xs, xs, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, y, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_16(t1, ya, p1024_mod); + sp_1024_mont_sub_16(xa, xa, t1, p1024_mod); sp_1024_mont_sub_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_16(ys, y, xs, p1024_mod); - sp_1024_mont_sub_16(y, y, x, p1024_mod); - sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_lower_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_lower_16(ya, ya, xa, p1024_mod); + sp_1024_mont_mul_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_16(ys, ys, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(y, y, t5, p1024_mod); + sp_1024_mont_sub_16(ya, ya, t5, p1024_mod); sp_1024_mont_sub_16(ys, ys, t5, p1024_mod); } @@ -92777,7 +92870,7 @@ static void sp_1024_map_avx2_16(sp_point_1024* r, const sp_point_1024* p, (sp_digit)1 : (sp_digit)0)); sp_1024_norm_16(r->y); - XMEMSET(r->z, 0, sizeof(r->z)); + XMEMSET(r->z, 0, sizeof(r->z) / 2); r->z[0] = 1; } @@ -92786,6 +92879,7 @@ extern void sp_1024_mont_add_avx2_16(sp_digit* r, const sp_digit* a, const sp_di extern void sp_1024_mont_dbl_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_1024_mont_tpl_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m); extern void sp_1024_mont_sub_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +#define sp_1024_mont_sub_lower_avx2_16 sp_1024_mont_sub_avx2_16 extern void sp_1024_div2_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit* m); /* Double the Montgomery form projective point p. 
* @@ -92902,7 +92996,7 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -92928,7 +93022,8 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 } #endif /* WOLFSSL_SP_NONBLOCK */ -static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024* p, sp_digit* t) +static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024* p, + sp_digit* t) { sp_digit* t1 = t; sp_digit* t2 = t + 2*16; @@ -92975,13 +93070,15 @@ static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024 /* X = X - Y */ sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_avx2_16(y, y, x, p1024_mod); + sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_avx2_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ sp_1024_mont_sub_avx2_16(y, y, t2, p1024_mod); } +#define sp_1024_mont_dbl_lower_avx2_16 sp_1024_mont_dbl_avx2_16 +#define sp_1024_mont_tpl_lower_avx2_16 sp_1024_mont_tpl_avx2_16 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -93020,7 +93117,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int n, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -93028,9 +93125,12 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int n, sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_avx2_16(t1, t1, p1024_mod, p1024_mp_mod); #ifdef WOLFSSL_SP_SMALL if (n != 0) @@ -93040,16 +93140,14 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int n, sp_1024_mont_mul_avx2_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_avx2_16(y, b, x, p1024_mod); - sp_1024_mont_mul_avx2_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_avx2_16(y, y, p1024_mod); + sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod); } #ifndef WOLFSSL_SP_SMALL /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -93057,14 +93155,15 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int n, sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod); sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); + 
sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(z, z, y, p1024_mod, p1024_mp_mod); - /* t2 = Y^4 */ + /* t1 = Y^4 */ sp_1024_mont_sqr_avx2_16(t1, t1, p1024_mod, p1024_mp_mod); /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_avx2_16(y, b, x, p1024_mod); - sp_1024_mont_mul_avx2_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_avx2_16(y, y, p1024_mod); + sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod); #endif /* Y = Y/2 */ @@ -93090,6 +93189,7 @@ typedef struct sp_1024_proj_point_add_avx2_16_ctx { sp_digit* t3; sp_digit* t4; sp_digit* t5; + sp_digit* t6; sp_digit* x; sp_digit* y; sp_digit* z; @@ -93118,6 +93218,10 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 ctx->t3 = t + 4*16; ctx->t4 = t + 6*16; ctx->t5 = t + 8*16; + ctx->t6 = t + 10*16; + ctx->x = ctx->t6; + ctx->y = ctx->t1; + ctx->z = ctx->t2; ctx->state = 1; break; @@ -93142,29 +93246,6 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 3: { - int i; - ctx->rp[0] = r; - - /*lint allow cast to different type of pointer*/ - ctx->rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(ctx->rp[1], 0, sizeof(sp_point_1024)); - ctx->x = ctx->rp[p->infinity | q->infinity]->x; - ctx->y = ctx->rp[p->infinity | q->infinity]->y; - ctx->z = ctx->rp[p->infinity | q->infinity]->z; - - ctx->ap[0] = p; - ctx->ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ctx->ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ctx->ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ctx->ap[p->infinity]->z[i]; - } - r->infinity = ctx->ap[p->infinity]->infinity; - ctx->state = 4; break; } @@ -93178,16 +93259,16 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 ctx->state = 6; break; case 6: - sp_1024_mont_mul_avx2_16(ctx->t1, ctx->t1, ctx->x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod); ctx->state = 7; break; case 7: /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_avx2_16(ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 8; break; case 8: - sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t2, ctx->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod); ctx->state = 9; break; case 9: @@ -93196,7 +93277,7 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 10: /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t3, ctx->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod); ctx->state = 11; break; case 11: @@ -93215,29 +93296,29 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 ctx->state = 14; break; case 14: - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_avx2_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); + /* X3 = R^2 - H^3 - 2*U1*H^2 */ + sp_1024_mont_sqr_avx2_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 15; break; case 15: - sp_1024_mont_mul_avx2_16(ctx->z, ctx->z, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); ctx->state = 16; break; case 16: - /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_avx2_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 17; break; 
case 17: - sp_1024_mont_sqr_avx2_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_avx2_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod); ctx->state = 18; break; case 18: - sp_1024_mont_mul_avx2_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod); ctx->state = 19; break; case 19: - sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 20; break; case 20: @@ -93245,24 +93326,24 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 ctx->state = 21; break; case 21: - sp_1024_mont_dbl_avx2_16(ctx->t1, ctx->y, p1024_mod); + sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); ctx->state = 22; break; case 22: - sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t1, p1024_mod); + sp_1024_mont_dbl_avx2_16(ctx->t3, ctx->y, p1024_mod); ctx->state = 23; break; case 23: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t3, p1024_mod); ctx->state = 24; break; case 24: - sp_1024_mont_mul_avx2_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 25; break; case 25: - sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod); ctx->state = 26; break; case 26: @@ -93270,9 +93351,30 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 ctx->state = 27; /* fall-through */ case 27: + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; + err = MP_OKAY; break; } + } if (err == MP_OKAY && ctx->state != 27) { err = FP_WOULDBLOCK; @@ -93284,24 +93386,13 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; - /* Ensure only the first point is the same as the result. 
*/ - if (q == r) { - const sp_point_1024* a = p; - p = q; - q = a; - } /* Check double */ (void)sp_1024_mont_sub_avx2_16(t1, p1024_mod, q->y, p1024_mod); @@ -93311,60 +93402,61 @@ static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r, sp_1024_proj_point_dbl_avx2_16(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t6; + sp_digit* y = t1; + sp_digit* z = t2; + int i; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_avx2_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t1, t1, p->x, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_avx2_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_avx2_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t3, t3, p->y, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ sp_1024_mont_sub_avx2_16(t2, t2, t1, p1024_mod); /* R = S2 - S1 */ sp_1024_mont_sub_avx2_16(t4, t4, t3, p1024_mod); - /* Z3 = H*Z1*Z2 */ - sp_1024_mont_mul_avx2_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(z, z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_avx2_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(y, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod); + /* Z3 = H*Z1*Z2 */ + sp_1024_mont_mul_avx2_16(z, p->z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(z, z, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(x, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(x, x, t5, p1024_mod); - sp_1024_mont_dbl_avx2_16(t1, y, p1024_mod); - sp_1024_mont_sub_avx2_16(x, x, t1, p1024_mod); - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_avx2_16(y, y, x, p1024_mod); - sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t5, t5, t3, p1024_mod, p1024_mp_mod); + sp_1024_mont_dbl_avx2_16(t3, y, p1024_mod); + sp_1024_mont_sub_avx2_16(x, x, t3, p1024_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod); + sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(y, y, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + 
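/*
 * Illustrative sketch (not from the patch itself): the masked copies around
 * this point (x and y above, z just below) replace the old rp[]/ap[]
 * pointer-table handling of the point at infinity with a branch-free select.
 * maskp is all-ones only when q is the point at infinity (and p is not),
 * maskq only when p is (and q is not), and maskt otherwise, so exactly one of
 * p, q, or the freshly computed (x, y, z) reaches r without a data-dependent
 * branch.  Standalone model with hypothetical names:
 */
#include <stdint.h>

typedef uint64_t limb;                        /* stand-in for sp_digit */

/* Copy p, q or t into r depending on the infinity flags, constant-time. */
static void ct_point_coord_select(limb* r, const limb* p, const limb* q,
                                  const limb* t, int p_inf, int q_inf)
{
    limb maskp = (limb)0 - (limb)(q_inf & (!p_inf));  /* keep p: q is infinity */
    limb maskq = (limb)0 - (limb)(p_inf & (!q_inf));  /* keep q: p is infinity */
    limb maskt = ~(maskp | maskq);                    /* keep the computed sum */
    int i;

    for (i = 0; i < 16; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}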
for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -93411,29 +93503,30 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ - sp_1024_mont_sqr_avx2_16(t2, y, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(b, t2, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_avx2_16(t1, b, p1024_mod); - sp_1024_mont_sub_avx2_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); + sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); + /* b = 2.(B - X) */ + sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; - /* t2 = Y^4 */ - sp_1024_mont_sqr_avx2_16(t2, t2, p1024_mod, p1024_mp_mod); + /* t1 = Y^4 */ + sp_1024_mont_sqr_avx2_16(t1, t1, p1024_mod, p1024_mp_mod); if (i != n) { /* W = W*Y^4 */ - sp_1024_mont_mul_avx2_16(w, w, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(w, w, t1, p1024_mod, p1024_mp_mod); } /* y = 2*A*(B - X) - Y^4 */ - sp_1024_mont_sub_avx2_16(y, b, x, p1024_mod); - sp_1024_mont_mul_avx2_16(y, y, a, p1024_mod, p1024_mp_mod); - sp_1024_mont_dbl_avx2_16(y, y, p1024_mod); - sp_1024_mont_sub_avx2_16(y, y, t2, p1024_mod); + sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod); /* Y = Y/2 */ sp_1024_div2_avx2_16(r[j].y, y, p1024_mod); @@ -93459,30 +93552,30 @@ static void sp_1024_proj_point_add_sub_avx2_16(sp_point_1024* ra, sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; sp_digit* t6 = t + 10*16; - sp_digit* x = ra->x; - sp_digit* y = ra->y; - sp_digit* z = ra->z; + sp_digit* xa = ra->x; + sp_digit* ya = ra->y; + sp_digit* za = ra->z; sp_digit* xs = rs->x; sp_digit* ys = rs->y; sp_digit* zs = rs->z; - XMEMCPY(x, p->x, sizeof(p->x) / 2); - XMEMCPY(y, p->y, sizeof(p->y) / 2); - XMEMCPY(z, p->z, sizeof(p->z) / 2); + XMEMCPY(xa, p->x, sizeof(p->x) / 2); + XMEMCPY(ya, p->y, sizeof(p->y) / 2); + XMEMCPY(za, p->z, sizeof(p->z) / 2); ra->infinity = 0; rs->infinity = 0; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_avx2_16(t1, q->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t3, t1, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t1, t1, x, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t1, t1, xa, p1024_mod, p1024_mp_mod); /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_avx2_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(t2, za, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t4, t2, za, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S1 = Y1*Z2^3 */ - sp_1024_mont_mul_avx2_16(t3, t3, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t3, t3, ya, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - U1 */ @@ -93493,30 +93586,30 @@ static void 
sp_1024_proj_point_add_sub_avx2_16(sp_point_1024* ra, sp_1024_mont_sub_avx2_16(t4, t4, t3, p1024_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ - sp_1024_mont_mul_avx2_16(z, z, q->z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(z, z, t2, p1024_mod, p1024_mp_mod); - XMEMCPY(zs, z, sizeof(p->z)/2); + sp_1024_mont_mul_avx2_16(za, za, q->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(za, za, t2, p1024_mod, p1024_mp_mod); + XMEMCPY(zs, za, sizeof(p->z)/2); /* X3 = R^2 - H^3 - 2*U1*H^2 */ /* XS = RS^2 - H^3 - 2*U1*H^2 */ - sp_1024_mont_sqr_avx2_16(x, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(xa, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_avx2_16(xs, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(y, t1, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(ya, t1, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_avx2_16(x, x, t5, p1024_mod); + sp_1024_mont_sub_avx2_16(xa, xa, t5, p1024_mod); sp_1024_mont_sub_avx2_16(xs, xs, t5, p1024_mod); - sp_1024_mont_dbl_avx2_16(t1, y, p1024_mod); - sp_1024_mont_sub_avx2_16(x, x, t1, p1024_mod); + sp_1024_mont_dbl_avx2_16(t1, ya, p1024_mod); + sp_1024_mont_sub_avx2_16(xa, xa, t1, p1024_mod); sp_1024_mont_sub_avx2_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_avx2_16(ys, y, xs, p1024_mod); - sp_1024_mont_sub_avx2_16(y, y, x, p1024_mod); - sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_lower_avx2_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_lower_avx2_16(ya, ya, xa, p1024_mod); + sp_1024_mont_mul_avx2_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_avx2_16(ys, ys, t6, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t5, t5, t3, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_avx2_16(y, y, t5, p1024_mod); + sp_1024_mont_sub_avx2_16(ya, ya, t5, p1024_mod); sp_1024_mont_sub_avx2_16(ys, ys, t5, p1024_mod); } @@ -93690,17 +93783,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; /* Check double */ (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod); @@ -93710,53 +93798,54 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* sp_1024_proj_point_dbl_16(r, p, t); } else { - rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t4, t2, z, 
p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_16(t2, t2, x, p1024_mod); + sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_16(t4, t4, y, p1024_mod); + sp_1024_mont_sub_16(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_16(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_16(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(x, t1, t5, p1024_mod); sp_1024_mont_dbl_16(t1, t3, p1024_mod); sp_1024_mont_sub_16(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -93899,7 +93988,7 @@ static int sp_1024_ecc_mulmod_stripe_16(sp_point_1024* r, const sp_point_1024* g sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 16 * 5]; + sp_digit t[2 * 16 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -93920,7 +94009,7 @@ static int sp_1024_ecc_mulmod_stripe_16(sp_point_1024* r, const sp_point_1024* g if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -94085,7 +94174,7 @@ static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const #ifndef FP_ECC return sp_1024_ecc_mulmod_win_add_sub_16(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 16 * 5]; + sp_digit tmp[2 * 16 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -94135,17 +94224,12 @@ static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - const sp_point_1024* ap[2]; - sp_point_1024* rp[2]; sp_digit* t1 = t; sp_digit* t2 = t + 2*16; sp_digit* t3 = t + 4*16; sp_digit* t4 = t + 6*16; sp_digit* t5 = t + 8*16; - sp_digit* x; - sp_digit* y; - sp_digit* z; - int i; + sp_digit* t6 = t + 10*16; /* Check double */ (void)sp_1024_mont_sub_avx2_16(t1, p1024_mod, q->y, p1024_mod); @@ -94155,53 +94239,54 @@ static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_ sp_1024_proj_point_dbl_avx2_16(r, p, t); } else { - 
rp[0] = r; - - /*lint allow cast to different type of pointer*/ - rp[1] = (sp_point_1024*)t; /*lint !e9087 !e740*/ - XMEMSET(rp[1], 0, sizeof(sp_point_1024)); - x = rp[p->infinity | q->infinity]->x; - y = rp[p->infinity | q->infinity]->y; - z = rp[p->infinity | q->infinity]->z; - - ap[0] = p; - ap[1] = q; - for (i=0; i<16; i++) { - r->x[i] = ap[p->infinity]->x[i]; - } - for (i=0; i<16; i++) { - r->y[i] = ap[p->infinity]->y[i]; - } - for (i=0; i<16; i++) { - r->z[i] = ap[p->infinity]->z[i]; - } - r->infinity = ap[p->infinity]->infinity; + sp_digit maskp; + sp_digit maskq; + sp_digit maskt; + sp_digit* x = t2; + sp_digit* y = t5; + sp_digit* z = t6; + int i; /* U2 = X2*Z1^2 */ - sp_1024_mont_sqr_avx2_16(t2, z, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t4, t2, z, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod); /* S2 = Y2*Z1^3 */ sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod); /* H = U2 - X1 */ - sp_1024_mont_sub_avx2_16(t2, t2, x, p1024_mod); + sp_1024_mont_sub_avx2_16(t2, t2, p->x, p1024_mod); /* R = S2 - Y1 */ - sp_1024_mont_sub_avx2_16(t4, t4, y, p1024_mod); + sp_1024_mont_sub_avx2_16(t4, t4, p->y, p1024_mod); /* Z3 = H*Z1 */ - sp_1024_mont_mul_avx2_16(z, z, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ sp_1024_mont_sqr_avx2_16(t1, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t3, x, t5, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(x, t1, t5, p1024_mod); sp_1024_mont_dbl_avx2_16(t1, t3, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t1, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_avx2_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_lower_avx2_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_avx2_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t5, t5, y, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(y, t3, t5, p1024_mod); + + maskp = 0 - (q->infinity & (!p->infinity)); + maskq = 0 - (p->infinity & (!q->infinity)); + maskt = ~(maskp | maskq); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); + } + r->z[0] |= p->infinity & q->infinity; + r->infinity = p->infinity & q->infinity; } } @@ -94344,7 +94429,7 @@ static int sp_1024_ecc_mulmod_stripe_avx2_16(sp_point_1024* r, const sp_point_10 sp_digit* t = NULL; #else sp_point_1024 rt[2]; - sp_digit t[2 * 16 * 5]; + sp_digit t[2 * 16 * 6]; #endif sp_point_1024* p = NULL; int i; @@ -94365,7 +94450,7 @@ static int sp_1024_ecc_mulmod_stripe_avx2_16(sp_point_1024* r, const sp_point_10 if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 5, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 16 * 6, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -94437,7 +94522,7 @@ static int sp_1024_ecc_mulmod_avx2_16(sp_point_1024* r, const sp_point_1024* g, #ifndef FP_ECC 
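/*
 * Illustrative sketch (not from the patch itself): every tN used by the
 * point add/double routines is a 2*16-digit slice of one scratch buffer,
 * laid out back to back.  The reworked addition needs a sixth slice (t6,
 * which x now aliases), which is why the 2 * 16 * 5 sizings in this file
 * become 2 * 16 * 6, as in the tmp[] declaration just below.  Layout sketch
 * with hypothetical names:
 */
#include <stdint.h>

#define WORDS_1024 16                     /* 16 x 64-bit digits per element */

static void scratch_layout(uint64_t* t, uint64_t** t1, uint64_t** t2,
                           uint64_t** t3, uint64_t** t4, uint64_t** t5,
                           uint64_t** t6)
{
    *t1 = t + 0 * 2 * WORDS_1024;
    *t2 = t + 1 * 2 * WORDS_1024;
    *t3 = t + 2 * 2 * WORDS_1024;
    *t4 = t + 3 * 2 * WORDS_1024;
    *t5 = t + 4 * 2 * WORDS_1024;
    *t6 = t + 5 * 2 * WORDS_1024;        /* new slice: total is now 2 * 16 * 6 digits */
}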
return sp_1024_ecc_mulmod_win_add_sub_avx2_16(r, g, k, map, ct, heap); #else - sp_digit tmp[2 * 16 * 5]; + sp_digit tmp[2 * 16 * 6]; sp_cache_1024_t* cache; int err = MP_OKAY; @@ -97988,7 +98073,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, sp_digit* k = NULL; #else sp_point_1024 point[2]; - sp_digit k[16 + 16 * 2 * 5]; + sp_digit k[16 + 16 * 2 * 6]; #endif sp_point_1024* addP = NULL; sp_digit* tmp = NULL; @@ -98004,7 +98089,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, err = MEMORY_E; if (err == MP_OKAY) { k = (sp_digit*)XMALLOC( - sizeof(sp_digit) * (16 + 16 * 2 * 5), + sizeof(sp_digit) * (16 + 16 * 2 * 6), heap, DYNAMIC_TYPE_ECC); if (k == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index b0ad90f63..21c4eaf80 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -3837,7 +3837,6 @@ sp_2048_sub_in_place_32: _sp_2048_sub_in_place_32: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -3933,7 +3932,7 @@ _sp_2048_sub_in_place_32: movq %rdx, 240(%rdi) sbbq 248(%rsi), %rcx movq %rcx, 248(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_2048_sub_in_place_32,.-sp_2048_sub_in_place_32 @@ -7618,78 +7617,9 @@ L_end_2048_sqr_avx2_16: .size sp_2048_sqr_avx2_16,.-sp_2048_sqr_avx2_16 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. - */ -#ifndef __APPLE__ -.text -.globl sp_2048_dbl_16 -.type sp_2048_dbl_16,@function -.align 16 -sp_2048_dbl_16: -#else -.section __TEXT,__text -.globl _sp_2048_dbl_16 -.p2align 4 -_sp_2048_dbl_16: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq 96(%rsi), %rdx - movq %rcx, 88(%rdi) - adcq %rdx, %rdx - movq 104(%rsi), %rcx - movq %rdx, 96(%rdi) - adcq %rcx, %rcx - movq 112(%rsi), %rdx - movq %rcx, 104(%rdi) - adcq %rdx, %rdx - movq 120(%rsi), %rcx - movq %rdx, 112(%rdi) - adcq %rcx, %rcx - movq %rcx, 120(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_dbl_16,.-sp_2048_dbl_16 -#endif /* __APPLE__ */ /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. 
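/*
 * Illustrative sketch (not from the patch itself): the rewritten
 * sp_2048_sqr_32 below (and its AVX2 twin) squares the absolute difference
 * of the two halves instead of their sum, relying on
 *     2*al*ah = al^2 + ah^2 - (al - ah)^2,
 * which is what lets the old sp_2048_dbl_16 helper and the masked top-half
 * handling be dropped.  The identity in miniature, on a 64-bit value split
 * into 32-bit halves (uses the common unsigned __int128 extension):
 */
#include <assert.h>
#include <stdint.h>

static void karatsuba_sqr_model(uint64_t a)
{
    uint32_t al = (uint32_t)a;
    uint32_t ah = (uint32_t)(a >> 32);
    uint64_t al2 = (uint64_t)al * al;
    uint64_t ah2 = (uint64_t)ah * ah;
    uint32_t d   = (al >= ah) ? (al - ah) : (ah - al);   /* |al - ah| */
    uint64_t d2  = (uint64_t)d * d;

    /* middle term recovered from the three squares */
    unsigned __int128 mid = (unsigned __int128)al2 + ah2 - d2;   /* = 2*al*ah */
    unsigned __int128 sq  = ((unsigned __int128)ah2 << 64) + (mid << 32) + al2;

    assert(sq == (unsigned __int128)a * a);
}

int main(void)
{
    karatsuba_sqr_model(0x0123456789abcdefULL);
    karatsuba_sqr_model(0xffffffff00000001ULL);
    karatsuba_sqr_model(0);
    return 0;
}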
@@ -7706,63 +7636,143 @@ sp_2048_sqr_32: .p2align 4 _sp_2048_sqr_32: #endif /* __APPLE__ */ - subq $0x298, %rsp - movq %rdi, 640(%rsp) - movq %rsi, 648(%rsp) - leaq 512(%rsp), %r8 + subq $0x110, %rsp + movq %rdi, 256(%rsp) + movq %rsi, 264(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 128(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax + movq %rax, 120(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 120(%r8) - adcq $0x00, %rcx - movq %rcx, 656(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -7770,528 +7780,387 @@ _sp_2048_sqr_32: #else callq _sp_2048_sqr_16 #endif /* __APPLE__ */ - movq 648(%rsp), %rsi - leaq 256(%rsp), %rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi addq $0x80, %rsi + addq $0x100, %rdi #ifndef __APPLE__ callq sp_2048_sqr_16@plt #else callq _sp_2048_sqr_16 #endif /* __APPLE__ */ - movq 648(%rsp), %rsi - movq 640(%rsp), 
%rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi #ifndef __APPLE__ callq sp_2048_sqr_16@plt #else callq _sp_2048_sqr_16 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 648(%rsp), %rsi - movq 640(%rsp), %rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi #endif /* _WIN64 */ - movq 656(%rsp), %r10 - leaq 512(%rsp), %r8 - movq %r10, %rcx - negq %r10 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 256(%rdi) - movq %rax, 264(%rdi) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 272(%rdi) - movq %rax, 280(%rdi) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 288(%rdi) - movq %rax, 296(%rdi) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 304(%rdi) - movq %rax, 312(%rdi) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 320(%rdi) - movq %rax, 328(%rdi) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 336(%rdi) - movq %rax, 344(%rdi) - movq 96(%r8), %rdx - movq 104(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 352(%rdi) - movq %rax, 360(%rdi) - movq 112(%r8), %rdx - movq 120(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 368(%rdi) - movq %rax, 376(%rdi) - movq 256(%rdi), %rdx - addq %rdx, %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq %rax, %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq %rdx, %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq %rax, %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq %rdx, %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq %rax, %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq %rdx, %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq %rax, %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq %rdx, %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq %rax, %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq %rdx, %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq %rax, %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq %rdx, %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq %rax, %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq %rdx, %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq %rax, %rax - movq %rax, 376(%rdi) - adcq $0x00, %rcx - leaq 256(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), 
%rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq %rax, 248(%r8) + movq 256(%rsp), %rsi + leaq 128(%rsp), %r8 + addq $0x180, %rsi + movq $0x00, %rcx + movq -128(%r8), %rax + subq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq %rdx, 120(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq 
%rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq %rax, 248(%r8) + subq $0x100, %rsi + movq -128(%r8), %rax + subq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + 
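/*
 * Illustrative sketch (not from the patch itself): the "# Cond Negate"
 * sequence earlier in this routine turns the signed difference al - ah into
 * |al - ah| without branching.  The final borrow is smeared into a word mask
 * m (0 or all-ones); each limb is XORed with m and m & 1 is added back with
 * carry, i.e. two's-complement negation is applied only when the subtraction
 * borrowed.  A 4-limb model of the net effect (hypothetical helper, uses the
 * unsigned __int128 extension):
 */
#include <assert.h>
#include <stdint.h>

#define LIMBS 4

/* Negate a in place when mask is all-ones; leave it untouched when mask is 0. */
static void cond_negate(uint64_t a[LIMBS], uint64_t mask)
{
    unsigned __int128 carry = mask & 1;          /* the "+ 1" of ~a + 1 */
    int i;

    for (i = 0; i < LIMBS; i++) {
        unsigned __int128 v = (unsigned __int128)(a[i] ^ mask) + carry;
        a[i]  = (uint64_t)v;
        carry = v >> 64;
    }
}

static void cond_negate_check(void)
{
    uint64_t a[LIMBS] = { 5, 0, 0, 0 };
    cond_negate(a, ~(uint64_t)0);                /* -> -5 mod 2^256 */
    assert(a[0] == (uint64_t)-5 && a[1] == ~(uint64_t)0 &&
           a[2] == ~(uint64_t)0 && a[3] == ~(uint64_t)0);
    cond_negate(a, 0);                           /* mask 0: unchanged */
    assert(a[0] == (uint64_t)-5);
}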
movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq %rdx, 120(%r8) sbbq $0x00, %rcx - # Add in place - movq 128(%rdi), %rdx - addq (%r8), %rdx - movq 136(%rdi), %rax - movq %rdx, 128(%rdi) - adcq 8(%r8), %rax - movq 144(%rdi), %rdx - movq %rax, 136(%rdi) - adcq 16(%r8), %rdx - movq 152(%rdi), %rax - movq %rdx, 144(%rdi) - adcq 24(%r8), %rax - movq 160(%rdi), %rdx - movq %rax, 152(%rdi) - adcq 32(%r8), %rdx - movq 168(%rdi), %rax - movq %rdx, 160(%rdi) - adcq 40(%r8), %rax - movq 176(%rdi), %rdx - movq %rax, 168(%rdi) - adcq 48(%r8), %rdx - movq 184(%rdi), %rax - movq %rdx, 176(%rdi) - adcq 56(%r8), %rax - movq 192(%rdi), %rdx - movq %rax, 184(%rdi) - adcq 64(%r8), %rdx - movq 200(%rdi), %rax - movq %rdx, 192(%rdi) - adcq 72(%r8), %rax - movq 208(%rdi), %rdx - movq %rax, 200(%rdi) - adcq 80(%r8), %rdx - movq 216(%rdi), %rax - movq %rdx, 208(%rdi) - adcq 88(%r8), %rax - movq 224(%rdi), %rdx - movq %rax, 216(%rdi) - adcq 96(%r8), %rdx - movq 232(%rdi), %rax - movq %rdx, 224(%rdi) - adcq 104(%r8), %rax - movq 240(%rdi), %rdx - movq %rax, 232(%rdi) - adcq 112(%r8), %rdx - movq 248(%rdi), %rax - movq %rdx, 240(%rdi) - adcq 120(%r8), %rax - movq 256(%rdi), %rdx - movq %rax, 248(%rdi) - adcq 128(%r8), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 136(%r8), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 144(%r8), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 152(%r8), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 160(%r8), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 168(%r8), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 176(%r8), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 184(%r8), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 192(%r8), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 200(%r8), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 208(%r8), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 216(%r8), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 224(%r8), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 232(%r8), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 240(%r8), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 248(%r8), %rax - movq %rax, 376(%rdi) - adcq $0x00, %rcx - movq %rcx, 384(%rdi) - # Add in place - movq 256(%rdi), %rdx - xorq %rcx, %rcx - addq (%rsi), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 8(%rsi), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 16(%rsi), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 24(%rsi), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 32(%rsi), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 40(%rsi), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 48(%rsi), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 56(%rsi), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 64(%rsi), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 72(%rsi), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 80(%rsi), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 88(%rsi), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 96(%rsi), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 104(%rsi), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 112(%rsi), %rdx - 
movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 120(%rsi), %rax - movq 384(%rdi), %rdx - movq %rax, 376(%rdi) - adcq 128(%rsi), %rdx - movq %rdx, 384(%rdi) - adcq $0x00, %rcx - # Add to zero - movq 136(%rsi), %rdx + movq 256(%rsp), %rdi + negq %rcx + addq $0x100, %rdi + movq -128(%rdi), %rax + subq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq %rdx, 120(%rdi) + sbbq $0x00, %rcx + movq 256(%rsp), %rdi + addq $0x180, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 392(%rdi) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 400(%rdi) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 408(%rdi) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 168(%rsi), %rdx - movq %rax, 416(%rdi) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 424(%rdi) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 432(%rdi) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 192(%rsi), %rax - movq %rdx, 440(%rdi) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 200(%rsi), %rdx - movq %rax, 448(%rdi) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 208(%rsi), %rax - movq %rdx, 456(%rdi) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 
216(%rsi), %rdx - movq %rax, 464(%rdi) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 224(%rsi), %rax - movq %rdx, 472(%rdi) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 232(%rsi), %rdx - movq %rax, 480(%rdi) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 240(%rsi), %rax - movq %rdx, 488(%rdi) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 248(%rsi), %rdx - movq %rax, 496(%rdi) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq %rdx, 504(%rdi) - addq $0x298, %rsp + movq %rdx, 120(%rdi) + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + addq $0x110, %rsp repz retq #ifndef __APPLE__ .size sp_2048_sqr_32,.-sp_2048_sqr_32 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. @@ -8308,63 +8177,143 @@ sp_2048_sqr_avx2_32: .p2align 4 _sp_2048_sqr_avx2_32: #endif /* __APPLE__ */ - subq $0x298, %rsp - movq %rdi, 640(%rsp) - movq %rsi, 648(%rsp) - leaq 512(%rsp), %r8 + subq $0x110, %rsp + movq %rdi, 256(%rsp) + movq %rsi, 264(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 128(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax + movq %rax, 120(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, 
%rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 120(%r8) - adcq $0x00, %rcx - movq %rcx, 656(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -8372,490 +8321,379 @@ _sp_2048_sqr_avx2_32: #else callq _sp_2048_sqr_avx2_16 #endif /* __APPLE__ */ - movq 648(%rsp), %rsi - leaq 256(%rsp), %rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi addq $0x80, %rsi + addq $0x100, %rdi #ifndef __APPLE__ callq sp_2048_sqr_avx2_16@plt #else callq _sp_2048_sqr_avx2_16 #endif /* __APPLE__ */ - movq 648(%rsp), %rsi - movq 640(%rsp), %rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi #ifndef __APPLE__ callq sp_2048_sqr_avx2_16@plt #else callq _sp_2048_sqr_avx2_16 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 648(%rsp), %rsi - movq 640(%rsp), %rdi + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi #endif /* _WIN64 */ - movq 656(%rsp), %r10 - leaq 512(%rsp), %r8 - movq %r10, %rcx - negq %r10 - movq (%r8), %rdx - pextq %r10, %rdx, %rdx - addq %rdx, %rdx - movq 8(%r8), %rax - movq %rdx, 256(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 16(%r8), %rdx - movq %rax, 264(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 24(%r8), %rax - movq %rdx, 272(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 32(%r8), %rdx - movq %rax, 280(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 40(%r8), %rax - movq %rdx, 288(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 48(%r8), %rdx - movq %rax, 296(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 56(%r8), %rax - movq %rdx, 304(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 64(%r8), %rdx - movq %rax, 312(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 72(%r8), %rax - movq %rdx, 320(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 80(%r8), %rdx - movq %rax, 328(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 88(%r8), %rax - movq %rdx, 336(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 96(%r8), %rdx - movq %rax, 344(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 104(%r8), %rax - movq %rdx, 352(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 112(%r8), %rdx - movq %rax, 360(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 120(%r8), %rax - movq %rdx, 368(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq %rax, 376(%rdi) - adcq $0x00, %rcx - leaq 256(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 
96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq %rax, 248(%r8) + movq 256(%rsp), %rsi + leaq 128(%rsp), %r8 + addq $0x180, %rsi + movq $0x00, %rcx + movq -128(%r8), %rax + subq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq %rdx, 120(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx 
- movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq %rax, 248(%r8) + subq $0x100, %rsi + movq -128(%r8), %rax + subq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) 
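/*
 * Illustrative sketch (not from the patch itself): both the plain and the
 * AVX2 sqr_32 now share the same flow (the AVX2 path drops its old
 * pextq-based masked copy).  The stack scratch holds |al - ah| and then its
 * square, the result buffer holds al^2 in the low half and ah^2 in the high
 * half, and the cross term is folded in by forming the scratch value
 * (al - ah)^2 - ah^2 - al^2 = -(2*al*ah) and subtracting it from the middle
 * 32 limbs, with the leftover borrow pushed into the top quarter by the
 * "Add in word" chain.  Miniature model of that signed-middle recombination
 * (uses the __int128 extension; names are hypothetical):
 */
#include <assert.h>
#include <stdint.h>

static void sqr_signed_middle_model(uint64_t a)
{
    uint32_t al = (uint32_t)a;
    uint32_t ah = (uint32_t)(a >> 32);
    uint64_t al2 = (uint64_t)al * al;
    uint64_t ah2 = (uint64_t)ah * ah;
    uint32_t d   = (al >= ah) ? (al - ah) : (ah - al);

    /* scratch after the two subtraction chains: s = (al-ah)^2 - ah^2 - al^2 <= 0 */
    __int128 s = (__int128)((uint64_t)d * d) - ah2 - al2;

    /* r starts as ah^2 | al^2; removing s at the half-width offset adds 2*al*ah */
    unsigned __int128 r = (((unsigned __int128)ah2) << 64) + al2;
    r += (unsigned __int128)(-s) << 32;          /* -s = 2*al*ah */

    assert(r == (unsigned __int128)a * a);
}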
+ sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq %rdx, 120(%r8) sbbq $0x00, %rcx - # Add in place - movq 128(%rdi), %rdx - addq (%r8), %rdx - movq 136(%rdi), %rax - movq %rdx, 128(%rdi) - adcq 8(%r8), %rax - movq 144(%rdi), %rdx - movq %rax, 136(%rdi) - adcq 16(%r8), %rdx - movq 152(%rdi), %rax - movq %rdx, 144(%rdi) - adcq 24(%r8), %rax - movq 160(%rdi), %rdx - movq %rax, 152(%rdi) - adcq 32(%r8), %rdx - movq 168(%rdi), %rax - movq %rdx, 160(%rdi) - adcq 40(%r8), %rax - movq 176(%rdi), %rdx - movq %rax, 168(%rdi) - adcq 48(%r8), %rdx - movq 184(%rdi), %rax - movq %rdx, 176(%rdi) - adcq 56(%r8), %rax - movq 192(%rdi), %rdx - movq %rax, 184(%rdi) - adcq 64(%r8), %rdx - movq 200(%rdi), %rax - movq %rdx, 192(%rdi) - adcq 72(%r8), %rax - movq 208(%rdi), %rdx - movq %rax, 200(%rdi) - adcq 80(%r8), %rdx - movq 216(%rdi), %rax - movq %rdx, 208(%rdi) - adcq 88(%r8), %rax - movq 224(%rdi), %rdx - movq %rax, 216(%rdi) - adcq 96(%r8), %rdx - movq 232(%rdi), %rax - movq %rdx, 224(%rdi) - adcq 104(%r8), %rax - movq 240(%rdi), %rdx - movq %rax, 232(%rdi) - adcq 112(%r8), %rdx - movq 248(%rdi), %rax - movq %rdx, 240(%rdi) - adcq 120(%r8), %rax - movq 256(%rdi), %rdx - movq %rax, 248(%rdi) - adcq 128(%r8), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 136(%r8), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 144(%r8), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 152(%r8), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 160(%r8), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 168(%r8), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 176(%r8), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 184(%r8), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 192(%r8), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 200(%r8), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 208(%r8), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 216(%r8), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 224(%r8), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 232(%r8), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 240(%r8), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 248(%r8), %rax - movq %rax, 376(%rdi) - adcq $0x00, %rcx - movq %rcx, 384(%rdi) - # Add in place - movq 256(%rdi), %rdx - xorq %rcx, %rcx - addq (%rsi), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 8(%rsi), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 16(%rsi), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 24(%rsi), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 32(%rsi), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 40(%rsi), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 48(%rsi), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 56(%rsi), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 64(%rsi), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 72(%rsi), %rax - 
movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 80(%rsi), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 88(%rsi), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 96(%rsi), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 104(%rsi), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 112(%rsi), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 120(%rsi), %rax - movq 384(%rdi), %rdx - movq %rax, 376(%rdi) - adcq 128(%rsi), %rdx - movq %rdx, 384(%rdi) - adcq $0x00, %rcx - # Add to zero - movq 136(%rsi), %rdx + movq 256(%rsp), %rdi + negq %rcx + addq $0x100, %rdi + movq -128(%rdi), %rax + subq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq %rdx, 120(%rdi) + sbbq $0x00, %rcx + movq 256(%rsp), %rdi + addq $0x180, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 392(%rdi) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 400(%rdi) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 408(%rdi) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 168(%rsi), %rdx - movq %rax, 416(%rdi) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 424(%rdi) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 432(%rdi) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq 
$0x00, %rdx - movq 192(%rsi), %rax - movq %rdx, 440(%rdi) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 200(%rsi), %rdx - movq %rax, 448(%rdi) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 208(%rsi), %rax - movq %rdx, 456(%rdi) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 216(%rsi), %rdx - movq %rax, 464(%rdi) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 224(%rsi), %rax - movq %rdx, 472(%rdi) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 232(%rsi), %rdx - movq %rax, 480(%rdi) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 240(%rsi), %rax - movq %rdx, 488(%rdi) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 248(%rsi), %rdx - movq %rax, 496(%rdi) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq %rdx, 504(%rdi) - addq $0x298, %rsp + movq %rdx, 120(%rdi) + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + addq $0x110, %rsp repz retq #ifndef __APPLE__ .size sp_2048_sqr_avx2_32,.-sp_2048_sqr_avx2_32 @@ -8879,7 +8717,6 @@ sp_2048_sub_in_place_16: _sp_2048_sub_in_place_16: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -8927,7 +8764,7 @@ _sp_2048_sub_in_place_16: movq %rdx, 112(%rdi) sbbq 120(%rsi), %rcx movq %rcx, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_2048_sub_in_place_16,.-sp_2048_sub_in_place_16 @@ -9230,7 +9067,6 @@ sp_2048_cond_sub_16: _sp_2048_cond_sub_16: #endif /* __APPLE__ */ subq $0x80, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -9343,7 +9179,7 @@ _sp_2048_cond_sub_16: sbbq %rdx, %r9 movq %r8, 112(%rdi) movq %r9, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x80, %rsp repz retq #ifndef __APPLE__ @@ -9588,7 +9424,6 @@ sp_2048_cond_sub_avx2_16: .p2align 4 _sp_2048_cond_sub_avx2_16: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -9669,7 +9504,7 @@ _sp_2048_cond_sub_avx2_16: movq %r10, 112(%rdi) sbbq %r9, %r8 movq %r8, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_2048_cond_sub_avx2_16,.-sp_2048_cond_sub_avx2_16 @@ -10131,6 +9966,1173 @@ _sp_2048_cmp_16: #ifndef __APPLE__ .size sp_2048_cmp_16,.-sp_2048_cmp_16 #endif /* __APPLE__ */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_2048_get_from_table_16 +.type sp_2048_get_from_table_16,@function +.align 16 +sp_2048_get_from_table_16: +#else +.section __TEXT,__text +.globl _sp_2048_get_from_table_16 +.p2align 4 +_sp_2048_get_from_table_16: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + pxor %xmm13, %xmm13 + pshufd $0x00, %xmm11, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + # START: 0-7 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, 
%xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 
32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 
168(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + 
por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 0-7 + # START: 8-15 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + 
movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, 
%xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, 
%xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + # END: 8-15 + repz retq +#ifndef __APPLE__ +.size sp_2048_get_from_table_16,.-sp_2048_get_from_table_16 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 2048 bits using Montgomery reduction. 
* @@ -10469,6 +11471,553 @@ L_2048_mont_reduce_avx2_16_loop: .size sp_2048_mont_reduce_avx2_16,.-sp_2048_mont_reduce_avx2_16 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_2048_get_from_table_avx2_16 +.type sp_2048_get_from_table_avx2_16,@function +.align 16 +sp_2048_get_from_table_avx2_16: +#else +.section __TEXT,__text +.globl _sp_2048_get_from_table_avx2_16 +.p2align 4 +_sp_2048_get_from_table_avx2_16: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + vpxor %ymm13, %ymm13, %ymm13 + vpermd %ymm10, %ymm13, %ymm10 + vpermd %ymm11, %ymm13, %ymm11 + # START: 0-15 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + 
vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, 
%ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, 
%ymm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), 
%ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + # END: 0-15 + repz retq +#ifndef __APPLE__ +.size sp_2048_get_from_table_avx2_16,.-sp_2048_get_from_table_avx2_16 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -10490,7 +12039,6 @@ sp_2048_cond_sub_32: _sp_2048_cond_sub_32: #endif /* __APPLE__ */ subq $0x100, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -10715,7 +12263,7 @@ _sp_2048_cond_sub_32: sbbq %rdx, %r9 movq %r8, 240(%rdi) movq %r9, 248(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x100, %rsp repz retq #ifndef __APPLE__ @@ -11118,7 +12666,6 @@ sp_2048_sub_32: _sp_2048_sub_32: #endif /* __APPLE__ */ movq (%rsi), %rcx - xorq %rax, %rax subq (%rdx), %rcx movq 8(%rsi), %r8 movq %rcx, (%rdi) @@ -11214,7 +12761,7 @@ _sp_2048_sub_32: movq %rcx, 240(%rdi) sbbq 248(%rdx), %r8 movq %r8, 248(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_2048_sub_32,.-sp_2048_sub_32 @@ -11487,7 +13034,6 @@ sp_2048_cond_sub_avx2_32: .p2align 4 _sp_2048_cond_sub_avx2_32: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -11648,7 +13194,7 @@ _sp_2048_cond_sub_avx2_32: movq %r8, 240(%rdi) sbbq %r10, %r9 movq %r9, 248(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_2048_cond_sub_avx2_32,.-sp_2048_cond_sub_avx2_32 @@ -11938,6 +13484,4621 @@ _sp_2048_cmp_32: #ifndef __APPLE__ .size sp_2048_cmp_32,.-sp_2048_cmp_32 #endif /* __APPLE__ */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_2048_get_from_table_32 +.type sp_2048_get_from_table_32,@function +.align 16 +sp_2048_get_from_table_32: +#else +.section __TEXT,__text +.globl _sp_2048_get_from_table_32 +.p2align 4 +_sp_2048_get_from_table_32: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + pxor %xmm13, %xmm13 + pshufd $0x00, %xmm11, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + # START: 0-7 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por 
%xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand 
%xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd 
%xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, 
%xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, 
%xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, 
%xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por 
%xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 0-7 + # START: 8-15 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand 
%xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 
+ por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + 
addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu 
(%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 
48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + 
pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + 
por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 8-15 + # START: 16-23 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 
48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand 
%xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por 
%xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + addq 
$0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), 
%xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 
+ pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, 
%xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 16-23 + # START: 24-31 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), 
%xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand 
%xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + 
por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + 
paddd %xmm11, %xmm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + addq $0xc0, %rcx + 
movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 
16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, 
%xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + # END: 24-31 + repz retq +#ifndef __APPLE__ +.size sp_2048_get_from_table_32,.-sp_2048_get_from_table_32 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 2048 bits using Montgomery reduction. * @@ -12351,6 +18512,2165 @@ L_2048_mont_reduce_avx2_32_loop: .size sp_2048_mont_reduce_avx2_32,.-sp_2048_mont_reduce_avx2_32 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_2048_get_from_table_avx2_32 +.type sp_2048_get_from_table_avx2_32,@function +.align 16 +sp_2048_get_from_table_avx2_32: +#else +.section __TEXT,__text +.globl _sp_2048_get_from_table_avx2_32 +.p2align 4 +_sp_2048_get_from_table_avx2_32: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + vpxor %ymm13, %ymm13, %ymm13 + vpermd %ymm10, %ymm13, %ymm10 + vpermd %ymm11, %ymm13, %ymm11 + # START: 0-15 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + 
vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, 
%ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 19 + movq 152(%rsi), 
%rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + 
vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand 
%ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, 
%ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu 
(%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + 
vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 0-15 + # START: 16-31 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, 
%ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + 
vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 
+ vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, 
%ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 32 + movq 256(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 33 + movq 264(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 34 + movq 272(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 35 + movq 280(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, 
%ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 36 + movq 288(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 37 + movq 296(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 38 + movq 304(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 39 + movq 312(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 40 + movq 320(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 41 + movq 328(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 42 + movq 336(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 43 + movq 344(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, 
%ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 44 + movq 352(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 45 + movq 360(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 46 + movq 368(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 47 + movq 376(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 48 + movq 384(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 49 + movq 392(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 50 + movq 400(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 51 + movq 408(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, 
%ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 52 + movq 416(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 53 + movq 424(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 54 + movq 432(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 55 + movq 440(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 56 + movq 448(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 57 + movq 456(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 58 + movq 464(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 59 + movq 472(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, 
%ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 60 + movq 480(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 61 + movq 488(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 62 + movq 496(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 63 + movq 504(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + # END: 16-31 + repz retq +#ifndef __APPLE__ +.size sp_2048_get_from_table_avx2_32,.-sp_2048_get_from_table_avx2_32 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -15205,7 +23525,6 @@ sp_3072_sub_in_place_24: _sp_3072_sub_in_place_24: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -15277,7 +23596,7 @@ _sp_3072_sub_in_place_24: movq %rdx, 176(%rdi) sbbq 184(%rsi), %rcx movq %rcx, 184(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_3072_sub_in_place_24,.-sp_3072_sub_in_place_24 @@ -16499,7 +24818,6 @@ sp_3072_sub_in_place_48: _sp_3072_sub_in_place_48: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -16643,7 +24961,7 @@ _sp_3072_sub_in_place_48: movq %rdx, 368(%rdi) sbbq 376(%rsi), %rcx movq %rcx, 376(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_3072_sub_in_place_48,.-sp_3072_sub_in_place_48 @@ -20141,66 +28459,9 @@ L_end_3072_sqr_avx2_12: .size sp_3072_sqr_avx2_12,.-sp_3072_sqr_avx2_12 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
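The sp_2048_get_from_table_avx2_32 routine that ends above is a cache-attack-resistant table read: every table entry is loaded unconditionally, compared against a running counter, AND-ed with the resulting all-ones/all-zeros mask and OR-ed into the accumulator, so the memory access pattern never depends on the secret index. A minimal portable C sketch of the same masked-select idea (illustrative only; the helper name and parameters are hypothetical, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* Constant-time table read: touch every entry and let a 0/-1 mask
     * select the wanted one, so the access pattern is independent of
     * the secret index.  Sketch only, not code from the patch. */
    void get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                           size_t entries, size_t words, size_t idx)
    {
        size_t i, j;

        for (j = 0; j < words; j++)
            r[j] = 0;
        for (i = 0; i < entries; i++) {
            uint64_t diff = (uint64_t)(i ^ idx);
            /* mask is all ones when i == idx, all zeros otherwise */
            uint64_t mask = ((diff | (0ULL - diff)) >> 63) - 1;
            for (j = 0; j < words; j++)
                r[j] |= table[i][j] & mask;
        }
    }

The assembly does the same select 256 bits at a time: vpcmpeqd builds the lane mask from the running counter, and vpand/vpor accumulate the selected entry.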
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_dbl_12 -.type sp_3072_dbl_12,@function -.align 16 -sp_3072_dbl_12: -#else -.section __TEXT,__text -.globl _sp_3072_dbl_12 -.p2align 4 -_sp_3072_dbl_12: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq %rcx, 88(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_3072_dbl_12,.-sp_3072_dbl_12 -#endif /* __APPLE__ */ /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. @@ -20217,51 +28478,111 @@ sp_3072_sqr_24: .p2align 4 _sp_3072_sqr_24: #endif /* __APPLE__ */ - subq $0x1f8, %rsp - movq %rdi, 480(%rsp) - movq %rsi, 488(%rsp) - leaq 384(%rsp), %r8 + subq $0xd0, %rsp + movq %rdi, 192(%rsp) + movq %rsi, 200(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 96(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax + movq %rax, 88(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 
88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 88(%r8) - adcq $0x00, %rcx - movq %rcx, 496(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -20269,409 +28590,303 @@ _sp_3072_sqr_24: #else callq _sp_3072_sqr_12 #endif /* __APPLE__ */ - movq 488(%rsp), %rsi - leaq 192(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi addq $0x60, %rsi + addq $0xc0, %rdi #ifndef __APPLE__ callq sp_3072_sqr_12@plt #else callq _sp_3072_sqr_12 #endif /* __APPLE__ */ - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi #ifndef __APPLE__ callq sp_3072_sqr_12@plt #else callq _sp_3072_sqr_12 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi #endif /* _WIN64 */ - movq 496(%rsp), %r10 - movq %rdi, %r9 - leaq 384(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0xc0, %r9 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, (%r9) - movq %rax, 8(%r9) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 16(%r9) - movq %rax, 24(%r9) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 32(%r9) - movq %rax, 40(%r9) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 48(%r9) - movq %rax, 56(%r9) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 64(%r9) - movq %rax, 72(%r9) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 80(%r9) - movq %rax, 88(%r9) - movq (%r9), %rdx - addq %rdx, %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq %rax, %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq %rdx, %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq %rax, %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq %rdx, %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq %rax, %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq %rdx, %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq %rax, %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq %rdx, %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq %rax, %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq %rdx, %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq %rax, %rax - movq %rax, 88(%r9) - adcq $0x00, %rcx - leaq 192(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax 
- movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq %rax, 184(%r8) + movq 192(%rsp), %rsi + leaq 96(%rsp), %r8 + addq $0x120, %rsi + movq $0x00, %rcx + movq -96(%r8), %rax + subq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq %rdx, 88(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx 
- movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq %rax, 184(%r8) + subq $0xc0, %rsi + movq -96(%r8), %rax + subq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq %rdx, 88(%r8) sbbq $0x00, %rcx - subq $0x60, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - movq %rcx, 288(%rdi) - # Add in place - movq 96(%r9), %rdx - addq (%rsi), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 8(%rsi), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 16(%rsi), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 24(%rsi), %rax - 
movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 32(%rsi), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 40(%rsi), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 48(%rsi), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 56(%rsi), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 64(%rsi), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 72(%rsi), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 80(%rsi), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 88(%rsi), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 96(%rsi), %rdx - movq %rdx, 192(%r9) - # Add to zero - movq 104(%rsi), %rdx + movq 192(%rsp), %rdi + negq %rcx + addq $0xc0, %rdi + movq -96(%rdi), %rax + subq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq %rdx, 88(%rdi) + sbbq $0x00, %rcx + movq 192(%rsp), %rdi + addq $0x120, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 112(%rsi), %rax - movq %rdx, 200(%r9) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 120(%rsi), %rdx - movq %rax, 208(%r9) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 128(%rsi), %rax - movq %rdx, 216(%r9) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 136(%rsi), %rdx - movq %rax, 224(%r9) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 232(%r9) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 240(%r9) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 248(%r9) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 168(%rsi), %rdx - movq %rax, 256(%r9) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 264(%r9) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 272(%r9) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq %rdx, 280(%r9) - addq 
$0x1f8, %rsp + movq %rdx, 88(%rdi) + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi + addq $0xd0, %rsp repz retq #ifndef __APPLE__ .size sp_3072_sqr_24,.-sp_3072_sqr_24 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. @@ -20688,51 +28903,111 @@ sp_3072_sqr_avx2_24: .p2align 4 _sp_3072_sqr_avx2_24: #endif /* __APPLE__ */ - subq $0x1f8, %rsp - movq %rdi, 480(%rsp) - movq %rsi, 488(%rsp) - leaq 384(%rsp), %r8 + subq $0xd0, %rsp + movq %rdi, 192(%rsp) + movq %rsi, 200(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 96(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax + movq %rax, 88(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 88(%r8) - adcq $0x00, %rcx - movq %rcx, 496(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -20740,480 +29015,303 @@ _sp_3072_sqr_avx2_24: #else callq _sp_3072_sqr_avx2_12 #endif /* __APPLE__ */ - movq 488(%rsp), %rsi - leaq 192(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi addq $0x60, %rsi + addq $0xc0, %rdi #ifndef __APPLE__ callq sp_3072_sqr_avx2_12@plt #else callq _sp_3072_sqr_avx2_12 #endif /* __APPLE__ */ - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi #ifndef __APPLE__ callq sp_3072_sqr_avx2_12@plt #else callq _sp_3072_sqr_avx2_12 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 488(%rsp), %rsi - movq 480(%rsp), %rdi + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi #endif /* _WIN64 */ - movq 496(%rsp), 
%r10 - movq %rdi, %r9 - leaq 384(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0xc0, %r9 - movq (%r8), %rdx - pextq %r10, %rdx, %rdx - addq %rdx, %rdx - movq 8(%r8), %rax - movq %rdx, (%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 16(%r8), %rdx - movq %rax, 8(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 32(%r8), %rdx - movq %rax, 24(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 48(%r8), %rdx - movq %rax, 40(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 64(%r8), %rdx - movq %rax, 56(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 80(%r8), %rdx - movq %rax, 72(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq %rax, 88(%r9) - adcq $0x00, %rcx - leaq 192(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq %rax, 184(%r8) + movq 192(%rsp), %rsi + leaq 96(%rsp), %r8 + addq $0x120, %rsi + movq $0x00, %rcx + movq -96(%r8), %rax + subq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq 
%rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq %rdx, 88(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq %rax, 184(%r8) + subq $0xc0, %rsi + movq -96(%r8), %rax + subq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 
40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq %rdx, 88(%r8) sbbq $0x00, %rcx - subq $0x60, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - movq %rcx, 288(%rdi) - # Add in place - movq 96(%r9), %rdx - addq (%rsi), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 8(%rsi), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 16(%rsi), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 24(%rsi), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 32(%rsi), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 40(%rsi), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 48(%rsi), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 56(%rsi), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 64(%rsi), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 72(%rsi), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 80(%rsi), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 88(%rsi), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 96(%rsi), %rdx - movq %rdx, 192(%r9) - # Add to zero - movq 104(%rsi), %rdx + movq 192(%rsp), %rdi + negq %rcx + addq $0xc0, %rdi + movq -96(%rdi), %rax + subq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), 
%rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq %rdx, 88(%rdi) + sbbq $0x00, %rcx + movq 192(%rsp), %rdi + addq $0x120, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 112(%rsi), %rax - movq %rdx, 200(%r9) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 120(%rsi), %rdx - movq %rax, 208(%r9) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 128(%rsi), %rax - movq %rdx, 216(%r9) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 136(%rsi), %rdx - movq %rax, 224(%r9) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 144(%rsi), %rax - movq %rdx, 232(%r9) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 152(%rsi), %rdx - movq %rax, 240(%r9) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 160(%rsi), %rax - movq %rdx, 248(%r9) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 168(%rsi), %rdx - movq %rax, 256(%r9) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 176(%rsi), %rax - movq %rdx, 264(%r9) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 184(%rsi), %rdx - movq %rax, 272(%r9) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq %rdx, 280(%r9) - addq $0x1f8, %rsp + movq %rdx, 88(%rdi) + movq 200(%rsp), %rsi + movq 192(%rsp), %rdi + addq $0xd0, %rsp repz retq #ifndef __APPLE__ .size sp_3072_sqr_avx2_24,.-sp_3072_sqr_avx2_24 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_3072_dbl_24 -.type sp_3072_dbl_24,@function -.align 16 -sp_3072_dbl_24: -#else -.section __TEXT,__text -.globl _sp_3072_dbl_24 -.p2align 4 -_sp_3072_dbl_24: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq 96(%rsi), %rdx - movq %rcx, 88(%rdi) - adcq %rdx, %rdx - movq 104(%rsi), %rcx - movq %rdx, 96(%rdi) - adcq %rcx, %rcx - movq 112(%rsi), %rdx - movq %rcx, 104(%rdi) - adcq %rdx, %rdx - movq 120(%rsi), %rcx - movq %rdx, 112(%rdi) - adcq %rcx, %rcx - movq 128(%rsi), %rdx - movq %rcx, 120(%rdi) - adcq %rdx, %rdx - movq 136(%rsi), %rcx - movq %rdx, 128(%rdi) - adcq %rcx, %rcx - movq 144(%rsi), %rdx - movq %rcx, 136(%rdi) - adcq %rdx, %rdx - movq 152(%rsi), %rcx - movq %rdx, 144(%rdi) - adcq %rcx, %rcx - movq 160(%rsi), %rdx - movq %rcx, 152(%rdi) - adcq %rdx, %rdx - movq 168(%rsi), %rcx - movq %rdx, 160(%rdi) - adcq %rcx, %rcx - movq 176(%rsi), %rdx - movq %rcx, 168(%rdi) - adcq %rdx, %rdx - movq 184(%rsi), %rcx - movq %rdx, 176(%rdi) - adcq %rcx, %rcx - movq %rcx, 184(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_3072_dbl_24,.-sp_3072_dbl_24 -#endif /* __APPLE__ */ /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. 
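For readers following the hunks below: the rewritten sp_3072_sqr_48 (and its AVX2 variant) replaces the old add-the-halves / masked-copy / double approach with the three-squaring Karatsuba identity named in the new comment, a^2 = ah^2*B^2 + (ah^2 + al^2 - (al - ah)^2)*B + al^2. The C sketch that follows is illustrative only and is not part of the patch: karatsuba_sqr_64, the 32-bit half split, and GCC/Clang's unsigned __int128 are assumptions chosen to keep the example self-contained and checkable, while the real routines apply the same identity to 24-word (1536-bit) halves via sp_3072_sqr_24 and propagate the middle-term borrow with the final "# Add in word" pass.

    /* Illustrative sketch only -- not the generated assembly.
     * Karatsuba squaring with the (al - ah)^2 form, shown on a toy
     * 64-bit operand split into two 32-bit halves (B = 2^32).
     * Requires a compiler with unsigned __int128 (GCC/Clang). */
    #include <stdint.h>
    #include <stdio.h>

    static unsigned __int128 karatsuba_sqr_64(uint64_t a)
    {
        uint64_t al = (uint32_t)a;   /* low  half */
        uint64_t ah = a >> 32;       /* high half */

        /* |al - ah|: subtract, then negate on borrow,
         * mirroring the "# Cond Negate" block in the new code */
        uint64_t d = (al >= ah) ? (al - ah) : (ah - al);

        uint64_t al2 = al * al;      /* al^2        */
        uint64_t ah2 = ah * ah;      /* ah^2        */
        uint64_t d2  = d  * d;       /* (al - ah)^2 */

        /* middle term: 2*al*ah = al^2 + ah^2 - (al - ah)^2, never negative */
        unsigned __int128 mid = (unsigned __int128)al2 + ah2 - d2;

        /* a^2 = ah^2*B^2 + mid*B + al^2 */
        return ((unsigned __int128)ah2 << 64) + (mid << 32) + al2;
    }

    int main(void)
    {
        uint64_t a = 0x123456789abcdef0ULL;
        /* compare against a plain 64x64 -> 128 squaring */
        printf("match: %d\n", karatsuba_sqr_64(a) == (unsigned __int128)a * a);
        return 0;
    }

Because the cross term is recovered by subtraction, the squared difference is always non-negative once the conditional negate has taken the absolute value, which is what lets the patch drop the separate sp_3072_dbl_24 doubling routine and the masked copy of the summed halves used by the previous layout.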
@@ -21230,87 +29328,207 @@ sp_3072_sqr_48: .p2align 4 _sp_3072_sqr_48: #endif /* __APPLE__ */ - subq $0x3d8, %rsp - movq %rdi, 960(%rsp) - movq %rsi, 968(%rsp) - leaq 768(%rsp), %r8 + subq $0x190, %rsp + movq %rdi, 384(%rsp) + movq %rsi, 392(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 192(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax movq 128(%rsi), %rdx movq %rax, 120(%r8) - adcq 128(%r9), %rdx + sbbq 128(%r9), %rdx movq 136(%rsi), %rax movq %rdx, 128(%r8) - adcq 136(%r9), %rax + sbbq 136(%r9), %rax movq 144(%rsi), %rdx movq %rax, 136(%r8) - adcq 144(%r9), %rdx + sbbq 144(%r9), %rdx movq 152(%rsi), %rax movq %rdx, 144(%r8) - adcq 152(%r9), %rax + sbbq 152(%r9), %rax movq 160(%rsi), %rdx movq %rax, 152(%r8) - adcq 160(%r9), %rdx + sbbq 160(%r9), %rdx movq 168(%rsi), %rax movq %rdx, 160(%r8) - adcq 168(%r9), %rax + sbbq 168(%r9), %rax movq 176(%rsi), %rdx movq %rax, 168(%r8) - adcq 176(%r9), %rdx + sbbq 176(%r9), %rdx movq 184(%rsi), %rax movq %rdx, 176(%r8) - adcq 184(%r9), %rax + sbbq 184(%r9), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + 
xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 128(%r8), %rdx + setc %r9b + movq %rax, 120(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 136(%r8), %rax + setc %r9b + movq %rdx, 128(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 144(%r8), %rdx + setc %r9b + movq %rax, 136(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 152(%r8), %rax + setc %r9b + movq %rdx, 144(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 160(%r8), %rdx + setc %r9b + movq %rax, 152(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 168(%r8), %rax + setc %r9b + movq %rdx, 160(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 176(%r8), %rdx + setc %r9b + movq %rax, 168(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 184(%r8), %rax + setc %r9b + movq %rdx, 176(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 184(%r8) - adcq $0x00, %rcx - movq %rcx, 976(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -21318,769 +29536,555 @@ _sp_3072_sqr_48: #else callq _sp_3072_sqr_24 #endif /* __APPLE__ */ - movq 968(%rsp), %rsi - leaq 384(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi addq $0xc0, %rsi + addq $0x180, %rdi #ifndef __APPLE__ callq sp_3072_sqr_24@plt #else callq _sp_3072_sqr_24 #endif /* __APPLE__ */ - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi #ifndef __APPLE__ callq sp_3072_sqr_24@plt #else callq _sp_3072_sqr_24 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi #endif /* _WIN64 */ - movq 976(%rsp), %r10 - movq %rdi, %r9 - leaq 768(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0x180, %r9 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, (%r9) - movq %rax, 8(%r9) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 16(%r9) - movq %rax, 24(%r9) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 32(%r9) - movq %rax, 40(%r9) - movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 48(%r9) - movq %rax, 56(%r9) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 64(%r9) - movq %rax, 72(%r9) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 80(%r9) - movq %rax, 88(%r9) - movq 96(%r8), %rdx - movq 104(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 96(%r9) - movq %rax, 104(%r9) - movq 112(%r8), %rdx - movq 120(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 112(%r9) - movq %rax, 120(%r9) - movq 128(%r8), %rdx - movq 136(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 128(%r9) - movq %rax, 136(%r9) - movq 144(%r8), %rdx - movq 152(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 144(%r9) - movq %rax, 152(%r9) - movq 160(%r8), %rdx - movq 168(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 160(%r9) - movq %rax, 168(%r9) - movq 176(%r8), %rdx - movq 184(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 176(%r9) - movq %rax, 184(%r9) - movq (%r9), %rdx - addq %rdx, %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq %rax, %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq %rdx, %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq %rax, %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq %rdx, %rdx - movq 40(%r9), %rax - movq 
%rdx, 32(%r9) - adcq %rax, %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq %rdx, %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq %rax, %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq %rdx, %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq %rax, %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq %rdx, %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq %rax, %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq %rdx, %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq %rax, %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq %rdx, %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq %rax, %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq %rdx, %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq %rax, %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq %rdx, %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq %rax, %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq %rdx, %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq %rax, %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq %rdx, %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq %rax, %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - leaq 384(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), 
%rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq %rax, 376(%r8) + movq 384(%rsp), %rsi + leaq 192(%rsp), %r8 + addq $0x240, %rsi + movq $0x00, %rcx + movq -192(%r8), %rax + subq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), 
%rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq %rdx, 184(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq 
%rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq %rax, 376(%r8) + subq $0x180, %rsi + movq -192(%r8), %rax + subq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq %rdx, 184(%r8) sbbq $0x00, %rcx - subq $0xc0, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 
8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 192(%r8), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 200(%r8), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 208(%r8), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 216(%r8), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 224(%r8), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 232(%r8), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 240(%r8), %rdx - movq 248(%r9), %rax - movq %rdx, 240(%r9) - adcq 248(%r8), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 256(%r8), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 264(%r8), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 272(%r8), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 280(%r8), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 288(%r8), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 296(%r8), %rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 304(%r8), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 312(%r8), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 320(%r8), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 328(%r8), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 336(%r8), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 344(%r8), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 352(%r8), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 360(%r8), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 368(%r8), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 376(%r8), %rax - movq %rax, 376(%r9) - adcq $0x00, %rcx - movq %rcx, 576(%rdi) - # Add in place - movq 192(%r9), %rdx - addq (%rsi), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 8(%rsi), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 16(%rsi), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 24(%rsi), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 32(%rsi), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 40(%rsi), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 48(%rsi), %rdx - movq 248(%r9), %rax 
- movq %rdx, 240(%r9) - adcq 56(%rsi), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 64(%rsi), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 72(%rsi), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 80(%rsi), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 88(%rsi), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 96(%rsi), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 104(%rsi), %rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 112(%rsi), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 120(%rsi), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 128(%rsi), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 136(%rsi), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 144(%rsi), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 152(%rsi), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 160(%rsi), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 168(%rsi), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 176(%rsi), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 184(%rsi), %rax - movq 384(%r9), %rdx - movq %rax, 376(%r9) - adcq 192(%rsi), %rdx - movq %rdx, 384(%r9) - # Add to zero - movq 200(%rsi), %rdx + movq 384(%rsp), %rdi + negq %rcx + addq $0x180, %rdi + movq -192(%rdi), %rax + subq -192(%r8), %rax + movq -184(%rdi), %rdx + movq %rax, -192(%rdi) + sbbq -184(%r8), %rdx + movq -176(%rdi), %rax + movq %rdx, -184(%rdi) + sbbq -176(%r8), %rax + movq -168(%rdi), %rdx + movq %rax, -176(%rdi) + sbbq -168(%r8), %rdx + movq -160(%rdi), %rax + movq %rdx, -168(%rdi) + sbbq -160(%r8), %rax + movq -152(%rdi), %rdx + movq %rax, -160(%rdi) + sbbq -152(%r8), %rdx + movq -144(%rdi), %rax + movq %rdx, -152(%rdi) + sbbq -144(%r8), %rax + movq -136(%rdi), %rdx + movq %rax, -144(%rdi) + sbbq -136(%r8), %rdx + movq -128(%rdi), %rax + movq %rdx, -136(%rdi) + sbbq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 
64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) + sbbq 128(%r8), %rax + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) + sbbq 136(%r8), %rdx + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) + sbbq 144(%r8), %rax + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) + sbbq 152(%r8), %rdx + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) + sbbq 160(%r8), %rax + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) + sbbq 168(%r8), %rdx + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) + sbbq 176(%r8), %rax + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) + sbbq 184(%r8), %rdx + movq %rdx, 184(%rdi) + sbbq $0x00, %rcx + movq 384(%rsp), %rdi + addq $0x240, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 208(%rsi), %rax - movq %rdx, 392(%r9) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 216(%rsi), %rdx - movq %rax, 400(%r9) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 224(%rsi), %rax - movq %rdx, 408(%r9) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 232(%rsi), %rdx - movq %rax, 416(%r9) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 240(%rsi), %rax - movq %rdx, 424(%r9) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 248(%rsi), %rdx - movq %rax, 432(%r9) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 256(%rsi), %rax - movq %rdx, 440(%r9) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 264(%rsi), %rdx - movq %rax, 448(%r9) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 456(%r9) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 464(%r9) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 472(%r9) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 480(%r9) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 488(%r9) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 496(%r9) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 504(%r9) + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 512(%r9) + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 520(%r9) + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 528(%r9) + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 536(%r9) + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 544(%r9) + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 552(%r9) + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 560(%r9) + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) adcq $0x00, %rdx - movq %rdx, 568(%r9) - addq 
$0x3d8, %rsp + movq %rdx, 184(%rdi) + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi + addq $0x190, %rsp repz retq #ifndef __APPLE__ .size sp_3072_sqr_48,.-sp_3072_sqr_48 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. @@ -22097,87 +30101,207 @@ sp_3072_sqr_avx2_48: .p2align 4 _sp_3072_sqr_avx2_48: #endif /* __APPLE__ */ - subq $0x3d8, %rsp - movq %rdi, 960(%rsp) - movq %rsi, 968(%rsp) - leaq 768(%rsp), %r8 + subq $0x190, %rsp + movq %rdi, 384(%rsp) + movq %rsi, 392(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 192(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax movq 128(%rsi), %rdx movq %rax, 120(%r8) - adcq 128(%r9), %rdx + sbbq 128(%r9), %rdx movq 136(%rsi), %rax movq %rdx, 128(%r8) - adcq 136(%r9), %rax + sbbq 136(%r9), %rax movq 144(%rsi), %rdx movq %rax, 136(%r8) - adcq 144(%r9), %rdx + sbbq 144(%r9), %rdx movq 152(%rsi), %rax movq %rdx, 144(%r8) - adcq 152(%r9), %rax + sbbq 152(%r9), %rax movq 160(%rsi), %rdx movq %rax, 152(%r8) - adcq 160(%r9), %rdx + sbbq 160(%r9), %rdx movq 168(%rsi), %rax movq %rdx, 160(%r8) - adcq 168(%r9), %rax + sbbq 168(%r9), %rax movq 176(%rsi), %rdx movq %rax, 168(%r8) - adcq 176(%r9), %rdx + sbbq 176(%r9), %rdx movq 184(%rsi), %rax movq %rdx, 176(%r8) - adcq 184(%r9), %rax + sbbq 184(%r9), %rax + movq %rax, 184(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + 
movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 128(%r8), %rdx + setc %r9b + movq %rax, 120(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 136(%r8), %rax + setc %r9b + movq %rdx, 128(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 144(%r8), %rdx + setc %r9b + movq %rax, 136(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 152(%r8), %rax + setc %r9b + movq %rdx, 144(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 160(%r8), %rdx + setc %r9b + movq %rax, 152(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 168(%r8), %rax + setc %r9b + movq %rdx, 160(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 176(%r8), %rdx + setc %r9b + movq %rax, 168(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 184(%r8), %rax + setc %r9b + movq %rdx, 176(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 184(%r8) - adcq $0x00, %rcx - movq %rcx, 976(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -22185,715 +30309,547 @@ _sp_3072_sqr_avx2_48: #else callq _sp_3072_sqr_avx2_24 #endif /* __APPLE__ */ - movq 968(%rsp), %rsi - leaq 384(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi addq $0xc0, %rsi + addq $0x180, %rdi #ifndef __APPLE__ callq sp_3072_sqr_avx2_24@plt #else callq _sp_3072_sqr_avx2_24 #endif /* __APPLE__ */ - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi #ifndef __APPLE__ callq sp_3072_sqr_avx2_24@plt #else callq _sp_3072_sqr_avx2_24 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 968(%rsp), %rsi - movq 960(%rsp), %rdi + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi #endif /* _WIN64 */ - movq 976(%rsp), %r10 - movq %rdi, %r9 - leaq 768(%rsp), %r8 - movq %r10, %rcx - negq %r10 - addq $0x180, %r9 - movq (%r8), %rdx - pextq %r10, %rdx, %rdx - addq %rdx, %rdx - movq 8(%r8), %rax - movq %rdx, (%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 16(%r8), %rdx - movq %rax, 8(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 32(%r8), %rdx - movq %rax, 24(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 48(%r8), %rdx - movq %rax, 40(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 64(%r8), %rdx - movq %rax, 56(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 80(%r8), %rdx - movq %rax, 72(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 96(%r8), %rdx - movq %rax, 88(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 112(%r8), %rdx - movq %rax, 104(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 
128(%r8), %rdx - movq %rax, 120(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 144(%r8), %rdx - movq %rax, 136(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 160(%r8), %rdx - movq %rax, 152(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 176(%r8), %rdx - movq %rax, 168(%r9) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r9) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq %rax, 184(%r9) - adcq $0x00, %rcx - leaq 384(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 
328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq %rax, 376(%r8) + movq 384(%rsp), %rsi + leaq 192(%rsp), %r8 + addq $0x240, %rsi + movq $0x00, %rcx + movq -192(%r8), %rax + subq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 
168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq %rdx, 184(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq %rax, 376(%r8) + subq $0x180, %rsi + movq -192(%r8), %rax + subq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, 
-176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq %rdx, 184(%r8) sbbq $0x00, %rcx - subq $0xc0, %r9 - # Add in place - movq (%r9), %rdx - addq (%r8), %rdx - movq 8(%r9), %rax - movq %rdx, (%r9) - adcq 8(%r8), %rax - movq 16(%r9), %rdx - movq %rax, 8(%r9) - adcq 16(%r8), %rdx - movq 24(%r9), %rax - movq %rdx, 16(%r9) - adcq 24(%r8), %rax - movq 32(%r9), %rdx - movq %rax, 24(%r9) - adcq 32(%r8), %rdx - movq 40(%r9), %rax - movq %rdx, 32(%r9) - adcq 40(%r8), %rax - movq 48(%r9), %rdx - movq %rax, 40(%r9) - adcq 48(%r8), %rdx - movq 56(%r9), %rax - movq %rdx, 48(%r9) - adcq 56(%r8), %rax - movq 64(%r9), %rdx - movq %rax, 56(%r9) - adcq 
64(%r8), %rdx - movq 72(%r9), %rax - movq %rdx, 64(%r9) - adcq 72(%r8), %rax - movq 80(%r9), %rdx - movq %rax, 72(%r9) - adcq 80(%r8), %rdx - movq 88(%r9), %rax - movq %rdx, 80(%r9) - adcq 88(%r8), %rax - movq 96(%r9), %rdx - movq %rax, 88(%r9) - adcq 96(%r8), %rdx - movq 104(%r9), %rax - movq %rdx, 96(%r9) - adcq 104(%r8), %rax - movq 112(%r9), %rdx - movq %rax, 104(%r9) - adcq 112(%r8), %rdx - movq 120(%r9), %rax - movq %rdx, 112(%r9) - adcq 120(%r8), %rax - movq 128(%r9), %rdx - movq %rax, 120(%r9) - adcq 128(%r8), %rdx - movq 136(%r9), %rax - movq %rdx, 128(%r9) - adcq 136(%r8), %rax - movq 144(%r9), %rdx - movq %rax, 136(%r9) - adcq 144(%r8), %rdx - movq 152(%r9), %rax - movq %rdx, 144(%r9) - adcq 152(%r8), %rax - movq 160(%r9), %rdx - movq %rax, 152(%r9) - adcq 160(%r8), %rdx - movq 168(%r9), %rax - movq %rdx, 160(%r9) - adcq 168(%r8), %rax - movq 176(%r9), %rdx - movq %rax, 168(%r9) - adcq 176(%r8), %rdx - movq 184(%r9), %rax - movq %rdx, 176(%r9) - adcq 184(%r8), %rax - movq 192(%r9), %rdx - movq %rax, 184(%r9) - adcq 192(%r8), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 200(%r8), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 208(%r8), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 216(%r8), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 224(%r8), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 232(%r8), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 240(%r8), %rdx - movq 248(%r9), %rax - movq %rdx, 240(%r9) - adcq 248(%r8), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 256(%r8), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 264(%r8), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 272(%r8), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 280(%r8), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 288(%r8), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 296(%r8), %rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 304(%r8), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 312(%r8), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 320(%r8), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 328(%r8), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 336(%r8), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 344(%r8), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 352(%r8), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 360(%r8), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 368(%r8), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 376(%r8), %rax - movq %rax, 376(%r9) - adcq $0x00, %rcx - movq %rcx, 576(%rdi) - # Add in place - movq 192(%r9), %rdx - addq (%rsi), %rdx - movq 200(%r9), %rax - movq %rdx, 192(%r9) - adcq 8(%rsi), %rax - movq 208(%r9), %rdx - movq %rax, 200(%r9) - adcq 16(%rsi), %rdx - movq 216(%r9), %rax - movq %rdx, 208(%r9) - adcq 24(%rsi), %rax - movq 224(%r9), %rdx - movq %rax, 216(%r9) - adcq 32(%rsi), %rdx - movq 232(%r9), %rax - movq %rdx, 224(%r9) - adcq 40(%rsi), %rax - movq 240(%r9), %rdx - movq %rax, 232(%r9) - adcq 48(%rsi), %rdx - movq 248(%r9), %rax - movq %rdx, 240(%r9) - adcq 56(%rsi), %rax - movq 256(%r9), %rdx - movq %rax, 248(%r9) - adcq 64(%rsi), %rdx - movq 264(%r9), %rax - movq %rdx, 256(%r9) - adcq 72(%rsi), %rax - movq 272(%r9), %rdx - movq %rax, 264(%r9) - adcq 80(%rsi), %rdx - movq 280(%r9), %rax - movq %rdx, 272(%r9) - adcq 88(%rsi), %rax - movq 288(%r9), %rdx - movq %rax, 280(%r9) - adcq 96(%rsi), %rdx - movq 296(%r9), %rax - movq %rdx, 288(%r9) - adcq 104(%rsi), 
%rax - movq 304(%r9), %rdx - movq %rax, 296(%r9) - adcq 112(%rsi), %rdx - movq 312(%r9), %rax - movq %rdx, 304(%r9) - adcq 120(%rsi), %rax - movq 320(%r9), %rdx - movq %rax, 312(%r9) - adcq 128(%rsi), %rdx - movq 328(%r9), %rax - movq %rdx, 320(%r9) - adcq 136(%rsi), %rax - movq 336(%r9), %rdx - movq %rax, 328(%r9) - adcq 144(%rsi), %rdx - movq 344(%r9), %rax - movq %rdx, 336(%r9) - adcq 152(%rsi), %rax - movq 352(%r9), %rdx - movq %rax, 344(%r9) - adcq 160(%rsi), %rdx - movq 360(%r9), %rax - movq %rdx, 352(%r9) - adcq 168(%rsi), %rax - movq 368(%r9), %rdx - movq %rax, 360(%r9) - adcq 176(%rsi), %rdx - movq 376(%r9), %rax - movq %rdx, 368(%r9) - adcq 184(%rsi), %rax - movq 384(%r9), %rdx - movq %rax, 376(%r9) - adcq 192(%rsi), %rdx - movq %rdx, 384(%r9) - # Add to zero - movq 200(%rsi), %rdx + movq 384(%rsp), %rdi + negq %rcx + addq $0x180, %rdi + movq -192(%rdi), %rax + subq -192(%r8), %rax + movq -184(%rdi), %rdx + movq %rax, -192(%rdi) + sbbq -184(%r8), %rdx + movq -176(%rdi), %rax + movq %rdx, -184(%rdi) + sbbq -176(%r8), %rax + movq -168(%rdi), %rdx + movq %rax, -176(%rdi) + sbbq -168(%r8), %rdx + movq -160(%rdi), %rax + movq %rdx, -168(%rdi) + sbbq -160(%r8), %rax + movq -152(%rdi), %rdx + movq %rax, -160(%rdi) + sbbq -152(%r8), %rdx + movq -144(%rdi), %rax + movq %rdx, -152(%rdi) + sbbq -144(%r8), %rax + movq -136(%rdi), %rdx + movq %rax, -144(%rdi) + sbbq -136(%r8), %rdx + movq -128(%rdi), %rax + movq %rdx, -136(%rdi) + sbbq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq 
128(%rdi), %rax + movq %rdx, 120(%rdi) + sbbq 128(%r8), %rax + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) + sbbq 136(%r8), %rdx + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) + sbbq 144(%r8), %rax + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) + sbbq 152(%r8), %rdx + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) + sbbq 160(%r8), %rax + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) + sbbq 168(%r8), %rdx + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) + sbbq 176(%r8), %rax + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) + sbbq 184(%r8), %rdx + movq %rdx, 184(%rdi) + sbbq $0x00, %rcx + movq 384(%rsp), %rdi + addq $0x240, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 208(%rsi), %rax - movq %rdx, 392(%r9) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 216(%rsi), %rdx - movq %rax, 400(%r9) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 224(%rsi), %rax - movq %rdx, 408(%r9) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 232(%rsi), %rdx - movq %rax, 416(%r9) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 240(%rsi), %rax - movq %rdx, 424(%r9) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 248(%rsi), %rdx - movq %rax, 432(%r9) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 256(%rsi), %rax - movq %rdx, 440(%r9) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 264(%rsi), %rdx - movq %rax, 448(%r9) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 456(%r9) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 464(%r9) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 472(%r9) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 480(%r9) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 488(%r9) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 496(%r9) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 504(%r9) + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 512(%r9) + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 520(%r9) + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 528(%r9) + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 536(%r9) + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 544(%r9) + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 552(%r9) + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 560(%r9) + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) adcq $0x00, %rdx - movq %rdx, 568(%r9) - addq $0x3d8, %rsp + movq %rdx, 184(%rdi) + movq 392(%rsp), %rsi + movq 384(%rsp), %rdi + addq $0x190, %rsp repz retq #ifndef __APPLE__ .size sp_3072_sqr_avx2_48,.-sp_3072_sqr_avx2_48 @@ -23325,7 +31281,6 @@ sp_3072_cond_sub_24: _sp_3072_cond_sub_24: #endif /* __APPLE__ */ subq $0xc0, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -23494,7 +31449,7 @@ _sp_3072_cond_sub_24: sbbq %rdx, %r9 movq %r8, 176(%rdi) movq 
%r9, 184(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0xc0, %rsp repz retq #ifndef __APPLE__ @@ -23819,7 +31774,6 @@ sp_3072_cond_sub_avx2_24: .p2align 4 _sp_3072_cond_sub_avx2_24: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -23940,7 +31894,7 @@ _sp_3072_cond_sub_avx2_24: movq %r9, 176(%rdi) sbbq %r8, %r10 movq %r10, 184(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_3072_cond_sub_avx2_24,.-sp_3072_cond_sub_avx2_24 @@ -24578,6 +32532,1761 @@ _sp_3072_cmp_24: #ifndef __APPLE__ .size sp_3072_cmp_24,.-sp_3072_cmp_24 #endif /* __APPLE__ */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_3072_get_from_table_24 +.type sp_3072_get_from_table_24,@function +.align 16 +sp_3072_get_from_table_24: +#else +.section __TEXT,__text +.globl _sp_3072_get_from_table_24 +.p2align 4 +_sp_3072_get_from_table_24: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + pxor %xmm13, %xmm13 + pshufd $0x00, %xmm11, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + # START: 0-7 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 
48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + movdqu %xmm13, %xmm12 
+ pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por 
%xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 0-7 + # START: 8-15 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 
+ por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, 
%xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, 
%xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 
+ movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 8-15 + # START: 16-23 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 
+ # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd 
%xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 
32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + # END: 16-23 + repz retq +#ifndef __APPLE__ +.size sp_3072_get_from_table_24,.-sp_3072_get_from_table_24 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 3072 bits using Montgomery reduction. 
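The sp_3072_get_from_table_24 routine added above performs a cache-resistant table lookup: instead of indexing the table directly with the (secret) window value, it walks every one of the 32 entries, compares a running counter against the wanted index with pcmpeqd to build an all-ones/all-zeros mask, ANDs the entry with that mask and ORs the result into the output, so the sequence of memory accesses is identical for every index. The following minimal C sketch shows the same idea; the function name, limb count and table layout here are assumptions for illustration only and do not match the generated assembly.

#include <stdint.h>
#include <stddef.h>

/* Constant-time select of table[idx] into r (each entry has 'words' 64-bit
 * limbs).  Every entry is read and masked, so the memory access pattern is
 * independent of idx.  Illustrative sketch only: the generated
 * sp_3072_get_from_table_24 unrolls this with SSE2 registers over a fixed
 * 32-entry table of 24 limbs each. */
static void get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                              size_t entries, size_t words, uint64_t idx)
{
    size_t i, j;

    for (j = 0; j < words; j++)
        r[j] = 0;
    for (i = 0; i < entries; i++) {
        /* eq = 1 when i == idx, 0 otherwise, computed without a branch */
        uint64_t d    = (uint64_t)i ^ idx;
        uint64_t eq   = 1 ^ ((d | (0ULL - d)) >> 63);
        uint64_t mask = 0ULL - eq;          /* all ones or all zeros */

        for (j = 0; j < words; j++)
            r[j] |= table[i][j] & mask;
    }
}

The same mask-and-accumulate pattern, vectorised with pcmpeqd/pand/por here and with vpcmpeqd/vpand/vpor in the sp_3072_get_from_table_avx2_24 variant that follows, is what the generated code unrolls across all 32 entries.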
* @@ -24903,6 +34612,913 @@ L_3072_mont_reduce_avx2_24_loop: .size sp_3072_mont_reduce_avx2_24,.-sp_3072_mont_reduce_avx2_24 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_3072_get_from_table_avx2_24 +.type sp_3072_get_from_table_avx2_24,@function +.align 16 +sp_3072_get_from_table_avx2_24: +#else +.section __TEXT,__text +.globl _sp_3072_get_from_table_avx2_24 +.p2align 4 +_sp_3072_get_from_table_avx2_24: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + vpxor %ymm13, %ymm13, %ymm13 + vpermd %ymm10, %ymm13, %ymm10 + vpermd %ymm11, %ymm13, %ymm11 + # START: 0-15 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + 
vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, 
%ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, 
%ymm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), 
%ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 0-15 + # START: 16-23 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, 
%ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 16 + movq 128(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 17 + movq 136(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 18 + movq 144(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 19 + movq 152(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 20 + movq 160(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 21 + movq 168(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 22 + movq 176(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 23 + movq 184(%rsi), %rcx + addq 
$0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 24 + movq 192(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 25 + movq 200(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 26 + movq 208(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 27 + movq 216(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 28 + movq 224(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 29 + movq 232(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 30 + movq 240(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 31 + movq 248(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + # END: 16-23 + repz retq +#ifndef __APPLE__ +.size sp_3072_get_from_table_avx2_24,.-sp_3072_get_from_table_avx2_24 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. 
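
The sp_3072_get_from_table_avx2_24 routine that ends above, and the sp_3072_get_from_table_48 / sp_3072_get_from_table_avx2_48 routines added later in this file, all follow the same cache-resistant pattern: the table argument is an array of pointers to precomputed values, and every entry is loaded and AND-ed with a 0/-1 mask built by comparing a running counter against the requested index, so the sequence of memory accesses never depends on the (secret) index. A minimal portable sketch of that idea, with a hypothetical helper name and illustrative entries/digits parameters (the generated assembly hard-codes these and works in SSE2/AVX2 registers instead):

static void sp_get_from_table_ct(sp_digit* r, const sp_digit** table,
                                 int idx, int entries, int digits)
{
    int i;
    int j;

    for (j = 0; j < digits; j++) {
        r[j] = 0;
    }
    for (i = 0; i < entries; i++) {
        /* all-ones for the wanted entry, zero for every other entry */
        sp_digit mask = (sp_digit)0 - (sp_digit)(i == idx);

        for (j = 0; j < digits; j++) {
            r[j] |= table[i][j] & mask;
        }
    }
}

In the assembly, the broadcast of the index (movd plus pshufd, or vpermd), the vector compare (pcmpeqd / vpcmpeqd) and the mask-and-accumulate (pand+por / vpand+vpor) play the roles of the i == idx test and the masked OR above.
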
* @@ -24924,7 +35540,6 @@ sp_3072_cond_sub_48: _sp_3072_cond_sub_48: #endif /* __APPLE__ */ subq $0x180, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -25261,7 +35876,7 @@ _sp_3072_cond_sub_48: sbbq %rdx, %r9 movq %r8, 368(%rdi) movq %r9, 376(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x180, %rsp repz retq #ifndef __APPLE__ @@ -25824,7 +36439,6 @@ sp_3072_sub_48: _sp_3072_sub_48: #endif /* __APPLE__ */ movq (%rsi), %rcx - xorq %rax, %rax subq (%rdx), %rcx movq 8(%rsi), %r8 movq %rcx, (%rdi) @@ -25968,7 +36582,7 @@ _sp_3072_sub_48: movq %rcx, 368(%rdi) sbbq 376(%rdx), %r8 movq %r8, 376(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_3072_sub_48,.-sp_3072_sub_48 @@ -26337,7 +36951,6 @@ sp_3072_cond_sub_avx2_48: .p2align 4 _sp_3072_cond_sub_avx2_48: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -26578,7 +37191,7 @@ _sp_3072_cond_sub_avx2_48: movq %r9, 368(%rdi) sbbq %r8, %r10 movq %r10, 376(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_3072_cond_sub_avx2_48,.-sp_3072_cond_sub_avx2_48 @@ -26996,6 +37609,1813 @@ _sp_3072_cmp_48: #ifndef __APPLE__ .size sp_3072_cmp_48,.-sp_3072_cmp_48 #endif /* __APPLE__ */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_3072_get_from_table_48 +.type sp_3072_get_from_table_48,@function +.align 16 +sp_3072_get_from_table_48: +#else +.section __TEXT,__text +.globl _sp_3072_get_from_table_48 +.p2align 4 +_sp_3072_get_from_table_48: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + pxor %xmm13, %xmm13 + pshufd $0x00, %xmm11, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + # START: 0-7 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand 
%xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 
32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 0-7 + # START: 8-15 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), 
%rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + 
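
Aside from the new table-lookup routines, the sp_3072_cond_sub_48, sp_3072_sub_48 and sp_3072_cond_sub_avx2_48 hunks earlier in this file drop the up-front zeroing of %rax and replace the final "sbbq $0x00, %rax" with "sbbq %rax, %rax". The two forms are equivalent: a register sbb'd from itself evaluates to -CF, so the return value is still 0 when no borrow occurred and all-ones when it did, just without needing %rax initialised first. A C sketch of the borrow-mask return convention these routines provide (hypothetical helper, fixed-width types rather than sp_digit):

#include <stdint.h>

/* Subtract b from a (n 64-bit words); return 0 on no borrow and ~0 on
 * borrow, mirroring what "sbbq %rax, %rax" leaves in the return register. */
static uint64_t sub_n_borrow_mask(uint64_t* r, const uint64_t* a,
                                  const uint64_t* b, int n)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t d  = a[i] - b[i];
        uint64_t b1 = (uint64_t)(d > a[i]);   /* borrow out of this word   */
        r[i]   = d - borrow;
        b1    |= (uint64_t)(r[i] > d);        /* borrow from the carry-in  */
        borrow = b1;
    }
    return (uint64_t)0 - borrow;              /* 0 or 0xffffffffffffffff   */
}
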
# END: 8-15 + # START: 16-23 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, 
%xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 16-23 + # START: 24-31 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), 
%xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por 
%xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 24-31 + # START: 32-39 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + 
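
The sp_3072_get_from_table_48 body being built up here produces the 48-digit (384-byte) result in six 64-byte passes ("START: 0-7" through "START: 40-47"), scanning all 16 table entries in each pass with pcmpeqd/pand/por and then advancing the output pointer by 0x40. One such pass maps naturally onto SSE2 intrinsics; a sketch under those assumptions (the helper name, chunk_off parameter and generic entries count are illustrative, not part of the generated code):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Constant-time select of one 64-byte chunk out of 'entries' table entries,
 * mirroring the pcmpeqd/pand/por sequence in the unrolled assembly. */
static void get_chunk_sse2(uint8_t* r, const uint8_t* const* table,
                           int idx, int entries, size_t chunk_off)
{
    __m128i wanted = _mm_set1_epi32(idx);      /* pshufd $0x00 broadcast   */
    __m128i cur    = _mm_setzero_si128();      /* running entry counter    */
    __m128i one    = _mm_set1_epi32(1);
    __m128i acc0   = _mm_setzero_si128();
    __m128i acc1   = _mm_setzero_si128();
    __m128i acc2   = _mm_setzero_si128();
    __m128i acc3   = _mm_setzero_si128();
    int i;

    for (i = 0; i < entries; i++) {
        const __m128i* p = (const __m128i*)(table[i] + chunk_off);
        __m128i mask = _mm_cmpeq_epi32(cur, wanted);               /* pcmpeqd */

        acc0 = _mm_or_si128(acc0, _mm_and_si128(mask, _mm_loadu_si128(p + 0)));
        acc1 = _mm_or_si128(acc1, _mm_and_si128(mask, _mm_loadu_si128(p + 1)));
        acc2 = _mm_or_si128(acc2, _mm_and_si128(mask, _mm_loadu_si128(p + 2)));
        acc3 = _mm_or_si128(acc3, _mm_and_si128(mask, _mm_loadu_si128(p + 3)));
        cur  = _mm_add_epi32(cur, one);                            /* paddd   */
    }
    _mm_storeu_si128((__m128i*)r + 0, acc0);
    _mm_storeu_si128((__m128i*)r + 1, acc1);
    _mm_storeu_si128((__m128i*)r + 2, acc2);
    _mm_storeu_si128((__m128i*)r + 3, acc3);
}

A caller would invoke this six times with chunk_off = 0, 64, ..., 320, writing to r, r + 64, and so on, which is the structure the fully unrolled assembly encodes.
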
movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand 
%xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 32-39 + # START: 40-47 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq 
$0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), 
%xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + # END: 40-47 + repz retq +#ifndef __APPLE__ +.size sp_3072_get_from_table_48,.-sp_3072_get_from_table_48 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 3072 bits using Montgomery reduction. * @@ -27585,6 +40005,865 @@ L_3072_mont_reduce_avx2_48_loop: .size sp_3072_mont_reduce_avx2_48,.-sp_3072_mont_reduce_avx2_48 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_3072_get_from_table_avx2_48 +.type sp_3072_get_from_table_avx2_48,@function +.align 16 +sp_3072_get_from_table_avx2_48: +#else +.section __TEXT,__text +.globl _sp_3072_get_from_table_avx2_48 +.p2align 4 +_sp_3072_get_from_table_avx2_48: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + vpxor %ymm13, %ymm13, %ymm13 + vpermd %ymm10, %ymm13, %ymm10 + vpermd %ymm11, %ymm13, %ymm11 + # START: 0-15 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 
32(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 
96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 0-15 + # START: 16-31 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + 
vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, 
%ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 16-31 + # START: 32-47 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), 
%rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # 
ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + # END: 32-47 + repz retq +#ifndef __APPLE__ +.size sp_3072_get_from_table_avx2_48,.-sp_3072_get_from_table_avx2_48 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -28669,7 +41948,6 @@ sp_4096_sub_in_place_64: _sp_4096_sub_in_place_64: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -28861,7 +42139,7 @@ _sp_4096_sub_in_place_64: movq %rdx, 496(%rdi) sbbq 504(%rsi), %rcx movq %rcx, 504(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_4096_sub_in_place_64,.-sp_4096_sub_in_place_64 @@ -31689,126 +44967,9 @@ _sp_4096_mul_avx2_64: .size sp_4096_mul_avx2_64,.-sp_4096_mul_avx2_64 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ -/* Add a to a into r. (r = a + a) - * - * r A single precision integer. - * a A single precision integer. 
- */ -#ifndef __APPLE__ -.text -.globl sp_2048_dbl_32 -.type sp_2048_dbl_32,@function -.align 16 -sp_2048_dbl_32: -#else -.section __TEXT,__text -.globl _sp_2048_dbl_32 -.p2align 4 -_sp_2048_dbl_32: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx - movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx - movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq 48(%rsi), %rdx - movq %rcx, 40(%rdi) - adcq %rdx, %rdx - movq 56(%rsi), %rcx - movq %rdx, 48(%rdi) - adcq %rcx, %rcx - movq 64(%rsi), %rdx - movq %rcx, 56(%rdi) - adcq %rdx, %rdx - movq 72(%rsi), %rcx - movq %rdx, 64(%rdi) - adcq %rcx, %rcx - movq 80(%rsi), %rdx - movq %rcx, 72(%rdi) - adcq %rdx, %rdx - movq 88(%rsi), %rcx - movq %rdx, 80(%rdi) - adcq %rcx, %rcx - movq 96(%rsi), %rdx - movq %rcx, 88(%rdi) - adcq %rdx, %rdx - movq 104(%rsi), %rcx - movq %rdx, 96(%rdi) - adcq %rcx, %rcx - movq 112(%rsi), %rdx - movq %rcx, 104(%rdi) - adcq %rdx, %rdx - movq 120(%rsi), %rcx - movq %rdx, 112(%rdi) - adcq %rcx, %rcx - movq 128(%rsi), %rdx - movq %rcx, 120(%rdi) - adcq %rdx, %rdx - movq 136(%rsi), %rcx - movq %rdx, 128(%rdi) - adcq %rcx, %rcx - movq 144(%rsi), %rdx - movq %rcx, 136(%rdi) - adcq %rdx, %rdx - movq 152(%rsi), %rcx - movq %rdx, 144(%rdi) - adcq %rcx, %rcx - movq 160(%rsi), %rdx - movq %rcx, 152(%rdi) - adcq %rdx, %rdx - movq 168(%rsi), %rcx - movq %rdx, 160(%rdi) - adcq %rcx, %rcx - movq 176(%rsi), %rdx - movq %rcx, 168(%rdi) - adcq %rdx, %rdx - movq 184(%rsi), %rcx - movq %rdx, 176(%rdi) - adcq %rcx, %rcx - movq 192(%rsi), %rdx - movq %rcx, 184(%rdi) - adcq %rdx, %rdx - movq 200(%rsi), %rcx - movq %rdx, 192(%rdi) - adcq %rcx, %rcx - movq 208(%rsi), %rdx - movq %rcx, 200(%rdi) - adcq %rdx, %rdx - movq 216(%rsi), %rcx - movq %rdx, 208(%rdi) - adcq %rcx, %rcx - movq 224(%rsi), %rdx - movq %rcx, 216(%rdi) - adcq %rdx, %rdx - movq 232(%rsi), %rcx - movq %rdx, 224(%rdi) - adcq %rcx, %rcx - movq 240(%rsi), %rdx - movq %rcx, 232(%rdi) - adcq %rdx, %rdx - movq 248(%rsi), %rcx - movq %rdx, 240(%rdi) - adcq %rcx, %rcx - movq %rcx, 248(%rdi) - adcq $0x00, %rax - repz retq -#ifndef __APPLE__ -.size sp_2048_dbl_32,.-sp_2048_dbl_32 -#endif /* __APPLE__ */ /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. 
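Editor's note: the "Karatsuba: ah^2, al^2, (al - ah)^2" comment just above refers to the squaring form of Karatsuba used by the rewritten sp_4096_sqr_64 in the hunk below (and by sp_4096_sqr_avx2_64 later in the patch). Splitting the 64-word input into halves, a = ah*2^k + al with k = 2048 bits, only three half-size squarings are needed, because the cross term can be recovered from them:

\[ a^2 = ah^2 \cdot 2^{2k} + \bigl(ah^2 + al^2 - (al - ah)^2\bigr) \cdot 2^{k} + al^2 . \]

Since the middle term squares a difference, its sign is irrelevant; the new code therefore subtracts the halves with a borrow chain and conditionally negates the result ("Cond Negate") to obtain |al - ah| in constant time, whereas the replaced code (the '-' lines below) added the halves and then needed a masked copy and a doubling pass to fix up the carry out of that addition.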
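Editor's note: the sp_3072_get_from_table_avx2_48 routine added earlier in this patch is a cache-attack-resistant table lookup. Rather than dereferencing the secret index directly, it compares a running counter against the requested index (vpcmpeqd) to build an all-ones or all-zeros mask, ANDs every table entry with that mask, and ORs the surviving entry into the accumulator, so every entry is read regardless of the index. A minimal C sketch of the same pattern follows; the names (get_from_table and its parameters) are illustrative only and are not part of this patch.

#include <stddef.h>
#include <stdint.h>

/* Constant-time selection: touch every entry, keep only the one whose
 * position equals idx. The memory access pattern is independent of idx. */
static void get_from_table(uint64_t* r, const uint64_t* const* table,
                           size_t entries, size_t words, size_t idx)
{
    size_t i, j;
    for (j = 0; j < words; j++)
        r[j] = 0;
    for (i = 0; i < entries; i++) {
        /* all-ones when i == idx, all-zeros otherwise, with no branch */
        uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
        for (j = 0; j < words; j++)
            r[j] |= table[i][j] & mask;
    }
}

The AVX2 routine applies the same idea 32 bytes at a time: each pass loads 128 bytes of an entry into four ymm registers, masks and ORs them, and steps the counter with vpaddd, scanning ENTRY 0 through 15 once per 128-byte slice of the output.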
@@ -31825,111 +44986,271 @@ sp_4096_sqr_64: .p2align 4 _sp_4096_sqr_64: #endif /* __APPLE__ */ - subq $0x518, %rsp - movq %rdi, 1280(%rsp) - movq %rsi, 1288(%rsp) - leaq 1024(%rsp), %r8 + subq $0x210, %rsp + movq %rdi, 512(%rsp) + movq %rsi, 520(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 256(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax movq 128(%rsi), %rdx movq %rax, 120(%r8) - adcq 128(%r9), %rdx + sbbq 128(%r9), %rdx movq 136(%rsi), %rax movq %rdx, 128(%r8) - adcq 136(%r9), %rax + sbbq 136(%r9), %rax movq 144(%rsi), %rdx movq %rax, 136(%r8) - adcq 144(%r9), %rdx + sbbq 144(%r9), %rdx movq 152(%rsi), %rax movq %rdx, 144(%r8) - adcq 152(%r9), %rax + sbbq 152(%r9), %rax movq 160(%rsi), %rdx movq %rax, 152(%r8) - adcq 160(%r9), %rdx + sbbq 160(%r9), %rdx movq 168(%rsi), %rax movq %rdx, 160(%r8) - adcq 168(%r9), %rax + sbbq 168(%r9), %rax movq 176(%rsi), %rdx movq %rax, 168(%r8) - adcq 176(%r9), %rdx + sbbq 176(%r9), %rdx movq 184(%rsi), %rax movq %rdx, 176(%r8) - adcq 184(%r9), %rax + sbbq 184(%r9), %rax movq 192(%rsi), %rdx movq %rax, 184(%r8) - adcq 192(%r9), %rdx + sbbq 192(%r9), %rdx movq 200(%rsi), %rax movq %rdx, 192(%r8) - adcq 200(%r9), %rax + sbbq 200(%r9), %rax movq 208(%rsi), %rdx movq %rax, 200(%r8) - adcq 208(%r9), %rdx + sbbq 208(%r9), %rdx movq 216(%rsi), %rax movq %rdx, 208(%r8) - adcq 216(%r9), %rax + sbbq 216(%r9), %rax movq 224(%rsi), %rdx movq %rax, 216(%r8) - adcq 224(%r9), %rdx + sbbq 224(%r9), %rdx movq 232(%rsi), %rax movq %rdx, 224(%r8) - adcq 232(%r9), %rax + sbbq 232(%r9), %rax movq 240(%rsi), %rdx movq %rax, 232(%r8) - adcq 240(%r9), %rdx + sbbq 240(%r9), %rdx movq 248(%rsi), %rax movq %rdx, 240(%r8) - adcq 248(%r9), %rax + sbbq 248(%r9), %rax + movq %rax, 248(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, 
%rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 128(%r8), %rdx + setc %r9b + movq %rax, 120(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 136(%r8), %rax + setc %r9b + movq %rdx, 128(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 144(%r8), %rdx + setc %r9b + movq %rax, 136(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 152(%r8), %rax + setc %r9b + movq %rdx, 144(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 160(%r8), %rdx + setc %r9b + movq %rax, 152(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 168(%r8), %rax + setc %r9b + movq %rdx, 160(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 176(%r8), %rdx + setc %r9b + movq %rax, 168(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 184(%r8), %rax + setc %r9b + movq %rdx, 176(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 192(%r8), %rdx + setc %r9b + movq %rax, 184(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 200(%r8), %rax + setc %r9b + movq %rdx, 192(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 208(%r8), %rdx + setc %r9b + movq %rax, 200(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 216(%r8), %rax + setc %r9b + movq %rdx, 208(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 224(%r8), %rdx + setc %r9b + movq %rax, 216(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 232(%r8), %rax + setc %r9b + movq %rdx, 224(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 240(%r8), %rdx + setc %r9b + movq %rax, 232(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 248(%r8), %rax + setc %r9b + movq %rdx, 240(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 248(%r8) - adcq $0x00, %rcx - movq %rcx, 1296(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -31937,1008 +45258,723 @@ _sp_4096_sqr_64: #else callq _sp_2048_sqr_32 #endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - leaq 512(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi addq $0x100, %rsi + addq $0x200, %rdi #ifndef __APPLE__ callq sp_2048_sqr_32@plt #else callq _sp_2048_sqr_32 #endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi #ifndef __APPLE__ callq sp_2048_sqr_32@plt #else callq _sp_2048_sqr_32 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi #endif /* _WIN64 */ - movq 1296(%rsp), %r10 - leaq 1024(%rsp), %r8 - movq %r10, %rcx - negq %r10 - movq (%r8), %rdx - movq 8(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 512(%rdi) - movq %rax, 520(%rdi) - movq 16(%r8), %rdx - movq 24(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 528(%rdi) - movq %rax, 536(%rdi) - movq 32(%r8), %rdx - movq 40(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 544(%rdi) - movq %rax, 552(%rdi) - 
movq 48(%r8), %rdx - movq 56(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 560(%rdi) - movq %rax, 568(%rdi) - movq 64(%r8), %rdx - movq 72(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 576(%rdi) - movq %rax, 584(%rdi) - movq 80(%r8), %rdx - movq 88(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 592(%rdi) - movq %rax, 600(%rdi) - movq 96(%r8), %rdx - movq 104(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 608(%rdi) - movq %rax, 616(%rdi) - movq 112(%r8), %rdx - movq 120(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 624(%rdi) - movq %rax, 632(%rdi) - movq 128(%r8), %rdx - movq 136(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 640(%rdi) - movq %rax, 648(%rdi) - movq 144(%r8), %rdx - movq 152(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 656(%rdi) - movq %rax, 664(%rdi) - movq 160(%r8), %rdx - movq 168(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 672(%rdi) - movq %rax, 680(%rdi) - movq 176(%r8), %rdx - movq 184(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 688(%rdi) - movq %rax, 696(%rdi) - movq 192(%r8), %rdx - movq 200(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 704(%rdi) - movq %rax, 712(%rdi) - movq 208(%r8), %rdx - movq 216(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 720(%rdi) - movq %rax, 728(%rdi) - movq 224(%r8), %rdx - movq 232(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 736(%rdi) - movq %rax, 744(%rdi) - movq 240(%r8), %rdx - movq 248(%r8), %rax - andq %r10, %rdx - andq %r10, %rax - movq %rdx, 752(%rdi) - movq %rax, 760(%rdi) - movq 512(%rdi), %rdx - addq %rdx, %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq %rax, %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq %rdx, %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq %rax, %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq %rdx, %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq %rax, %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq %rdx, %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq %rax, %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq %rdx, %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq %rax, %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq %rdx, %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq %rax, %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq %rdx, %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq %rax, %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq %rdx, %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq %rax, %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq %rdx, %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq %rax, %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq %rdx, %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq %rax, %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq %rdx, %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq %rax, %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq %rdx, %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq %rax, %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq %rdx, %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq %rax, %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq %rdx, %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq %rax, %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq %rdx, %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq %rax, %rax - movq 752(%rdi), %rdx - 
movq %rax, 744(%rdi) - adcq %rdx, %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq %rax, %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - leaq 512(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rsi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rsi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rsi), %rdx - movq 408(%r8), %rax 
- movq %rdx, 400(%r8) - sbbq 408(%rsi), %rax - movq 416(%r8), %rdx - movq %rax, 408(%r8) - sbbq 416(%rsi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rsi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rsi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rsi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rsi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rsi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rsi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rsi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rsi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rsi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rsi), %rdx - movq 504(%r8), %rax - movq %rdx, 496(%r8) - sbbq 504(%rsi), %rax - movq %rax, 504(%r8) + movq 512(%rsp), %rsi + leaq 256(%rsp), %r8 + addq $0x300, %rsi + movq $0x00, %rcx + movq -256(%r8), %rax + subq -256(%rsi), %rax + movq -248(%r8), %rdx + movq %rax, -256(%r8) + sbbq -248(%rsi), %rdx + movq -240(%r8), %rax + movq %rdx, -248(%r8) + sbbq -240(%rsi), %rax + movq -232(%r8), %rdx + movq %rax, -240(%r8) + sbbq -232(%rsi), %rdx + movq -224(%r8), %rax + movq %rdx, -232(%r8) + sbbq -224(%rsi), %rax + movq -216(%r8), %rdx + movq %rax, -224(%r8) + sbbq -216(%rsi), %rdx + movq -208(%r8), %rax + movq %rdx, -216(%r8) + sbbq -208(%rsi), %rax + movq -200(%r8), %rdx + movq %rax, -208(%r8) + sbbq -200(%rsi), %rdx + movq -192(%r8), %rax + movq %rdx, -200(%r8) + sbbq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + 
sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq 192(%r8), %rax + movq %rdx, 184(%r8) + sbbq 192(%rsi), %rax + movq 200(%r8), %rdx + movq %rax, 192(%r8) + sbbq 200(%rsi), %rdx + movq 208(%r8), %rax + movq %rdx, 200(%r8) + sbbq 208(%rsi), %rax + movq 216(%r8), %rdx + movq %rax, 208(%r8) + sbbq 216(%rsi), %rdx + movq 224(%r8), %rax + movq %rdx, 216(%r8) + sbbq 224(%rsi), %rax + movq 232(%r8), %rdx + movq %rax, 224(%r8) + sbbq 232(%rsi), %rdx + movq 240(%r8), %rax + movq %rdx, 232(%r8) + sbbq 240(%rsi), %rax + movq 248(%r8), %rdx + movq %rax, 240(%r8) + sbbq 248(%rsi), %rdx + movq %rdx, 248(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq 
%rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rdi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rdi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rdi), %rdx - movq 408(%r8), %rax - movq %rdx, 400(%r8) - sbbq 408(%rdi), %rax - movq 416(%r8), %rdx - movq %rax, 408(%r8) - sbbq 416(%rdi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rdi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rdi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rdi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rdi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rdi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rdi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rdi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rdi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rdi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rdi), %rdx - movq 504(%r8), %rax - movq %rdx, 496(%r8) - sbbq 504(%rdi), %rax - movq %rax, 504(%r8) + subq $0x200, %rsi + movq -256(%r8), %rax + subq -256(%rsi), %rax + movq -248(%r8), %rdx + movq %rax, -256(%r8) + sbbq -248(%rsi), %rdx + movq -240(%r8), %rax + movq %rdx, -248(%r8) + sbbq -240(%rsi), %rax + movq -232(%r8), %rdx + movq %rax, -240(%r8) + sbbq -232(%rsi), %rdx + movq -224(%r8), %rax + movq %rdx, -232(%r8) + sbbq -224(%rsi), %rax + movq -216(%r8), %rdx + movq %rax, -224(%r8) + sbbq -216(%rsi), %rdx + movq -208(%r8), %rax + movq %rdx, -216(%r8) + sbbq -208(%rsi), %rax + movq -200(%r8), %rdx + movq %rax, -208(%r8) + sbbq -200(%rsi), %rdx + movq -192(%r8), %rax + movq %rdx, -200(%r8) + sbbq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq 
%rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq 192(%r8), %rax + movq %rdx, 184(%r8) + sbbq 192(%rsi), %rax + movq 200(%r8), %rdx + movq %rax, 192(%r8) + sbbq 200(%rsi), %rdx + movq 208(%r8), %rax + movq %rdx, 200(%r8) + sbbq 208(%rsi), %rax + movq 216(%r8), %rdx + movq %rax, 208(%r8) + sbbq 216(%rsi), %rdx + movq 224(%r8), %rax + movq %rdx, 216(%r8) + sbbq 224(%rsi), %rax + movq 232(%r8), %rdx + movq %rax, 224(%r8) + sbbq 232(%rsi), %rdx + movq 240(%r8), %rax + movq %rdx, 232(%r8) + sbbq 240(%rsi), %rax + movq 248(%r8), %rdx + movq %rax, 240(%r8) + sbbq 248(%rsi), %rdx + movq %rdx, 248(%r8) sbbq $0x00, %rcx - # Add in place - movq 256(%rdi), %rdx - addq (%r8), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 8(%r8), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 16(%r8), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 24(%r8), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 32(%r8), %rdx - movq 296(%rdi), %rax - movq %rdx, 
288(%rdi) - adcq 40(%r8), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 48(%r8), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 56(%r8), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 64(%r8), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 72(%r8), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 80(%r8), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 88(%r8), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 96(%r8), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 104(%r8), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 112(%r8), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 120(%r8), %rax - movq 384(%rdi), %rdx - movq %rax, 376(%rdi) - adcq 128(%r8), %rdx - movq 392(%rdi), %rax - movq %rdx, 384(%rdi) - adcq 136(%r8), %rax - movq 400(%rdi), %rdx - movq %rax, 392(%rdi) - adcq 144(%r8), %rdx - movq 408(%rdi), %rax - movq %rdx, 400(%rdi) - adcq 152(%r8), %rax - movq 416(%rdi), %rdx - movq %rax, 408(%rdi) - adcq 160(%r8), %rdx - movq 424(%rdi), %rax - movq %rdx, 416(%rdi) - adcq 168(%r8), %rax - movq 432(%rdi), %rdx - movq %rax, 424(%rdi) - adcq 176(%r8), %rdx - movq 440(%rdi), %rax - movq %rdx, 432(%rdi) - adcq 184(%r8), %rax - movq 448(%rdi), %rdx - movq %rax, 440(%rdi) - adcq 192(%r8), %rdx - movq 456(%rdi), %rax - movq %rdx, 448(%rdi) - adcq 200(%r8), %rax - movq 464(%rdi), %rdx - movq %rax, 456(%rdi) - adcq 208(%r8), %rdx - movq 472(%rdi), %rax - movq %rdx, 464(%rdi) - adcq 216(%r8), %rax - movq 480(%rdi), %rdx - movq %rax, 472(%rdi) - adcq 224(%r8), %rdx - movq 488(%rdi), %rax - movq %rdx, 480(%rdi) - adcq 232(%r8), %rax - movq 496(%rdi), %rdx - movq %rax, 488(%rdi) - adcq 240(%r8), %rdx - movq 504(%rdi), %rax - movq %rdx, 496(%rdi) - adcq 248(%r8), %rax - movq 512(%rdi), %rdx - movq %rax, 504(%rdi) - adcq 256(%r8), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 264(%r8), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 272(%r8), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 280(%r8), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq 288(%r8), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 296(%r8), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 304(%r8), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 312(%r8), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 320(%r8), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 328(%r8), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 336(%r8), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 344(%r8), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 352(%r8), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 360(%r8), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 368(%r8), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 376(%r8), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 384(%r8), %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 392(%r8), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 400(%r8), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 408(%r8), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 416(%r8), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 424(%r8), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 432(%r8), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 440(%r8), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 448(%r8), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 456(%r8), 
%rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 464(%r8), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 472(%r8), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 480(%r8), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 488(%r8), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 496(%r8), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 504(%r8), %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - movq %rcx, 768(%rdi) - # Add in place - movq 512(%rdi), %rdx - xorq %rcx, %rcx - addq (%rsi), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 8(%rsi), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 16(%rsi), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 24(%rsi), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq 32(%rsi), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 40(%rsi), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 48(%rsi), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 56(%rsi), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 64(%rsi), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 72(%rsi), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 80(%rsi), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 88(%rsi), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 96(%rsi), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 104(%rsi), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 112(%rsi), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 120(%rsi), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 128(%rsi), %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 136(%rsi), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 144(%rsi), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 152(%rsi), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 160(%rsi), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 168(%rsi), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 176(%rsi), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 184(%rsi), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 192(%rsi), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 200(%rsi), %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 208(%rsi), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 216(%rsi), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 224(%rsi), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 232(%rsi), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 240(%rsi), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 248(%rsi), %rax - movq 768(%rdi), %rdx - movq %rax, 760(%rdi) - adcq 256(%rsi), %rdx - movq %rdx, 768(%rdi) - adcq $0x00, %rcx - # Add to zero - movq 264(%rsi), %rdx + movq 512(%rsp), %rdi + negq %rcx + addq $0x200, %rdi + movq -256(%rdi), %rax + subq -256(%r8), %rax + movq -248(%rdi), %rdx + movq %rax, -256(%rdi) + sbbq -248(%r8), %rdx + movq -240(%rdi), %rax + movq %rdx, -248(%rdi) + sbbq -240(%r8), %rax + movq -232(%rdi), %rdx + movq %rax, -240(%rdi) + sbbq -232(%r8), %rdx + movq -224(%rdi), %rax + movq %rdx, -232(%rdi) + sbbq -224(%r8), %rax + movq -216(%rdi), %rdx + movq %rax, -224(%rdi) + sbbq -216(%r8), %rdx + movq -208(%rdi), %rax + movq %rdx, -216(%rdi) + sbbq -208(%r8), %rax + movq -200(%rdi), %rdx + movq %rax, -208(%rdi) + sbbq -200(%r8), %rdx + movq -192(%rdi), %rax + movq %rdx, -200(%rdi) + sbbq -192(%r8), %rax + movq -184(%rdi), %rdx + movq %rax, -192(%rdi) + 
sbbq -184(%r8), %rdx + movq -176(%rdi), %rax + movq %rdx, -184(%rdi) + sbbq -176(%r8), %rax + movq -168(%rdi), %rdx + movq %rax, -176(%rdi) + sbbq -168(%r8), %rdx + movq -160(%rdi), %rax + movq %rdx, -168(%rdi) + sbbq -160(%r8), %rax + movq -152(%rdi), %rdx + movq %rax, -160(%rdi) + sbbq -152(%r8), %rdx + movq -144(%rdi), %rax + movq %rdx, -152(%rdi) + sbbq -144(%r8), %rax + movq -136(%rdi), %rdx + movq %rax, -144(%rdi) + sbbq -136(%r8), %rdx + movq -128(%rdi), %rax + movq %rdx, -136(%rdi) + sbbq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq -88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) + sbbq 128(%r8), %rax + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) + sbbq 136(%r8), %rdx + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) + sbbq 144(%r8), %rax + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) + sbbq 152(%r8), %rdx + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) + sbbq 160(%r8), %rax + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) + sbbq 168(%r8), %rdx + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) + sbbq 176(%r8), %rax + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) + sbbq 184(%r8), %rdx + movq 192(%rdi), %rax + movq %rdx, 184(%rdi) + sbbq 192(%r8), %rax + movq 200(%rdi), %rdx + movq %rax, 192(%rdi) + sbbq 200(%r8), %rdx + movq 208(%rdi), %rax + movq %rdx, 200(%rdi) + sbbq 208(%r8), %rax + movq 216(%rdi), %rdx + movq %rax, 208(%rdi) + sbbq 216(%r8), %rdx + movq 224(%rdi), %rax + movq %rdx, 216(%rdi) + sbbq 224(%r8), %rax + movq 232(%rdi), %rdx + movq %rax, 224(%rdi) + sbbq 232(%r8), %rdx + movq 240(%rdi), 
%rax + movq %rdx, 232(%rdi) + sbbq 240(%r8), %rax + movq 248(%rdi), %rdx + movq %rax, 240(%rdi) + sbbq 248(%r8), %rdx + movq %rdx, 248(%rdi) + sbbq $0x00, %rcx + movq 512(%rsp), %rdi + addq $0x300, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 776(%rdi) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 784(%rdi) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 792(%rdi) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 800(%rdi) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 808(%rdi) + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 816(%rdi) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 824(%rdi) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 832(%rdi) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 840(%rdi) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 848(%rdi) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 856(%rdi) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 864(%rdi) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 872(%rdi) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 880(%rdi) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq 384(%rsi), %rax - movq %rdx, 888(%rdi) + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) adcq $0x00, %rax - movq 392(%rsi), %rdx - movq %rax, 896(%rdi) + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) adcq $0x00, %rdx - movq 400(%rsi), %rax - movq %rdx, 904(%rdi) + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) adcq $0x00, %rax - movq 408(%rsi), %rdx - movq %rax, 912(%rdi) + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) adcq $0x00, %rdx - movq 416(%rsi), %rax - movq %rdx, 920(%rdi) + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) adcq $0x00, %rax - movq 424(%rsi), %rdx - movq %rax, 928(%rdi) + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) adcq $0x00, %rdx - movq 432(%rsi), %rax - movq %rdx, 936(%rdi) + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) adcq $0x00, %rax - movq 440(%rsi), %rdx - movq %rax, 944(%rdi) + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) adcq $0x00, %rdx - movq 448(%rsi), %rax - movq %rdx, 952(%rdi) + movq 192(%rdi), %rax + movq %rdx, 184(%rdi) adcq $0x00, %rax - movq 456(%rsi), %rdx - movq %rax, 960(%rdi) + movq 200(%rdi), %rdx + movq %rax, 192(%rdi) adcq $0x00, %rdx - movq 464(%rsi), %rax - movq %rdx, 968(%rdi) + movq 208(%rdi), %rax + movq %rdx, 200(%rdi) adcq $0x00, %rax - movq 472(%rsi), %rdx - movq %rax, 976(%rdi) + movq 216(%rdi), %rdx + movq %rax, 208(%rdi) adcq $0x00, %rdx - movq 480(%rsi), %rax - movq %rdx, 984(%rdi) + movq 224(%rdi), %rax + movq %rdx, 216(%rdi) adcq $0x00, %rax - movq 488(%rsi), %rdx - movq %rax, 992(%rdi) + movq 232(%rdi), %rdx + movq %rax, 224(%rdi) adcq $0x00, %rdx - movq 496(%rsi), %rax - movq %rdx, 1000(%rdi) + movq 240(%rdi), %rax + movq %rdx, 232(%rdi) adcq $0x00, %rax - movq 504(%rsi), %rdx - movq %rax, 1008(%rdi) + movq 248(%rdi), %rdx + movq %rax, 240(%rdi) adcq 
$0x00, %rdx - movq %rdx, 1016(%rdi) - addq $0x518, %rsp + movq %rdx, 248(%rdi) + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi + addq $0x210, %rsp repz retq #ifndef __APPLE__ .size sp_4096_sqr_64,.-sp_4096_sqr_64 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 /* Square a and put result in r. (r = a * a) + * + * Karatsuba: ah^2, al^2, (al - ah)^2 * * r A single precision integer. * a A single precision integer. @@ -32955,111 +45991,271 @@ sp_4096_sqr_avx2_64: .p2align 4 _sp_4096_sqr_avx2_64: #endif /* __APPLE__ */ - subq $0x518, %rsp - movq %rdi, 1280(%rsp) - movq %rsi, 1288(%rsp) - leaq 1024(%rsp), %r8 + subq $0x210, %rsp + movq %rdi, 512(%rsp) + movq %rsi, 520(%rsp) + movq $0x00, %rcx + movq %rsp, %r8 leaq 256(%rsi), %r9 - # Add movq (%rsi), %rdx - xorq %rcx, %rcx - addq (%r9), %rdx + subq (%r9), %rdx movq 8(%rsi), %rax movq %rdx, (%r8) - adcq 8(%r9), %rax + sbbq 8(%r9), %rax movq 16(%rsi), %rdx movq %rax, 8(%r8) - adcq 16(%r9), %rdx + sbbq 16(%r9), %rdx movq 24(%rsi), %rax movq %rdx, 16(%r8) - adcq 24(%r9), %rax + sbbq 24(%r9), %rax movq 32(%rsi), %rdx movq %rax, 24(%r8) - adcq 32(%r9), %rdx + sbbq 32(%r9), %rdx movq 40(%rsi), %rax movq %rdx, 32(%r8) - adcq 40(%r9), %rax + sbbq 40(%r9), %rax movq 48(%rsi), %rdx movq %rax, 40(%r8) - adcq 48(%r9), %rdx + sbbq 48(%r9), %rdx movq 56(%rsi), %rax movq %rdx, 48(%r8) - adcq 56(%r9), %rax + sbbq 56(%r9), %rax movq 64(%rsi), %rdx movq %rax, 56(%r8) - adcq 64(%r9), %rdx + sbbq 64(%r9), %rdx movq 72(%rsi), %rax movq %rdx, 64(%r8) - adcq 72(%r9), %rax + sbbq 72(%r9), %rax movq 80(%rsi), %rdx movq %rax, 72(%r8) - adcq 80(%r9), %rdx + sbbq 80(%r9), %rdx movq 88(%rsi), %rax movq %rdx, 80(%r8) - adcq 88(%r9), %rax + sbbq 88(%r9), %rax movq 96(%rsi), %rdx movq %rax, 88(%r8) - adcq 96(%r9), %rdx + sbbq 96(%r9), %rdx movq 104(%rsi), %rax movq %rdx, 96(%r8) - adcq 104(%r9), %rax + sbbq 104(%r9), %rax movq 112(%rsi), %rdx movq %rax, 104(%r8) - adcq 112(%r9), %rdx + sbbq 112(%r9), %rdx movq 120(%rsi), %rax movq %rdx, 112(%r8) - adcq 120(%r9), %rax + sbbq 120(%r9), %rax movq 128(%rsi), %rdx movq %rax, 120(%r8) - adcq 128(%r9), %rdx + sbbq 128(%r9), %rdx movq 136(%rsi), %rax movq %rdx, 128(%r8) - adcq 136(%r9), %rax + sbbq 136(%r9), %rax movq 144(%rsi), %rdx movq %rax, 136(%r8) - adcq 144(%r9), %rdx + sbbq 144(%r9), %rdx movq 152(%rsi), %rax movq %rdx, 144(%r8) - adcq 152(%r9), %rax + sbbq 152(%r9), %rax movq 160(%rsi), %rdx movq %rax, 152(%r8) - adcq 160(%r9), %rdx + sbbq 160(%r9), %rdx movq 168(%rsi), %rax movq %rdx, 160(%r8) - adcq 168(%r9), %rax + sbbq 168(%r9), %rax movq 176(%rsi), %rdx movq %rax, 168(%r8) - adcq 176(%r9), %rdx + sbbq 176(%r9), %rdx movq 184(%rsi), %rax movq %rdx, 176(%r8) - adcq 184(%r9), %rax + sbbq 184(%r9), %rax movq 192(%rsi), %rdx movq %rax, 184(%r8) - adcq 192(%r9), %rdx + sbbq 192(%r9), %rdx movq 200(%rsi), %rax movq %rdx, 192(%r8) - adcq 200(%r9), %rax + sbbq 200(%r9), %rax movq 208(%rsi), %rdx movq %rax, 200(%r8) - adcq 208(%r9), %rdx + sbbq 208(%r9), %rdx movq 216(%rsi), %rax movq %rdx, 208(%r8) - adcq 216(%r9), %rax + sbbq 216(%r9), %rax movq 224(%rsi), %rdx movq %rax, 216(%r8) - adcq 224(%r9), %rdx + sbbq 224(%r9), %rdx movq 232(%rsi), %rax movq %rdx, 224(%r8) - adcq 232(%r9), %rax + sbbq 232(%r9), %rax movq 240(%rsi), %rdx movq %rax, 232(%r8) - adcq 240(%r9), %rdx + sbbq 240(%r9), %rdx movq 248(%rsi), %rax movq %rdx, 240(%r8) - adcq 248(%r9), %rax + sbbq 248(%r9), %rax + movq %rax, 248(%r8) + sbbq $0x00, %rcx + # Cond Negate + movq (%r8), %rdx + movq %rcx, %r9 + xorq %rcx, %rdx + negq %r9 + subq %rcx, %rdx + movq 8(%r8), %rax + 
sbbq $0x00, %r9 + movq %rdx, (%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 16(%r8), %rdx + setc %r9b + movq %rax, 8(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 24(%r8), %rax + setc %r9b + movq %rdx, 16(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 32(%r8), %rdx + setc %r9b + movq %rax, 24(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 40(%r8), %rax + setc %r9b + movq %rdx, 32(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 48(%r8), %rdx + setc %r9b + movq %rax, 40(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 56(%r8), %rax + setc %r9b + movq %rdx, 48(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 64(%r8), %rdx + setc %r9b + movq %rax, 56(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 72(%r8), %rax + setc %r9b + movq %rdx, 64(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 80(%r8), %rdx + setc %r9b + movq %rax, 72(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 88(%r8), %rax + setc %r9b + movq %rdx, 80(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 96(%r8), %rdx + setc %r9b + movq %rax, 88(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 104(%r8), %rax + setc %r9b + movq %rdx, 96(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 112(%r8), %rdx + setc %r9b + movq %rax, 104(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 120(%r8), %rax + setc %r9b + movq %rdx, 112(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 128(%r8), %rdx + setc %r9b + movq %rax, 120(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 136(%r8), %rax + setc %r9b + movq %rdx, 128(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 144(%r8), %rdx + setc %r9b + movq %rax, 136(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 152(%r8), %rax + setc %r9b + movq %rdx, 144(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 160(%r8), %rdx + setc %r9b + movq %rax, 152(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 168(%r8), %rax + setc %r9b + movq %rdx, 160(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 176(%r8), %rdx + setc %r9b + movq %rax, 168(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 184(%r8), %rax + setc %r9b + movq %rdx, 176(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 192(%r8), %rdx + setc %r9b + movq %rax, 184(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 200(%r8), %rax + setc %r9b + movq %rdx, 192(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 208(%r8), %rdx + setc %r9b + movq %rax, 200(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 216(%r8), %rax + setc %r9b + movq %rdx, 208(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 224(%r8), %rdx + setc %r9b + movq %rax, 216(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 232(%r8), %rax + setc %r9b + movq %rdx, 224(%r8) + xorq %rcx, %rax + addq %r9, %rax + movq 240(%r8), %rdx + setc %r9b + movq %rax, 232(%r8) + xorq %rcx, %rdx + addq %r9, %rdx + movq 248(%r8), %rax + setc %r9b + movq %rdx, 240(%r8) + xorq %rcx, %rax + addq %r9, %rax movq %rax, 248(%r8) - adcq $0x00, %rcx - movq %rcx, 1296(%rsp) movq %r8, %rsi movq %rsp, %rdi #ifndef __APPLE__ @@ -33067,938 +46263,715 @@ _sp_4096_sqr_avx2_64: #else callq _sp_2048_sqr_avx2_32 #endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - leaq 512(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi addq $0x100, %rsi + addq $0x200, %rdi #ifndef __APPLE__ callq sp_2048_sqr_avx2_32@plt #else callq _sp_2048_sqr_avx2_32 #endif /* __APPLE__ */ - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi #ifndef __APPLE__ callq sp_2048_sqr_avx2_32@plt #else callq _sp_2048_sqr_avx2_32 #endif /* __APPLE__ */ #ifdef _WIN64 - movq 1288(%rsp), %rsi - movq 1280(%rsp), %rdi + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi #endif /* _WIN64 */ 
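The rewritten sp_4096_sqr_avx2_64 above follows the three-squaring Karatsuba shape called out in the updated function comment ("Karatsuba: ah^2, al^2, (al - ah)^2"): rather than spending a full multiply on the cross term, it subtracts the two halves, conditionally negates to get the absolute difference, squares that, and recovers 2*al*ah as al^2 + ah^2 - (al - ah)^2, delegating each of the three squarings to sp_2048_sqr_avx2_32 as seen in the calls above. The following stand-alone C sketch is an illustration of that identity only, not wolfSSL code; the names sqr32_karatsuba, ah, al are hypothetical, and it works on a 32-bit value split into 16-bit halves so everything fits in 64-bit arithmetic.

#include <stdint.h>
#include <stdio.h>

/* Illustration of the "ah^2, al^2, (al - ah)^2" squaring shape
 * (hypothetical helper, not part of the patch). */
static uint64_t sqr32_karatsuba(uint32_t a)
{
    uint32_t ah = a >> 16;                              /* high half   */
    uint32_t al = a & 0xFFFFu;                          /* low half    */
    uint64_t hh = (uint64_t)ah * ah;                    /* ah^2        */
    uint64_t ll = (uint64_t)al * al;                    /* al^2        */
    uint32_t d  = (al > ah) ? (al - ah) : (ah - al);    /* |al - ah|   */
    uint64_t dd = (uint64_t)d * d;                      /* (al - ah)^2 */
    uint64_t mid = hh + ll - dd;                        /* = 2*al*ah   */
    return (hh << 32) + (mid << 16) + ll;               /* recombine   */
}

int main(void)
{
    uint32_t a = 0x89ABCDEFu;
    printf("%d\n", sqr32_karatsuba(a) == (uint64_t)a * a); /* prints 1 */
    return 0;
}

The assembly applies the same recombination at 2048-bit granularity, which is why the earlier "Add" pass over the two halves has been replaced by the subtract/conditional-negate sequence above.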
- movq 1296(%rsp), %r10 - leaq 1024(%rsp), %r8 - movq %r10, %rcx - negq %r10 - movq (%r8), %rdx - pextq %r10, %rdx, %rdx - addq %rdx, %rdx - movq 8(%r8), %rax - movq %rdx, 512(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 16(%r8), %rdx - movq %rax, 520(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 24(%r8), %rax - movq %rdx, 528(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 32(%r8), %rdx - movq %rax, 536(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 40(%r8), %rax - movq %rdx, 544(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 48(%r8), %rdx - movq %rax, 552(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 56(%r8), %rax - movq %rdx, 560(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 64(%r8), %rdx - movq %rax, 568(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 72(%r8), %rax - movq %rdx, 576(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 80(%r8), %rdx - movq %rax, 584(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 88(%r8), %rax - movq %rdx, 592(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 96(%r8), %rdx - movq %rax, 600(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 104(%r8), %rax - movq %rdx, 608(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 112(%r8), %rdx - movq %rax, 616(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 120(%r8), %rax - movq %rdx, 624(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 128(%r8), %rdx - movq %rax, 632(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 136(%r8), %rax - movq %rdx, 640(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 144(%r8), %rdx - movq %rax, 648(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 152(%r8), %rax - movq %rdx, 656(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 160(%r8), %rdx - movq %rax, 664(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 168(%r8), %rax - movq %rdx, 672(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 176(%r8), %rdx - movq %rax, 680(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 184(%r8), %rax - movq %rdx, 688(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 192(%r8), %rdx - movq %rax, 696(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 200(%r8), %rax - movq %rdx, 704(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 208(%r8), %rdx - movq %rax, 712(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 216(%r8), %rax - movq %rdx, 720(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 224(%r8), %rdx - movq %rax, 728(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 232(%r8), %rax - movq %rdx, 736(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq 240(%r8), %rdx - movq %rax, 744(%rdi) - pextq %r10, %rdx, %rdx - adcq %rdx, %rdx - movq 248(%r8), %rax - movq %rdx, 752(%rdi) - pextq %r10, %rax, %rax - adcq %rax, %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - leaq 512(%rsp), %rsi - movq %rsp, %r8 - movq (%r8), %rdx - subq (%rsi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rsi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rsi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rsi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rsi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rsi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rsi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rsi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rsi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rsi), %rax - movq 
80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rsi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rsi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rsi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rsi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rsi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rsi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rsi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rsi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rsi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rsi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rsi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rsi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rsi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rsi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rsi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rsi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rsi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rsi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rsi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rsi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rsi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rsi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rsi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rsi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rsi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rsi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rsi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rsi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rsi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rsi), %rax - movq 320(%r8), %rdx - movq %rax, 312(%r8) - sbbq 320(%rsi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rsi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rsi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rsi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rsi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rsi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rsi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rsi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rsi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rsi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rsi), %rdx - movq 408(%r8), %rax - movq %rdx, 400(%r8) - sbbq 408(%rsi), %rax - movq 416(%r8), %rdx - movq %rax, 408(%r8) - sbbq 416(%rsi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rsi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rsi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rsi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rsi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rsi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rsi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rsi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rsi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rsi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rsi), %rdx - movq 504(%r8), 
%rax - movq %rdx, 496(%r8) - sbbq 504(%rsi), %rax - movq %rax, 504(%r8) + movq 512(%rsp), %rsi + leaq 256(%rsp), %r8 + addq $0x300, %rsi + movq $0x00, %rcx + movq -256(%r8), %rax + subq -256(%rsi), %rax + movq -248(%r8), %rdx + movq %rax, -256(%r8) + sbbq -248(%rsi), %rdx + movq -240(%r8), %rax + movq %rdx, -248(%r8) + sbbq -240(%rsi), %rax + movq -232(%r8), %rdx + movq %rax, -240(%r8) + sbbq -232(%rsi), %rdx + movq -224(%r8), %rax + movq %rdx, -232(%r8) + sbbq -224(%rsi), %rax + movq -216(%r8), %rdx + movq %rax, -224(%r8) + sbbq -216(%rsi), %rdx + movq -208(%r8), %rax + movq %rdx, -216(%r8) + sbbq -208(%rsi), %rax + movq -200(%r8), %rdx + movq %rax, -208(%r8) + sbbq -200(%rsi), %rdx + movq -192(%r8), %rax + movq %rdx, -200(%r8) + sbbq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), %rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), 
%rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq 192(%r8), %rax + movq %rdx, 184(%r8) + sbbq 192(%rsi), %rax + movq 200(%r8), %rdx + movq %rax, 192(%r8) + sbbq 200(%rsi), %rdx + movq 208(%r8), %rax + movq %rdx, 200(%r8) + sbbq 208(%rsi), %rax + movq 216(%r8), %rdx + movq %rax, 208(%r8) + sbbq 216(%rsi), %rdx + movq 224(%r8), %rax + movq %rdx, 216(%r8) + sbbq 224(%rsi), %rax + movq 232(%r8), %rdx + movq %rax, 224(%r8) + sbbq 232(%rsi), %rdx + movq 240(%r8), %rax + movq %rdx, 232(%r8) + sbbq 240(%rsi), %rax + movq 248(%r8), %rdx + movq %rax, 240(%r8) + sbbq 248(%rsi), %rdx + movq %rdx, 248(%r8) sbbq $0x00, %rcx - movq (%r8), %rdx - subq (%rdi), %rdx - movq 8(%r8), %rax - movq %rdx, (%r8) - sbbq 8(%rdi), %rax - movq 16(%r8), %rdx - movq %rax, 8(%r8) - sbbq 16(%rdi), %rdx - movq 24(%r8), %rax - movq %rdx, 16(%r8) - sbbq 24(%rdi), %rax - movq 32(%r8), %rdx - movq %rax, 24(%r8) - sbbq 32(%rdi), %rdx - movq 40(%r8), %rax - movq %rdx, 32(%r8) - sbbq 40(%rdi), %rax - movq 48(%r8), %rdx - movq %rax, 40(%r8) - sbbq 48(%rdi), %rdx - movq 56(%r8), %rax - movq %rdx, 48(%r8) - sbbq 56(%rdi), %rax - movq 64(%r8), %rdx - movq %rax, 56(%r8) - sbbq 64(%rdi), %rdx - movq 72(%r8), %rax - movq %rdx, 64(%r8) - sbbq 72(%rdi), %rax - movq 80(%r8), %rdx - movq %rax, 72(%r8) - sbbq 80(%rdi), %rdx - movq 88(%r8), %rax - movq %rdx, 80(%r8) - sbbq 88(%rdi), %rax - movq 96(%r8), %rdx - movq %rax, 88(%r8) - sbbq 96(%rdi), %rdx - movq 104(%r8), %rax - movq %rdx, 96(%r8) - sbbq 104(%rdi), %rax - movq 112(%r8), %rdx - movq %rax, 104(%r8) - sbbq 112(%rdi), %rdx - movq 120(%r8), %rax - movq %rdx, 112(%r8) - sbbq 120(%rdi), %rax - movq 128(%r8), %rdx - movq %rax, 120(%r8) - sbbq 128(%rdi), %rdx - movq 136(%r8), %rax - movq %rdx, 128(%r8) - sbbq 136(%rdi), %rax - movq 144(%r8), %rdx - movq %rax, 136(%r8) - sbbq 144(%rdi), %rdx - movq 152(%r8), %rax - movq %rdx, 144(%r8) - sbbq 152(%rdi), %rax - movq 160(%r8), %rdx - movq %rax, 152(%r8) - sbbq 160(%rdi), %rdx - movq 168(%r8), %rax - movq %rdx, 160(%r8) - sbbq 168(%rdi), %rax - movq 176(%r8), %rdx - movq %rax, 168(%r8) - sbbq 176(%rdi), %rdx - movq 184(%r8), %rax - movq %rdx, 176(%r8) - sbbq 184(%rdi), %rax - movq 192(%r8), %rdx - movq %rax, 184(%r8) - sbbq 192(%rdi), %rdx - movq 200(%r8), %rax - movq %rdx, 192(%r8) - sbbq 200(%rdi), %rax - movq 208(%r8), %rdx - movq %rax, 200(%r8) - sbbq 208(%rdi), %rdx - movq 216(%r8), %rax - movq %rdx, 208(%r8) - sbbq 216(%rdi), %rax - movq 224(%r8), %rdx - movq %rax, 216(%r8) - sbbq 224(%rdi), %rdx - movq 232(%r8), %rax - movq %rdx, 224(%r8) - sbbq 232(%rdi), %rax - movq 240(%r8), %rdx - movq %rax, 232(%r8) - sbbq 240(%rdi), %rdx - movq 248(%r8), %rax - movq %rdx, 240(%r8) - sbbq 248(%rdi), %rax - movq 256(%r8), %rdx - movq %rax, 248(%r8) - sbbq 256(%rdi), %rdx - movq 264(%r8), %rax - movq %rdx, 256(%r8) - sbbq 264(%rdi), %rax - movq 272(%r8), %rdx - movq %rax, 264(%r8) - sbbq 272(%rdi), %rdx - movq 280(%r8), %rax - movq %rdx, 272(%r8) - sbbq 280(%rdi), %rax - movq 288(%r8), %rdx - movq %rax, 280(%r8) - sbbq 288(%rdi), %rdx - movq 296(%r8), %rax - movq %rdx, 288(%r8) - sbbq 296(%rdi), %rax - movq 304(%r8), %rdx - movq %rax, 296(%r8) - sbbq 304(%rdi), %rdx - movq 312(%r8), %rax - movq %rdx, 304(%r8) - sbbq 312(%rdi), %rax - movq 320(%r8), %rdx - movq 
%rax, 312(%r8) - sbbq 320(%rdi), %rdx - movq 328(%r8), %rax - movq %rdx, 320(%r8) - sbbq 328(%rdi), %rax - movq 336(%r8), %rdx - movq %rax, 328(%r8) - sbbq 336(%rdi), %rdx - movq 344(%r8), %rax - movq %rdx, 336(%r8) - sbbq 344(%rdi), %rax - movq 352(%r8), %rdx - movq %rax, 344(%r8) - sbbq 352(%rdi), %rdx - movq 360(%r8), %rax - movq %rdx, 352(%r8) - sbbq 360(%rdi), %rax - movq 368(%r8), %rdx - movq %rax, 360(%r8) - sbbq 368(%rdi), %rdx - movq 376(%r8), %rax - movq %rdx, 368(%r8) - sbbq 376(%rdi), %rax - movq 384(%r8), %rdx - movq %rax, 376(%r8) - sbbq 384(%rdi), %rdx - movq 392(%r8), %rax - movq %rdx, 384(%r8) - sbbq 392(%rdi), %rax - movq 400(%r8), %rdx - movq %rax, 392(%r8) - sbbq 400(%rdi), %rdx - movq 408(%r8), %rax - movq %rdx, 400(%r8) - sbbq 408(%rdi), %rax - movq 416(%r8), %rdx - movq %rax, 408(%r8) - sbbq 416(%rdi), %rdx - movq 424(%r8), %rax - movq %rdx, 416(%r8) - sbbq 424(%rdi), %rax - movq 432(%r8), %rdx - movq %rax, 424(%r8) - sbbq 432(%rdi), %rdx - movq 440(%r8), %rax - movq %rdx, 432(%r8) - sbbq 440(%rdi), %rax - movq 448(%r8), %rdx - movq %rax, 440(%r8) - sbbq 448(%rdi), %rdx - movq 456(%r8), %rax - movq %rdx, 448(%r8) - sbbq 456(%rdi), %rax - movq 464(%r8), %rdx - movq %rax, 456(%r8) - sbbq 464(%rdi), %rdx - movq 472(%r8), %rax - movq %rdx, 464(%r8) - sbbq 472(%rdi), %rax - movq 480(%r8), %rdx - movq %rax, 472(%r8) - sbbq 480(%rdi), %rdx - movq 488(%r8), %rax - movq %rdx, 480(%r8) - sbbq 488(%rdi), %rax - movq 496(%r8), %rdx - movq %rax, 488(%r8) - sbbq 496(%rdi), %rdx - movq 504(%r8), %rax - movq %rdx, 496(%r8) - sbbq 504(%rdi), %rax - movq %rax, 504(%r8) + subq $0x200, %rsi + movq -256(%r8), %rax + subq -256(%rsi), %rax + movq -248(%r8), %rdx + movq %rax, -256(%r8) + sbbq -248(%rsi), %rdx + movq -240(%r8), %rax + movq %rdx, -248(%r8) + sbbq -240(%rsi), %rax + movq -232(%r8), %rdx + movq %rax, -240(%r8) + sbbq -232(%rsi), %rdx + movq -224(%r8), %rax + movq %rdx, -232(%r8) + sbbq -224(%rsi), %rax + movq -216(%r8), %rdx + movq %rax, -224(%r8) + sbbq -216(%rsi), %rdx + movq -208(%r8), %rax + movq %rdx, -216(%r8) + sbbq -208(%rsi), %rax + movq -200(%r8), %rdx + movq %rax, -208(%r8) + sbbq -200(%rsi), %rdx + movq -192(%r8), %rax + movq %rdx, -200(%r8) + sbbq -192(%rsi), %rax + movq -184(%r8), %rdx + movq %rax, -192(%r8) + sbbq -184(%rsi), %rdx + movq -176(%r8), %rax + movq %rdx, -184(%r8) + sbbq -176(%rsi), %rax + movq -168(%r8), %rdx + movq %rax, -176(%r8) + sbbq -168(%rsi), %rdx + movq -160(%r8), %rax + movq %rdx, -168(%r8) + sbbq -160(%rsi), %rax + movq -152(%r8), %rdx + movq %rax, -160(%r8) + sbbq -152(%rsi), %rdx + movq -144(%r8), %rax + movq %rdx, -152(%r8) + sbbq -144(%rsi), %rax + movq -136(%r8), %rdx + movq %rax, -144(%r8) + sbbq -136(%rsi), %rdx + movq -128(%r8), %rax + movq %rdx, -136(%r8) + sbbq -128(%rsi), %rax + movq -120(%r8), %rdx + movq %rax, -128(%r8) + sbbq -120(%rsi), %rdx + movq -112(%r8), %rax + movq %rdx, -120(%r8) + sbbq -112(%rsi), %rax + movq -104(%r8), %rdx + movq %rax, -112(%r8) + sbbq -104(%rsi), %rdx + movq -96(%r8), %rax + movq %rdx, -104(%r8) + sbbq -96(%rsi), %rax + movq -88(%r8), %rdx + movq %rax, -96(%r8) + sbbq -88(%rsi), %rdx + movq -80(%r8), %rax + movq %rdx, -88(%r8) + sbbq -80(%rsi), %rax + movq -72(%r8), %rdx + movq %rax, -80(%r8) + sbbq -72(%rsi), %rdx + movq -64(%r8), %rax + movq %rdx, -72(%r8) + sbbq -64(%rsi), %rax + movq -56(%r8), %rdx + movq %rax, -64(%r8) + sbbq -56(%rsi), %rdx + movq -48(%r8), %rax + movq %rdx, -56(%r8) + sbbq -48(%rsi), %rax + movq -40(%r8), %rdx + movq %rax, -48(%r8) + sbbq -40(%rsi), %rdx + movq -32(%r8), 
%rax + movq %rdx, -40(%r8) + sbbq -32(%rsi), %rax + movq -24(%r8), %rdx + movq %rax, -32(%r8) + sbbq -24(%rsi), %rdx + movq -16(%r8), %rax + movq %rdx, -24(%r8) + sbbq -16(%rsi), %rax + movq -8(%r8), %rdx + movq %rax, -16(%r8) + sbbq -8(%rsi), %rdx + movq (%r8), %rax + movq %rdx, -8(%r8) + sbbq (%rsi), %rax + movq 8(%r8), %rdx + movq %rax, (%r8) + sbbq 8(%rsi), %rdx + movq 16(%r8), %rax + movq %rdx, 8(%r8) + sbbq 16(%rsi), %rax + movq 24(%r8), %rdx + movq %rax, 16(%r8) + sbbq 24(%rsi), %rdx + movq 32(%r8), %rax + movq %rdx, 24(%r8) + sbbq 32(%rsi), %rax + movq 40(%r8), %rdx + movq %rax, 32(%r8) + sbbq 40(%rsi), %rdx + movq 48(%r8), %rax + movq %rdx, 40(%r8) + sbbq 48(%rsi), %rax + movq 56(%r8), %rdx + movq %rax, 48(%r8) + sbbq 56(%rsi), %rdx + movq 64(%r8), %rax + movq %rdx, 56(%r8) + sbbq 64(%rsi), %rax + movq 72(%r8), %rdx + movq %rax, 64(%r8) + sbbq 72(%rsi), %rdx + movq 80(%r8), %rax + movq %rdx, 72(%r8) + sbbq 80(%rsi), %rax + movq 88(%r8), %rdx + movq %rax, 80(%r8) + sbbq 88(%rsi), %rdx + movq 96(%r8), %rax + movq %rdx, 88(%r8) + sbbq 96(%rsi), %rax + movq 104(%r8), %rdx + movq %rax, 96(%r8) + sbbq 104(%rsi), %rdx + movq 112(%r8), %rax + movq %rdx, 104(%r8) + sbbq 112(%rsi), %rax + movq 120(%r8), %rdx + movq %rax, 112(%r8) + sbbq 120(%rsi), %rdx + movq 128(%r8), %rax + movq %rdx, 120(%r8) + sbbq 128(%rsi), %rax + movq 136(%r8), %rdx + movq %rax, 128(%r8) + sbbq 136(%rsi), %rdx + movq 144(%r8), %rax + movq %rdx, 136(%r8) + sbbq 144(%rsi), %rax + movq 152(%r8), %rdx + movq %rax, 144(%r8) + sbbq 152(%rsi), %rdx + movq 160(%r8), %rax + movq %rdx, 152(%r8) + sbbq 160(%rsi), %rax + movq 168(%r8), %rdx + movq %rax, 160(%r8) + sbbq 168(%rsi), %rdx + movq 176(%r8), %rax + movq %rdx, 168(%r8) + sbbq 176(%rsi), %rax + movq 184(%r8), %rdx + movq %rax, 176(%r8) + sbbq 184(%rsi), %rdx + movq 192(%r8), %rax + movq %rdx, 184(%r8) + sbbq 192(%rsi), %rax + movq 200(%r8), %rdx + movq %rax, 192(%r8) + sbbq 200(%rsi), %rdx + movq 208(%r8), %rax + movq %rdx, 200(%r8) + sbbq 208(%rsi), %rax + movq 216(%r8), %rdx + movq %rax, 208(%r8) + sbbq 216(%rsi), %rdx + movq 224(%r8), %rax + movq %rdx, 216(%r8) + sbbq 224(%rsi), %rax + movq 232(%r8), %rdx + movq %rax, 224(%r8) + sbbq 232(%rsi), %rdx + movq 240(%r8), %rax + movq %rdx, 232(%r8) + sbbq 240(%rsi), %rax + movq 248(%r8), %rdx + movq %rax, 240(%r8) + sbbq 248(%rsi), %rdx + movq %rdx, 248(%r8) sbbq $0x00, %rcx - # Add in place - movq 256(%rdi), %rdx - addq (%r8), %rdx - movq 264(%rdi), %rax - movq %rdx, 256(%rdi) - adcq 8(%r8), %rax - movq 272(%rdi), %rdx - movq %rax, 264(%rdi) - adcq 16(%r8), %rdx - movq 280(%rdi), %rax - movq %rdx, 272(%rdi) - adcq 24(%r8), %rax - movq 288(%rdi), %rdx - movq %rax, 280(%rdi) - adcq 32(%r8), %rdx - movq 296(%rdi), %rax - movq %rdx, 288(%rdi) - adcq 40(%r8), %rax - movq 304(%rdi), %rdx - movq %rax, 296(%rdi) - adcq 48(%r8), %rdx - movq 312(%rdi), %rax - movq %rdx, 304(%rdi) - adcq 56(%r8), %rax - movq 320(%rdi), %rdx - movq %rax, 312(%rdi) - adcq 64(%r8), %rdx - movq 328(%rdi), %rax - movq %rdx, 320(%rdi) - adcq 72(%r8), %rax - movq 336(%rdi), %rdx - movq %rax, 328(%rdi) - adcq 80(%r8), %rdx - movq 344(%rdi), %rax - movq %rdx, 336(%rdi) - adcq 88(%r8), %rax - movq 352(%rdi), %rdx - movq %rax, 344(%rdi) - adcq 96(%r8), %rdx - movq 360(%rdi), %rax - movq %rdx, 352(%rdi) - adcq 104(%r8), %rax - movq 368(%rdi), %rdx - movq %rax, 360(%rdi) - adcq 112(%r8), %rdx - movq 376(%rdi), %rax - movq %rdx, 368(%rdi) - adcq 120(%r8), %rax - movq 384(%rdi), %rdx - movq %rax, 376(%rdi) - adcq 128(%r8), %rdx - movq 392(%rdi), %rax - movq %rdx, 
384(%rdi) - adcq 136(%r8), %rax - movq 400(%rdi), %rdx - movq %rax, 392(%rdi) - adcq 144(%r8), %rdx - movq 408(%rdi), %rax - movq %rdx, 400(%rdi) - adcq 152(%r8), %rax - movq 416(%rdi), %rdx - movq %rax, 408(%rdi) - adcq 160(%r8), %rdx - movq 424(%rdi), %rax - movq %rdx, 416(%rdi) - adcq 168(%r8), %rax - movq 432(%rdi), %rdx - movq %rax, 424(%rdi) - adcq 176(%r8), %rdx - movq 440(%rdi), %rax - movq %rdx, 432(%rdi) - adcq 184(%r8), %rax - movq 448(%rdi), %rdx - movq %rax, 440(%rdi) - adcq 192(%r8), %rdx - movq 456(%rdi), %rax - movq %rdx, 448(%rdi) - adcq 200(%r8), %rax - movq 464(%rdi), %rdx - movq %rax, 456(%rdi) - adcq 208(%r8), %rdx - movq 472(%rdi), %rax - movq %rdx, 464(%rdi) - adcq 216(%r8), %rax - movq 480(%rdi), %rdx - movq %rax, 472(%rdi) - adcq 224(%r8), %rdx - movq 488(%rdi), %rax - movq %rdx, 480(%rdi) - adcq 232(%r8), %rax - movq 496(%rdi), %rdx - movq %rax, 488(%rdi) - adcq 240(%r8), %rdx - movq 504(%rdi), %rax - movq %rdx, 496(%rdi) - adcq 248(%r8), %rax - movq 512(%rdi), %rdx - movq %rax, 504(%rdi) - adcq 256(%r8), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 264(%r8), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 272(%r8), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 280(%r8), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - adcq 288(%r8), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 296(%r8), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 304(%r8), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 312(%r8), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 320(%r8), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 328(%r8), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 336(%r8), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 344(%r8), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 352(%r8), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 360(%r8), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 368(%r8), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 376(%r8), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 384(%r8), %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 392(%r8), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 400(%r8), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 408(%r8), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 416(%r8), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 424(%r8), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 432(%r8), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 440(%r8), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 448(%r8), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 456(%r8), %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 464(%r8), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 472(%r8), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 480(%r8), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 488(%r8), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 496(%r8), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 504(%r8), %rax - movq %rax, 760(%rdi) - adcq $0x00, %rcx - movq %rcx, 768(%rdi) - # Add in place - movq 512(%rdi), %rdx - xorq %rcx, %rcx - addq (%rsi), %rdx - movq 520(%rdi), %rax - movq %rdx, 512(%rdi) - adcq 8(%rsi), %rax - movq 528(%rdi), %rdx - movq %rax, 520(%rdi) - adcq 16(%rsi), %rdx - movq 536(%rdi), %rax - movq %rdx, 528(%rdi) - adcq 24(%rsi), %rax - movq 544(%rdi), %rdx - movq %rax, 536(%rdi) - 
adcq 32(%rsi), %rdx - movq 552(%rdi), %rax - movq %rdx, 544(%rdi) - adcq 40(%rsi), %rax - movq 560(%rdi), %rdx - movq %rax, 552(%rdi) - adcq 48(%rsi), %rdx - movq 568(%rdi), %rax - movq %rdx, 560(%rdi) - adcq 56(%rsi), %rax - movq 576(%rdi), %rdx - movq %rax, 568(%rdi) - adcq 64(%rsi), %rdx - movq 584(%rdi), %rax - movq %rdx, 576(%rdi) - adcq 72(%rsi), %rax - movq 592(%rdi), %rdx - movq %rax, 584(%rdi) - adcq 80(%rsi), %rdx - movq 600(%rdi), %rax - movq %rdx, 592(%rdi) - adcq 88(%rsi), %rax - movq 608(%rdi), %rdx - movq %rax, 600(%rdi) - adcq 96(%rsi), %rdx - movq 616(%rdi), %rax - movq %rdx, 608(%rdi) - adcq 104(%rsi), %rax - movq 624(%rdi), %rdx - movq %rax, 616(%rdi) - adcq 112(%rsi), %rdx - movq 632(%rdi), %rax - movq %rdx, 624(%rdi) - adcq 120(%rsi), %rax - movq 640(%rdi), %rdx - movq %rax, 632(%rdi) - adcq 128(%rsi), %rdx - movq 648(%rdi), %rax - movq %rdx, 640(%rdi) - adcq 136(%rsi), %rax - movq 656(%rdi), %rdx - movq %rax, 648(%rdi) - adcq 144(%rsi), %rdx - movq 664(%rdi), %rax - movq %rdx, 656(%rdi) - adcq 152(%rsi), %rax - movq 672(%rdi), %rdx - movq %rax, 664(%rdi) - adcq 160(%rsi), %rdx - movq 680(%rdi), %rax - movq %rdx, 672(%rdi) - adcq 168(%rsi), %rax - movq 688(%rdi), %rdx - movq %rax, 680(%rdi) - adcq 176(%rsi), %rdx - movq 696(%rdi), %rax - movq %rdx, 688(%rdi) - adcq 184(%rsi), %rax - movq 704(%rdi), %rdx - movq %rax, 696(%rdi) - adcq 192(%rsi), %rdx - movq 712(%rdi), %rax - movq %rdx, 704(%rdi) - adcq 200(%rsi), %rax - movq 720(%rdi), %rdx - movq %rax, 712(%rdi) - adcq 208(%rsi), %rdx - movq 728(%rdi), %rax - movq %rdx, 720(%rdi) - adcq 216(%rsi), %rax - movq 736(%rdi), %rdx - movq %rax, 728(%rdi) - adcq 224(%rsi), %rdx - movq 744(%rdi), %rax - movq %rdx, 736(%rdi) - adcq 232(%rsi), %rax - movq 752(%rdi), %rdx - movq %rax, 744(%rdi) - adcq 240(%rsi), %rdx - movq 760(%rdi), %rax - movq %rdx, 752(%rdi) - adcq 248(%rsi), %rax - movq 768(%rdi), %rdx - movq %rax, 760(%rdi) - adcq 256(%rsi), %rdx - movq %rdx, 768(%rdi) - adcq $0x00, %rcx - # Add to zero - movq 264(%rsi), %rdx + movq 512(%rsp), %rdi + negq %rcx + addq $0x200, %rdi + movq -256(%rdi), %rax + subq -256(%r8), %rax + movq -248(%rdi), %rdx + movq %rax, -256(%rdi) + sbbq -248(%r8), %rdx + movq -240(%rdi), %rax + movq %rdx, -248(%rdi) + sbbq -240(%r8), %rax + movq -232(%rdi), %rdx + movq %rax, -240(%rdi) + sbbq -232(%r8), %rdx + movq -224(%rdi), %rax + movq %rdx, -232(%rdi) + sbbq -224(%r8), %rax + movq -216(%rdi), %rdx + movq %rax, -224(%rdi) + sbbq -216(%r8), %rdx + movq -208(%rdi), %rax + movq %rdx, -216(%rdi) + sbbq -208(%r8), %rax + movq -200(%rdi), %rdx + movq %rax, -208(%rdi) + sbbq -200(%r8), %rdx + movq -192(%rdi), %rax + movq %rdx, -200(%rdi) + sbbq -192(%r8), %rax + movq -184(%rdi), %rdx + movq %rax, -192(%rdi) + sbbq -184(%r8), %rdx + movq -176(%rdi), %rax + movq %rdx, -184(%rdi) + sbbq -176(%r8), %rax + movq -168(%rdi), %rdx + movq %rax, -176(%rdi) + sbbq -168(%r8), %rdx + movq -160(%rdi), %rax + movq %rdx, -168(%rdi) + sbbq -160(%r8), %rax + movq -152(%rdi), %rdx + movq %rax, -160(%rdi) + sbbq -152(%r8), %rdx + movq -144(%rdi), %rax + movq %rdx, -152(%rdi) + sbbq -144(%r8), %rax + movq -136(%rdi), %rdx + movq %rax, -144(%rdi) + sbbq -136(%r8), %rdx + movq -128(%rdi), %rax + movq %rdx, -136(%rdi) + sbbq -128(%r8), %rax + movq -120(%rdi), %rdx + movq %rax, -128(%rdi) + sbbq -120(%r8), %rdx + movq -112(%rdi), %rax + movq %rdx, -120(%rdi) + sbbq -112(%r8), %rax + movq -104(%rdi), %rdx + movq %rax, -112(%rdi) + sbbq -104(%r8), %rdx + movq -96(%rdi), %rax + movq %rdx, -104(%rdi) + sbbq -96(%r8), %rax + movq 
-88(%rdi), %rdx + movq %rax, -96(%rdi) + sbbq -88(%r8), %rdx + movq -80(%rdi), %rax + movq %rdx, -88(%rdi) + sbbq -80(%r8), %rax + movq -72(%rdi), %rdx + movq %rax, -80(%rdi) + sbbq -72(%r8), %rdx + movq -64(%rdi), %rax + movq %rdx, -72(%rdi) + sbbq -64(%r8), %rax + movq -56(%rdi), %rdx + movq %rax, -64(%rdi) + sbbq -56(%r8), %rdx + movq -48(%rdi), %rax + movq %rdx, -56(%rdi) + sbbq -48(%r8), %rax + movq -40(%rdi), %rdx + movq %rax, -48(%rdi) + sbbq -40(%r8), %rdx + movq -32(%rdi), %rax + movq %rdx, -40(%rdi) + sbbq -32(%r8), %rax + movq -24(%rdi), %rdx + movq %rax, -32(%rdi) + sbbq -24(%r8), %rdx + movq -16(%rdi), %rax + movq %rdx, -24(%rdi) + sbbq -16(%r8), %rax + movq -8(%rdi), %rdx + movq %rax, -16(%rdi) + sbbq -8(%r8), %rdx + movq (%rdi), %rax + movq %rdx, -8(%rdi) + sbbq (%r8), %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) + sbbq 8(%r8), %rdx + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) + sbbq 16(%r8), %rax + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) + sbbq 24(%r8), %rdx + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) + sbbq 32(%r8), %rax + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) + sbbq 40(%r8), %rdx + movq 48(%rdi), %rax + movq %rdx, 40(%rdi) + sbbq 48(%r8), %rax + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) + sbbq 56(%r8), %rdx + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) + sbbq 64(%r8), %rax + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) + sbbq 72(%r8), %rdx + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) + sbbq 80(%r8), %rax + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) + sbbq 88(%r8), %rdx + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) + sbbq 96(%r8), %rax + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) + sbbq 104(%r8), %rdx + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) + sbbq 112(%r8), %rax + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) + sbbq 120(%r8), %rdx + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) + sbbq 128(%r8), %rax + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) + sbbq 136(%r8), %rdx + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) + sbbq 144(%r8), %rax + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) + sbbq 152(%r8), %rdx + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) + sbbq 160(%r8), %rax + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) + sbbq 168(%r8), %rdx + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) + sbbq 176(%r8), %rax + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) + sbbq 184(%r8), %rdx + movq 192(%rdi), %rax + movq %rdx, 184(%rdi) + sbbq 192(%r8), %rax + movq 200(%rdi), %rdx + movq %rax, 192(%rdi) + sbbq 200(%r8), %rdx + movq 208(%rdi), %rax + movq %rdx, 200(%rdi) + sbbq 208(%r8), %rax + movq 216(%rdi), %rdx + movq %rax, 208(%rdi) + sbbq 216(%r8), %rdx + movq 224(%rdi), %rax + movq %rdx, 216(%rdi) + sbbq 224(%r8), %rax + movq 232(%rdi), %rdx + movq %rax, 224(%rdi) + sbbq 232(%r8), %rdx + movq 240(%rdi), %rax + movq %rdx, 232(%rdi) + sbbq 240(%r8), %rax + movq 248(%rdi), %rdx + movq %rax, 240(%rdi) + sbbq 248(%r8), %rdx + movq %rdx, 248(%rdi) + sbbq $0x00, %rcx + movq 512(%rsp), %rdi + addq $0x300, %rdi + # Add in word + movq (%rdi), %rax + addq %rcx, %rax + movq 8(%rdi), %rdx + movq %rax, (%rdi) adcq $0x00, %rdx - movq 272(%rsi), %rax - movq %rdx, 776(%rdi) + movq 16(%rdi), %rax + movq %rdx, 8(%rdi) adcq $0x00, %rax - movq 280(%rsi), %rdx - movq %rax, 784(%rdi) + movq 24(%rdi), %rdx + movq %rax, 16(%rdi) adcq $0x00, %rdx - movq 288(%rsi), %rax - movq %rdx, 792(%rdi) + movq 32(%rdi), %rax + movq %rdx, 24(%rdi) adcq $0x00, %rax - movq 296(%rsi), %rdx - movq %rax, 800(%rdi) + movq 40(%rdi), %rdx + movq %rax, 32(%rdi) adcq $0x00, %rdx - movq 304(%rsi), %rax - movq %rdx, 808(%rdi) + movq 48(%rdi), 
%rax + movq %rdx, 40(%rdi) adcq $0x00, %rax - movq 312(%rsi), %rdx - movq %rax, 816(%rdi) + movq 56(%rdi), %rdx + movq %rax, 48(%rdi) adcq $0x00, %rdx - movq 320(%rsi), %rax - movq %rdx, 824(%rdi) + movq 64(%rdi), %rax + movq %rdx, 56(%rdi) adcq $0x00, %rax - movq 328(%rsi), %rdx - movq %rax, 832(%rdi) + movq 72(%rdi), %rdx + movq %rax, 64(%rdi) adcq $0x00, %rdx - movq 336(%rsi), %rax - movq %rdx, 840(%rdi) + movq 80(%rdi), %rax + movq %rdx, 72(%rdi) adcq $0x00, %rax - movq 344(%rsi), %rdx - movq %rax, 848(%rdi) + movq 88(%rdi), %rdx + movq %rax, 80(%rdi) adcq $0x00, %rdx - movq 352(%rsi), %rax - movq %rdx, 856(%rdi) + movq 96(%rdi), %rax + movq %rdx, 88(%rdi) adcq $0x00, %rax - movq 360(%rsi), %rdx - movq %rax, 864(%rdi) + movq 104(%rdi), %rdx + movq %rax, 96(%rdi) adcq $0x00, %rdx - movq 368(%rsi), %rax - movq %rdx, 872(%rdi) + movq 112(%rdi), %rax + movq %rdx, 104(%rdi) adcq $0x00, %rax - movq 376(%rsi), %rdx - movq %rax, 880(%rdi) + movq 120(%rdi), %rdx + movq %rax, 112(%rdi) adcq $0x00, %rdx - movq 384(%rsi), %rax - movq %rdx, 888(%rdi) + movq 128(%rdi), %rax + movq %rdx, 120(%rdi) adcq $0x00, %rax - movq 392(%rsi), %rdx - movq %rax, 896(%rdi) + movq 136(%rdi), %rdx + movq %rax, 128(%rdi) adcq $0x00, %rdx - movq 400(%rsi), %rax - movq %rdx, 904(%rdi) + movq 144(%rdi), %rax + movq %rdx, 136(%rdi) adcq $0x00, %rax - movq 408(%rsi), %rdx - movq %rax, 912(%rdi) + movq 152(%rdi), %rdx + movq %rax, 144(%rdi) adcq $0x00, %rdx - movq 416(%rsi), %rax - movq %rdx, 920(%rdi) + movq 160(%rdi), %rax + movq %rdx, 152(%rdi) adcq $0x00, %rax - movq 424(%rsi), %rdx - movq %rax, 928(%rdi) + movq 168(%rdi), %rdx + movq %rax, 160(%rdi) adcq $0x00, %rdx - movq 432(%rsi), %rax - movq %rdx, 936(%rdi) + movq 176(%rdi), %rax + movq %rdx, 168(%rdi) adcq $0x00, %rax - movq 440(%rsi), %rdx - movq %rax, 944(%rdi) + movq 184(%rdi), %rdx + movq %rax, 176(%rdi) adcq $0x00, %rdx - movq 448(%rsi), %rax - movq %rdx, 952(%rdi) + movq 192(%rdi), %rax + movq %rdx, 184(%rdi) adcq $0x00, %rax - movq 456(%rsi), %rdx - movq %rax, 960(%rdi) + movq 200(%rdi), %rdx + movq %rax, 192(%rdi) adcq $0x00, %rdx - movq 464(%rsi), %rax - movq %rdx, 968(%rdi) + movq 208(%rdi), %rax + movq %rdx, 200(%rdi) adcq $0x00, %rax - movq 472(%rsi), %rdx - movq %rax, 976(%rdi) + movq 216(%rdi), %rdx + movq %rax, 208(%rdi) adcq $0x00, %rdx - movq 480(%rsi), %rax - movq %rdx, 984(%rdi) + movq 224(%rdi), %rax + movq %rdx, 216(%rdi) adcq $0x00, %rax - movq 488(%rsi), %rdx - movq %rax, 992(%rdi) + movq 232(%rdi), %rdx + movq %rax, 224(%rdi) adcq $0x00, %rdx - movq 496(%rsi), %rax - movq %rdx, 1000(%rdi) + movq 240(%rdi), %rax + movq %rdx, 232(%rdi) adcq $0x00, %rax - movq 504(%rsi), %rdx - movq %rax, 1008(%rdi) + movq 248(%rdi), %rdx + movq %rax, 240(%rdi) adcq $0x00, %rdx - movq %rdx, 1016(%rdi) - addq $0x518, %rsp + movq %rdx, 248(%rdi) + movq 520(%rsp), %rsi + movq 512(%rsp), %rdi + addq $0x210, %rsp repz retq #ifndef __APPLE__ .size sp_4096_sqr_avx2_64,.-sp_4096_sqr_avx2_64 @@ -34558,7 +47531,6 @@ sp_4096_cond_sub_64: _sp_4096_cond_sub_64: #endif /* __APPLE__ */ subq $0x200, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -35007,7 +47979,7 @@ _sp_4096_cond_sub_64: sbbq %rdx, %r9 movq %r8, 496(%rdi) movq %r9, 504(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x200, %rsp repz retq #ifndef __APPLE__ @@ -35730,7 +48702,6 @@ sp_4096_sub_64: _sp_4096_sub_64: #endif /* __APPLE__ */ movq (%rsi), %rcx - xorq %rax, %rax subq (%rdx), %rcx movq 8(%rsi), %r8 movq %rcx, (%rdi) @@ -35922,7 +48893,7 @@ _sp_4096_sub_64: movq %rcx, 
496(%rdi) sbbq 504(%rdx), %r8 movq %r8, 504(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_4096_sub_64,.-sp_4096_sub_64 @@ -36387,7 +49358,6 @@ sp_4096_cond_sub_avx2_64: .p2align 4 _sp_4096_cond_sub_avx2_64: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -36708,7 +49678,7 @@ _sp_4096_cond_sub_avx2_64: movq %r10, 496(%rdi) sbbq %r9, %r8 movq %r8, 504(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_4096_cond_sub_avx2_64,.-sp_4096_cond_sub_avx2_64 @@ -37254,6 +50224,2413 @@ _sp_4096_cmp_64: #ifndef __APPLE__ .size sp_4096_cmp_64,.-sp_4096_cmp_64 #endif /* __APPLE__ */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_4096_get_from_table_64 +.type sp_4096_get_from_table_64,@function +.align 16 +sp_4096_get_from_table_64: +#else +.section __TEXT,__text +.globl _sp_4096_get_from_table_64 +.p2align 4 +_sp_4096_get_from_table_64: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + pxor %xmm13, %xmm13 + pshufd $0x00, %xmm11, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + # START: 0-7 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + 
movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + 
movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 0-7 + # START: 8-15 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, 
%xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x40, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 8-15 + # START: 16-23 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + 
pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + 
movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 16-23 + # START: 24-31 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 
24(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu 
(%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0xc0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 24-31 + # START: 32-39 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, 
%xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx 
+ addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x100, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 32-39 + # START: 40-47 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, 
%xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x140, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, 
%xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 40-47 + # START: 48-55 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + 
movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x180, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + addq $0x40, %rdi + # END: 48-55 + # START: 56-63 + pxor %xmm13, %xmm13 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 
1 + movq 8(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, 
%xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x1c0, %rcx + movdqu %xmm13, %xmm12 + pcmpeqd %xmm10, %xmm12 + movdqu (%rcx), %xmm0 + movdqu 16(%rcx), %xmm1 + movdqu 32(%rcx), %xmm2 + movdqu 48(%rcx), %xmm3 + pand %xmm12, %xmm0 + pand %xmm12, %xmm1 + pand %xmm12, %xmm2 + pand %xmm12, %xmm3 + por %xmm0, %xmm4 + por %xmm1, %xmm5 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + paddd %xmm11, %xmm13 + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + # END: 56-63 + repz retq +#ifndef __APPLE__ +.size sp_4096_get_from_table_64,.-sp_4096_get_from_table_64 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ #ifdef HAVE_INTEL_AVX2 /* Reduce the number back to 4096 bits using Montgomery reduction. 
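The two routines added in this hunk, sp_4096_get_from_table_64 above and sp_4096_get_from_table_avx2_64 below, fetch one precomputed 4096-bit value out of a 16-entry pointer table without letting the data-cache access pattern depend on the secret index: a running SIMD counter is compared against the index passed in %rdx (pcmpeqd / vpcmpeqd) to form an all-ones or all-zeros mask, every entry is read in full, masked, and OR-ed into the accumulator registers, and only the selected strip survives to the store. Below is a minimal C sketch of that selection pattern; the function name, prototype, and the SP_TABLE_SIZE / SP_4096_DIGITS constants are illustrative assumptions, not the generated wolfSSL code.

    #include <stdint.h>

    #define SP_4096_DIGITS 64   /* 4096 bits as 64 x 64-bit digits */
    #define SP_TABLE_SIZE  16   /* entries scanned per pass in the assembly */

    /* Constant-time selection sketch: read every table entry and keep only
     * the one whose position matches idx, using a branch-free mask. */
    void get_from_table_4096_ct(uint64_t* r,
                                const uint64_t* const table[SP_TABLE_SIZE],
                                uint64_t idx)
    {
        int i, j;

        for (j = 0; j < SP_4096_DIGITS; j++)
            r[j] = 0;

        for (i = 0; i < SP_TABLE_SIZE; i++) {
            /* diff | -diff has its top bit set exactly when diff != 0, so
             * mask is all-ones only for the wanted entry (mirrors the
             * pcmpeqd/vpcmpeqd result in the assembly). */
            uint64_t diff = (uint64_t)i ^ idx;
            uint64_t mask = ((diff | ((uint64_t)0 - diff)) >> 63) - (uint64_t)1;

            for (j = 0; j < SP_4096_DIGITS; j++)
                r[j] |= table[i][j] & mask;
        }
    }

The generated assembly keeps the working set in vector registers by splitting the 64-digit result into 64-byte (SSE2) or 128-byte (AVX2) strips and re-scanning all 16 entries for each strip, which is why the START/ENTRY pattern repeats eight times in the plain version and four times in the AVX2 version; both variants are only built when WC_NO_CACHE_RESISTANT is not defined, presumably for use by the windowed 4096-bit Montgomery exponentiation.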
* @@ -38019,6 +53396,1149 @@ L_4096_mont_reduce_avx2_64_loop: .size sp_4096_mont_reduce_avx2_64,.-sp_4096_mont_reduce_avx2_64 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ +#ifndef WC_NO_CACHE_RESISTANT +#ifndef __APPLE__ +.text +.globl sp_4096_get_from_table_avx2_64 +.type sp_4096_get_from_table_avx2_64,@function +.align 16 +sp_4096_get_from_table_avx2_64: +#else +.section __TEXT,__text +.globl _sp_4096_get_from_table_avx2_64 +.p2align 4 +_sp_4096_get_from_table_avx2_64: +#endif /* __APPLE__ */ + movq $0x01, %rax + movd %rdx, %xmm10 + movd %rax, %xmm11 + vpxor %ymm13, %ymm13, %ymm13 + vpermd %ymm10, %ymm13, %ymm10 + vpermd %ymm11, %ymm13, %ymm11 + # START: 0-15 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + 
vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, 
%ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 0-15 + # START: 16-31 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), 
%rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 
14 + movq 112(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x80, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 16-31 + # START: 32-47 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + 
vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + 
vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x100, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + addq $0x80, %rdi + # END: 32-47 + # START: 48-63 + vpxor %ymm13, %ymm13, %ymm13 + vpxor %ymm4, %ymm4, %ymm4 + vpxor %ymm5, %ymm5, %ymm5 + vpxor %ymm6, %ymm6, %ymm6 + vpxor %ymm7, %ymm7, %ymm7 + # ENTRY: 0 + movq (%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 1 + movq 8(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 2 + movq 16(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 3 + movq 24(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 4 + movq 32(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, 
%ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 5 + movq 40(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 6 + movq 48(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 7 + movq 56(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 8 + movq 64(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 9 + movq 72(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 10 + movq 80(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 11 + movq 88(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 12 + movq 96(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, 
%ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 13 + movq 104(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 14 + movq 112(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + # ENTRY: 15 + movq 120(%rsi), %rcx + addq $0x180, %rcx + vpcmpeqd %ymm10, %ymm13, %ymm12 + vmovdqu (%rcx), %ymm0 + vmovdqu 32(%rcx), %ymm1 + vmovdqu 64(%rcx), %ymm2 + vmovdqu 96(%rcx), %ymm3 + vpand %ymm12, %ymm0, %ymm0 + vpand %ymm12, %ymm1, %ymm1 + vpand %ymm12, %ymm2, %ymm2 + vpand %ymm12, %ymm3, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm6, %ymm6 + vpor %ymm3, %ymm7, %ymm7 + vpaddd %ymm11, %ymm13, %ymm13 + vmovdqu %ymm4, (%rdi) + vmovdqu %ymm5, 32(%rdi) + vmovdqu %ymm6, 64(%rdi) + vmovdqu %ymm7, 96(%rdi) + # END: 48-63 + repz retq +#ifndef __APPLE__ +.size sp_4096_get_from_table_avx2_64,.-sp_4096_get_from_table_avx2_64 +#endif /* __APPLE__ */ +#endif /* !WC_NO_CACHE_RESISTANT */ /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. 
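
Editor's note: the sp_4096_get_from_table_avx2_64 routine above is the vectorised form of a constant-time table select. Every entry is loaded, AND-ed with a mask obtained by comparing a running counter against the requested index, and OR-ed into the accumulator, so the memory access pattern does not depend on the secret index. A minimal scalar sketch of the same pattern (illustrative only; the sp_digit limb type, the function name and the entry/word counts are assumptions, not taken from the generated code):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t sp_digit;              /* assumed 64-bit limb type */

/* Constant-time table lookup: r = table[idx], but every entry is read
 * and masked, so the access pattern is independent of idx. */
static void get_from_table_ct(sp_digit* r, const sp_digit* const* table,
                              size_t entries, size_t words, size_t idx)
{
    size_t i, j;

    for (j = 0; j < words; j++)
        r[j] = 0;
    for (i = 0; i < entries; i++) {
        /* mask = all ones when i == idx, all zeros otherwise
         * (the >> 63 relies on the 64-bit limb assumption above) */
        sp_digit d = (sp_digit)(i ^ idx);
        sp_digit mask = (sp_digit)(((d | (sp_digit)(0 - d)) >> 63) - 1);
        for (j = 0; j < words; j++)
            r[j] |= table[i][j] & mask;
    }
}

The AVX2 routine is this select unrolled, with vpcmpeqd producing the per-entry mask and vpand/vpor applying it 32 bytes at a time.
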
* @@ -38839,89 +55359,84 @@ sp_256_mul_avx2_4: .p2align 4 _sp_256_mul_avx2_4: #endif /* __APPLE__ */ - pushq %rbx + pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %rbp + pushq %rbx movq %rdx, %rbp - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 + movq (%rsi), %rdx + # A[0] * B[0] + mulxq (%rbp), %r8, %r9 + xorq %rbx, %rbx + # A[0] * B[1] + mulxq 8(%rbp), %rax, %r10 adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 + # A[0] * B[2] + mulxq 16(%rbp), %rax, %r11 adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx + # A[0] * B[3] + mulxq 24(%rbp), %rax, %r12 + adcxq %rax, %r11 + movq 8(%rsi), %rdx + adcxq %rbx, %r12 + # A[1] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r9 + # A[1] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r10 + adcxq %rax, %r10 + # A[1] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r11 + adcxq %rax, %r11 + # A[1] * B[3] + mulxq 24(%rbp), %rax, %r13 + adoxq %rcx, %r12 adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] + adoxq %rbx, %r13 + movq 16(%rsi), %rdx + adcxq %rbx, %r13 + # A[2] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r10 + # A[2] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r11 + adcxq %rax, %r11 + # A[2] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r12 + adcxq %rax, %r12 + # A[2] * B[3] + mulxq 24(%rbp), %rax, %r14 + adoxq %rcx, %r13 + adcxq %rax, %r13 + adoxq %rbx, %r14 movq 24(%rsi), %rdx + adcxq %rbx, %r14 + # A[3] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r11 + # A[3] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r12 adcxq %rax, %r12 - mulxq (%rbp), %rbx, %rax - adoxq %rbx, %r11 - adoxq %rax, %r12 - # A[3] * B[2] - mulxq 16(%rbp), %rdx, %rax - adcxq %rdx, %r13 - # A[2] * B[3] - movq 24(%rbp), %rdx + # A[3] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r13 + adcxq %rax, %r13 + # A[3] * B[3] + mulxq 24(%rbp), %rax, %r15 + adoxq %rcx, %r14 adcxq %rax, %r14 - mulxq 16(%rsi), %rax, %rdx - adcxq %rcx, %r15 - adoxq %rax, %r13 - adoxq %rdx, %r14 - adoxq %rcx, %r15 + adoxq %rbx, %r15 + adcxq %rbx, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) @@ -38930,12 +55445,12 @@ _sp_256_mul_avx2_4: movq %r13, 40(%rdi) movq %r14, 48(%rdi) movq %r15, 56(%rdi) - popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx + popq %rbp repz retq #ifndef __APPLE__ .size sp_256_mul_avx2_4,.-sp_256_mul_avx2_4 @@ -39086,31 +55601,34 @@ _sp_256_sqr_avx2_4: pushq %r14 pushq %r15 pushq %rbx - # A[0] * A[1] + xorq %r8, %r8 movq (%rsi), %rdx - movq 
16(%rsi), %r15 - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %rcx + movq 16(%rsi), %rbx + movq 24(%rsi), %r15 + # A[0] * A[1] + mulxq %rcx, %r9, %r10 + # A[0] * A[2] + mulxq %rbx, %r8, %r11 + adoxq %r8, %r10 # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] + mulxq %r15, %r8, %r12 + movq %rcx, %rdx + adoxq %r8, %r11 + # A[1] * A[2] + mulxq %rbx, %r8, %rax movq %r15, %rdx - mulxq 8(%rsi), %rcx, %rbx - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - xorq %r15, %r15 - adoxq %rcx, %r11 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx + adcxq %r8, %r11 # A[1] * A[3] - movq 8(%rsi), %rdx + mulxq %rcx, %r8, %r13 + movq $0x00, %r15 + adoxq %rax, %r12 + adcxq %r8, %r12 + # A[2] * A[3] + mulxq %rbx, %r8, %r14 adoxq %r15, %r13 - mulxq 24(%rsi), %rax, %r8 - adcxq %rcx, %r10 - adoxq %r15, %r14 - adcxq %rbx, %r11 - adcxq %rax, %r12 adcxq %r8, %r13 + adoxq %r15, %r14 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 @@ -39176,20 +55694,19 @@ sp_256_add_4: .p2align 4 _sp_256_add_4: #endif /* __APPLE__ */ - # Add - movq (%rsi), %rcx xorq %rax, %rax - addq (%rdx), %rcx + movq (%rsi), %rcx movq 8(%rsi), %r8 - movq %rcx, (%rdi) + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + addq (%rdx), %rcx adcq 8(%rdx), %r8 - movq 16(%rsi), %rcx + adcq 16(%rdx), %r9 + adcq 24(%rdx), %r10 + movq %rcx, (%rdi) movq %r8, 8(%rdi) - adcq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - adcq 24(%rdx), %r8 - movq %r8, 24(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) adcq $0x00, %rax repz retq #ifndef __APPLE__ @@ -39226,7 +55743,7 @@ _sp_256_sub_4: movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_256_sub_4,.-sp_256_sub_4 @@ -39398,52 +55915,44 @@ _sp_256_mont_mul_4: # Start Reduction # mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 # - a[0] << 32 << 192 - # + (a[0] * 2) << 192 + # a[0]-a[3] + (a[0] * 2) << 192 movq %r9, %rax - movq %r12, %rdx - addq %r9, %rdx + leaq (%r12,%r9,2), %rdx movq %r10, %rsi - addq %r9, %rdx movq %r11, %r8 + movq %r11, %rcx # a[0]-a[2] << 32 shlq $32, %r9 - shldq $32, %rsi, %r11 + shldq $32, %rsi, %rcx shldq $32, %rax, %r10 # - a[0] << 32 << 192 subq %r9, %rdx # + a[0]-a[2] << 32 << 64 addq %r9, %rsi adcq %r10, %r8 - adcq %r11, %rdx + adcq %rcx, %rdx # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xorq %rcx, %rcx # a += mu << 256 - xorq %r9, %r9 addq %rax, %r13 adcq %rsi, %r14 adcq %r8, %r15 adcq %rdx, %rbx - sbbq $0x00, %r9 + sbbq %r9, %r9 # a += mu << 192 addq %rax, %r12 adcq %rsi, %r13 + movq %rsi, %r10 adcq %r8, %r14 adcq %rdx, %r15 adcq $0x00, %rbx sbbq $0x00, %r9 # mu <<= 32 - movq %rdx, %rcx + shldq $32, %rdx, %rcx shldq $32, %r8, %rdx shldq $32, %rsi, %r8 shldq $32, %rax, %rsi - shrq $32, %rcx shlq $32, %rax - # a += (mu << 32) << 64 - addq %r8, %r12 - adcq %rdx, %r13 - adcq %rcx, %r14 - adcq $0x00, %r15 - adcq $0x00, %rbx - sbbq $0x00, %r9 # a -= (mu << 32) << 192 subq %rax, %r12 sbbq %rsi, %r13 @@ -39451,19 +55960,28 @@ _sp_256_mont_mul_4: sbbq %rdx, %r15 sbbq %rcx, %rbx adcq $0x00, %r9 - movq $0xffffffff, %rax + # a += (mu << 32) << 64 + subq %rax, %r10 + adcq %rsi, %r11 + adcq %r8, %r12 + adcq %rdx, %r13 + adcq %rcx, %r14 + adcq $0x00, %r15 + adcq $0x00, %rbx + sbbq $0x00, %r9 movq $0xffffffff00000001, %rsi + movq %r9, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - andq %r9, %rax + shrq $32, %rax # m[2] = 0 & mask = 0 andq %r9, %rsi subq %r9, %r13 sbbq %rax, %r14 - sbbq $0x00, %r15 - sbbq %rsi, %rbx movq %r13, (%rdi) + 
sbbq $0x00, %r15 movq %r14, 8(%rdi) + sbbq %rsi, %rbx movq %r15, 16(%rdi) movq %rbx, 24(%rdi) popq %rbx @@ -39579,52 +56097,44 @@ _sp_256_mont_sqr_4: # Start Reduction # mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 # - a[0] << 32 << 192 - # + (a[0] * 2) << 192 + # a[0]-a[3] + (a[0] * 2) << 192 movq %r8, %rax - movq %r11, %rdx - addq %r8, %rdx + leaq (%r11,%r8,2), %rdx movq %r9, %rsi - addq %r8, %rdx movq %r10, %rbx + movq %r10, %rcx # a[0]-a[2] << 32 shlq $32, %r8 - shldq $32, %rsi, %r10 + shldq $32, %rsi, %rcx shldq $32, %rax, %r9 # - a[0] << 32 << 192 subq %r8, %rdx # + a[0]-a[2] << 32 << 64 addq %r8, %rsi adcq %r9, %rbx - adcq %r10, %rdx + adcq %rcx, %rdx # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xorq %rcx, %rcx # a += mu << 256 - xorq %r8, %r8 addq %rax, %r12 adcq %rsi, %r13 adcq %rbx, %r14 adcq %rdx, %r15 - sbbq $0x00, %r8 + sbbq %r8, %r8 # a += mu << 192 addq %rax, %r11 adcq %rsi, %r12 + movq %rsi, %r9 adcq %rbx, %r13 adcq %rdx, %r14 adcq $0x00, %r15 sbbq $0x00, %r8 # mu <<= 32 - movq %rdx, %rcx + shldq $32, %rdx, %rcx shldq $32, %rbx, %rdx shldq $32, %rsi, %rbx shldq $32, %rax, %rsi - shrq $32, %rcx shlq $32, %rax - # a += (mu << 32) << 64 - addq %rbx, %r11 - adcq %rdx, %r12 - adcq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - sbbq $0x00, %r8 # a -= (mu << 32) << 192 subq %rax, %r11 sbbq %rsi, %r12 @@ -39632,19 +56142,28 @@ _sp_256_mont_sqr_4: sbbq %rdx, %r14 sbbq %rcx, %r15 adcq $0x00, %r8 - movq $0xffffffff, %rax + # a += (mu << 32) << 64 + subq %rax, %r9 + adcq %rsi, %r10 + adcq %rbx, %r11 + adcq %rdx, %r12 + adcq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi + movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - andq %r8, %rax + shrq $32, %rax # m[2] = 0 & mask = 0 andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 - sbbq $0x00, %r14 - sbbq %rsi, %r15 movq %r12, (%rdi) + sbbq $0x00, %r14 movq %r13, 8(%rdi) + sbbq %rsi, %r15 movq %r14, 16(%rdi) movq %r15, 24(%rdi) popq %rbx @@ -39740,7 +56259,6 @@ _sp_256_cond_sub_4: pushq %r13 pushq %r14 pushq %r15 - movq $0x00, %rax movq (%rdx), %r12 movq 8(%rdx), %r13 movq 16(%rdx), %r14 @@ -39761,7 +56279,7 @@ _sp_256_cond_sub_4: movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax popq %r15 popq %r14 popq %r13 @@ -39787,6 +56305,118 @@ sp_256_mont_reduce_4: .globl _sp_256_mont_reduce_4 .p2align 4 _sp_256_mont_reduce_4: +#endif /* __APPLE__ */ + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rdi), %r8 + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %r11 + movq 32(%rdi), %r12 + movq 40(%rdi), %r13 + movq 48(%rdi), %r14 + movq 56(%rdi), %r15 + # Start Reduction + # mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 + # - a[0] << 32 << 192 + # a[0]-a[3] + (a[0] * 2) << 192 + movq %r8, %rax + leaq (%r11,%r8,2), %rdx + movq %r9, %rbx + movq %r10, %rcx + movq %r10, %rsi + # a[0]-a[2] << 32 + shlq $32, %r8 + shldq $32, %rbx, %rsi + shldq $32, %rax, %r9 + # - a[0] << 32 << 192 + subq %r8, %rdx + # + a[0]-a[2] << 32 << 64 + addq %r8, %rbx + adcq %r9, %rcx + adcq %rsi, %rdx + # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xorq %rsi, %rsi + # a += mu << 256 + addq %rax, %r12 + adcq %rbx, %r13 + adcq %rcx, %r14 + adcq %rdx, %r15 + sbbq %r8, %r8 + # a += mu << 192 + addq %rax, %r11 + adcq %rbx, %r12 + movq %rbx, %r9 + adcq %rcx, %r13 + adcq %rdx, %r14 + adcq $0x00, %r15 + sbbq $0x00, %r8 + # mu <<= 32 + shldq $32, %rdx, %rsi + 
shldq $32, %rcx, %rdx + shldq $32, %rbx, %rcx + shldq $32, %rax, %rbx + shlq $32, %rax + # a -= (mu << 32) << 192 + subq %rax, %r11 + sbbq %rbx, %r12 + sbbq %rcx, %r13 + sbbq %rdx, %r14 + sbbq %rsi, %r15 + adcq $0x00, %r8 + # a += (mu << 32) << 64 + subq %rax, %r9 + adcq %rbx, %r10 + adcq %rcx, %r11 + adcq %rdx, %r12 + adcq %rsi, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + sbbq $0x00, %r8 + movq $0xffffffff00000001, %rbx + movq %r8, %rax + # mask m and sub from result if overflow + # m[0] = -1 & mask = mask + shrq $32, %rax + # m[2] = 0 & mask = 0 + andq %r8, %rbx + subq %r8, %r12 + sbbq %rax, %r13 + movq %r12, (%rdi) + sbbq $0x00, %r14 + movq %r13, 8(%rdi) + sbbq %rbx, %r15 + movq %r14, 16(%rdi) + movq %r15, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + repz retq +#ifndef __APPLE__ +.size sp_256_mont_reduce_4,.-sp_256_mont_reduce_4 +#endif /* __APPLE__ */ +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +#ifndef __APPLE__ +.text +.globl sp_256_mont_reduce_order_4 +.type sp_256_mont_reduce_order_4,@function +.align 16 +sp_256_mont_reduce_order_4: +#else +.section __TEXT,__text +.globl _sp_256_mont_reduce_order_4 +.p2align 4 +_sp_256_mont_reduce_order_4: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -39876,7 +56506,7 @@ L_mont_loop_4: popq %r12 repz retq #ifndef __APPLE__ -.size sp_256_mont_reduce_4,.-sp_256_mont_reduce_4 +.size sp_256_mont_reduce_order_4,.-sp_256_mont_reduce_order_4 #endif /* __APPLE__ */ /* Add two Montgomery form numbers (r = a + b % m). * @@ -39901,14 +56531,13 @@ _sp_256_mont_add_4: movq 8(%rsi), %rcx movq 16(%rsi), %r8 movq 24(%rsi), %r9 - movq $0xffffffff, %r10 - movq $0xffffffff00000001, %r11 addq (%rdx), %rax + movq $0xffffffff, %r10 adcq 8(%rdx), %rcx + movq $0xffffffff00000001, %r11 adcq 16(%rdx), %r8 - movq $0x00, %rsi adcq 24(%rdx), %r9 - sbbq $0x00, %rsi + sbbq %rsi, %rsi andq %rsi, %r10 andq %rsi, %r11 subq %rsi, %rax @@ -39952,14 +56581,13 @@ _sp_256_mont_dbl_4: movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 - movq $0xffffffff, %r9 - movq $0xffffffff00000001, %r10 addq %rdx, %rdx + movq $0xffffffff, %r9 adcq %rax, %rax + movq $0xffffffff00000001, %r10 adcq %rcx, %rcx - movq $0x00, %r11 adcq %r8, %r8 - sbbq $0x00, %r11 + sbbq %r11, %r11 andq %r11, %r9 andq %r11, %r10 subq %r11, %rdx @@ -40003,14 +56631,13 @@ _sp_256_mont_tpl_4: movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 - movq $0xffffffff, %r9 - movq $0xffffffff00000001, %r10 addq %rdx, %rdx + movq $0xffffffff, %r9 adcq %rax, %rax + movq $0xffffffff00000001, %r10 adcq %rcx, %rcx - movq $0x00, %r11 adcq %r8, %r8 - sbbq $0x00, %r11 + sbbq %r11, %r11 andq %r11, %r9 andq %r11, %r10 subq %r11, %rdx @@ -40024,14 +56651,13 @@ _sp_256_mont_tpl_4: sbbq %r9, %rax sbbq $0x00, %rcx sbbq %r10, %r8 - movq $0xffffffff, %r9 - movq $0xffffffff00000001, %r10 addq (%rsi), %rdx + movq $0xffffffff, %r9 adcq 8(%rsi), %rax + movq $0xffffffff00000001, %r10 adcq 16(%rsi), %rcx - movq $0x00, %r11 adcq 24(%rsi), %r8 - sbbq $0x00, %r11 + sbbq %r11, %r11 andq %r11, %r9 andq %r11, %r10 subq %r11, %rdx @@ -40076,14 +56702,13 @@ _sp_256_mont_sub_4: movq 8(%rsi), %rcx movq 16(%rsi), %r8 movq 24(%rsi), %r9 - movq $0xffffffff, %r10 - movq $0xffffffff00000001, %r11 subq (%rdx), %rax + movq $0xffffffff, %r10 sbbq 8(%rdx), %rcx + movq $0xffffffff00000001, %r11 sbbq 16(%rdx), %r8 - movq $0x00, 
%rsi sbbq 24(%rdx), %r9 - sbbq $0x00, %rsi + sbbq %rsi, %rsi andq %rsi, %r10 andq %rsi, %r11 addq %rsi, %rax @@ -40105,6 +56730,52 @@ _sp_256_mont_sub_4: #ifndef __APPLE__ .size sp_256_mont_sub_4,.-sp_256_mont_sub_4 #endif /* __APPLE__ */ +/* Subtract two Montgomery form numbers (r = a - b % m). + * + * b is less than the modulus. + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). + */ +#ifndef __APPLE__ +.text +.globl sp_256_mont_sub_lower_4 +.type sp_256_mont_sub_lower_4,@function +.align 16 +sp_256_mont_sub_lower_4: +#else +.section __TEXT,__text +.globl _sp_256_mont_sub_lower_4 +.p2align 4 +_sp_256_mont_sub_lower_4: +#endif /* __APPLE__ */ + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + subq (%rdx), %rax + movq $0xffffffff, %r10 + sbbq 8(%rdx), %rcx + movq $0xffffffff00000001, %r11 + sbbq 16(%rdx), %r8 + sbbq 24(%rdx), %r9 + sbbq %rsi, %rsi + andq %rsi, %r10 + andq %rsi, %r11 + addq %rsi, %rax + adcq %r10, %rcx + movq %rax, (%rdi) + adcq $0x00, %r8 + movq %rcx, 8(%rdi) + adcq %r11, %r9 + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size sp_256_mont_sub_lower_4,.-sp_256_mont_sub_lower_4 +#endif /* __APPLE__ */ /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -40152,6 +56823,214 @@ _sp_256_div2_4: #ifndef __APPLE__ .size sp_256_div2_4,.-sp_256_div2_4 #endif /* __APPLE__ */ +/* Triple a Montgomery form number (r = a + a + a % m). + * + * a is less than m. + * + * r Result of Tripling. + * a Number to triple in Montgomery form. + * m Modulus (prime). + */ +#ifndef __APPLE__ +.text +.globl sp_256_mont_tpl_lower_4 +.type sp_256_mont_tpl_lower_4,@function +.align 16 +sp_256_mont_tpl_lower_4: +#else +.section __TEXT,__text +.globl _sp_256_mont_tpl_lower_4 +.p2align 4 +_sp_256_mont_tpl_lower_4: +#endif /* __APPLE__ */ + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + addq %rdx, %rdx + movq $0xffffffff, %r9 + adcq %rax, %rax + movq $0xffffffff00000001, %r10 + adcq %rcx, %rcx + adcq %r8, %r8 + sbbq %r11, %r11 + andq %r11, %r9 + andq %r11, %r10 + subq %r11, %rdx + sbbq %r9, %rax + sbbq $0x00, %rcx + sbbq %r10, %r8 + addq (%rsi), %rdx + movq $0xffffffff, %r9 + adcq 8(%rsi), %rax + movq $0xffffffff00000001, %r10 + adcq 16(%rsi), %rcx + adcq 24(%rsi), %r8 + sbbq %r11, %r11 + andq %r11, %r9 + andq %r11, %r10 + subq %r11, %rdx + sbbq %r9, %rax + sbbq $0x00, %rcx + sbbq %r10, %r8 + adcq $0x00, %r11 + andq %r11, %r9 + andq %r11, %r10 + subq %r11, %rdx + sbbq %r9, %rax + movq %rdx, (%rdi) + sbbq $0x00, %rcx + movq %rax, 8(%rdi) + sbbq %r10, %r8 + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size sp_256_mont_tpl_lower_4,.-sp_256_mont_tpl_lower_4 +#endif /* __APPLE__ */ +/* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m). + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to double and subtract with in Montgomery form. + * m Modulus (prime). 
+ */ +#ifndef __APPLE__ +.text +.globl sp_256_mont_sub_dbl_4 +.type sp_256_mont_sub_dbl_4,@function +.align 16 +sp_256_mont_sub_dbl_4: +#else +.section __TEXT,__text +.globl _sp_256_mont_sub_dbl_4 +.p2align 4 +_sp_256_mont_sub_dbl_4: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq (%rdx), %r10 + movq 8(%rdx), %r11 + movq 16(%rdx), %r12 + movq 24(%rdx), %r13 + addq %r10, %r10 + movq $0xffffffff, %r14 + adcq %r11, %r11 + movq $0xffffffff00000001, %r15 + adcq %r12, %r12 + adcq %r13, %r13 + sbbq %rdx, %rdx + andq %rdx, %r14 + andq %rdx, %r15 + subq %rdx, %r10 + sbbq %r14, %r11 + sbbq $0x00, %r12 + sbbq %r15, %r13 + adcq $0x00, %rdx + andq %rdx, %r14 + andq %rdx, %r15 + subq %rdx, %r10 + sbbq %r14, %r11 + sbbq $0x00, %r12 + sbbq %r15, %r13 + subq %r10, %rax + movq $0xffffffff, %r14 + sbbq %r11, %rcx + movq $0xffffffff00000001, %r15 + sbbq %r12, %r8 + sbbq %r13, %r9 + sbbq %rdx, %rdx + andq %rdx, %r14 + andq %rdx, %r15 + addq %rdx, %rax + adcq %r14, %rcx + adcq $0x00, %r8 + adcq %r15, %r9 + adcq $0x00, %rdx + andq %rdx, %r14 + andq %rdx, %r15 + addq %rdx, %rax + adcq %r14, %rcx + movq %rax, (%rdi) + adcq $0x00, %r8 + movq %rcx, 8(%rdi) + adcq %r15, %r9 + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_256_mont_sub_dbl_4,.-sp_256_mont_sub_dbl_4 +#endif /* __APPLE__ */ +/* Two Montgomery numbers, subtract second from first and double. + * (r = 2.(a - b) % m). + * + * b must have came from a mont_sub operation. + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). + */ +#ifndef __APPLE__ +.text +.globl sp_256_mont_dbl_sub_4 +.type sp_256_mont_dbl_sub_4,@function +.align 16 +sp_256_mont_dbl_sub_4: +#else +.section __TEXT,__text +.globl _sp_256_mont_dbl_sub_4 +.p2align 4 +_sp_256_mont_dbl_sub_4: +#endif /* __APPLE__ */ + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + subq (%rdx), %rax + movq $0xffffffff, %r10 + sbbq 8(%rdx), %rcx + movq $0xffffffff00000001, %r11 + sbbq 16(%rdx), %r8 + sbbq 24(%rdx), %r9 + sbbq %rdx, %rdx + andq %rdx, %r10 + andq %rdx, %r11 + addq %rdx, %rax + adcq %r10, %rcx + adcq $0x00, %r8 + adcq %r11, %r9 + addq %rax, %rax + movq $0xffffffff, %r10 + adcq %rcx, %rcx + movq $0xffffffff00000001, %r11 + adcq %r8, %r8 + adcq %r9, %r9 + sbbq %rdx, %rdx + andq %rdx, %r10 + andq %rdx, %r11 + subq %rdx, %rax + sbbq %r10, %rcx + movq %rax, (%rdi) + sbbq $0x00, %r8 + movq %rcx, 8(%rdi) + sbbq %r11, %r9 + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + repz retq +#ifndef __APPLE__ +.size sp_256_mont_dbl_sub_4,.-sp_256_mont_dbl_sub_4 +#endif /* __APPLE__ */ #ifndef WC_NO_CACHE_RESISTANT /* Touch each possible point that could be being copied. 
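
Editor's note: the new sp_256_mont_sub_lower_4, sp_256_mont_sub_dbl_4 and sp_256_mont_dbl_sub_4 helpers above all reuse one correction idiom: the final borrow (or carry) of the 4-word operation is widened to a 0/all-ones mask in a single `sbbq %rsi, %rsi`, the non-trivial P-256 prime words are AND-ed with that mask, and the masked prime is added back (or subtracted off). A rough C sketch of the subtract case, using the GCC/Clang __int128 extension for the word arithmetic (a sketch of the idiom, not the generated code):

#include <stdint.h>

/* P-256 prime, little-endian 64-bit words. */
static const uint64_t p256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL
};

/* r = (a - b) mod p256 for a, b < p256.  The final borrow becomes a
 * 0/all-ones mask (the assembly's "sbbq %rsi, %rsi") that selects
 * whether the prime is added back. */
static void mont_sub_p256(uint64_t r[4], const uint64_t a[4],
                          const uint64_t b[4])
{
    unsigned __int128 t;
    uint64_t borrow = 0;
    uint64_t mask;
    int i;

    for (i = 0; i < 4; i++) {
        t = (unsigned __int128)a[i] - b[i] - borrow;
        r[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;   /* 1 if the word subtraction wrapped */
    }
    mask = 0 - borrow;                       /* 0 or all ones */

    /* conditionally add p back, with no branch on the (secret) borrow */
    t = 0;
    for (i = 0; i < 4; i++) {
        t += (unsigned __int128)r[i] + (p256[i] & mask);
        r[i] = (uint64_t)t;
        t >>= 64;
    }
}

The same idea explains the recurring `sbbq $0x00, %rax` to `sbbq %rax, %rax` change throughout this diff: subtracting a register from itself with borrow yields 0 or all-ones regardless of its previous contents, so the separate zeroing instruction can be dropped.
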
* @@ -40298,138 +57177,125 @@ sp_256_mont_mul_avx2_4: .p2align 4 _sp_256_mont_mul_avx2_4: #endif /* __APPLE__ */ - pushq %rbx + pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %rbp + pushq %rbx movq %rdx, %rbp - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 + movq (%rsi), %rdx + # A[0] * B[0] + mulxq (%rbp), %r8, %r9 + xorq %rbx, %rbx + # A[0] * B[1] + mulxq 8(%rbp), %rax, %r10 adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 + # A[0] * B[2] + mulxq 16(%rbp), %rax, %r11 adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx + # A[0] * B[3] + mulxq 24(%rbp), %rax, %r12 + adcxq %rax, %r11 + movq 8(%rsi), %rdx + adcxq %rbx, %r12 + # A[1] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r9 + # A[1] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r10 + adcxq %rax, %r10 + # A[1] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r11 + adcxq %rax, %r11 + # A[1] * B[3] + mulxq 24(%rbp), %rax, %r13 + adoxq %rcx, %r12 adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] + adoxq %rbx, %r13 + movq 16(%rsi), %rdx + adcxq %rbx, %r13 + # A[2] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r10 + # A[2] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r11 + adcxq %rax, %r11 + # A[2] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r12 + adcxq %rax, %r12 + # A[2] * B[3] + mulxq 24(%rbp), %rax, %r14 + adoxq %rcx, %r13 + adcxq %rax, %r13 + adoxq %rbx, %r14 movq 24(%rsi), %rdx + adcxq %rbx, %r14 + # A[3] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r11 + # A[3] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r12 adcxq %rax, %r12 - mulxq (%rbp), %rbx, %rax - adoxq %rbx, %r11 - adoxq %rax, %r12 - # A[3] * B[2] - mulxq 16(%rbp), %rdx, %rax - adcxq %rdx, %r13 - # A[2] * B[3] - movq 24(%rbp), %rdx + # A[3] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r13 + adcxq %rax, %r13 + # A[3] * B[3] + mulxq 24(%rbp), %rax, %r15 + adoxq %rcx, %r14 adcxq %rax, %r14 - mulxq 16(%rsi), %rax, %rdx - adcxq %rcx, %r15 - adoxq %rax, %r13 - adoxq %rdx, %r14 - adoxq %rcx, %r15 + adoxq %rbx, %r15 + adcxq %rbx, %r15 # Start Reduction # mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 # - a[0] << 32 << 192 - # + (a[0] * 2) << 192 + # a[0]-a[3] + (a[0] * 2) << 192 movq %r8, %rax - movq %r11, %rdx - addq %r8, %rdx + leaq (%r11,%r8,2), %rdx movq %r9, %rsi - addq %r8, %rdx movq %r10, %rbp + movq %r10, %rcx # a[0]-a[2] << 32 shlq $32, %r8 - shldq $32, %rsi, %r10 + shldq $32, %rsi, %rcx shldq $32, %rax, %r9 # - a[0] << 32 << 192 subq %r8, %rdx # + 
a[0]-a[2] << 32 << 64 addq %r8, %rsi adcq %r9, %rbp - adcq %r10, %rdx + adcq %rcx, %rdx # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xorq %rcx, %rcx # a += mu << 256 - xorq %r8, %r8 addq %rax, %r12 adcq %rsi, %r13 adcq %rbp, %r14 adcq %rdx, %r15 - sbbq $0x00, %r8 + sbbq %r8, %r8 # a += mu << 192 addq %rax, %r11 adcq %rsi, %r12 + movq %rsi, %r9 adcq %rbp, %r13 adcq %rdx, %r14 adcq $0x00, %r15 sbbq $0x00, %r8 # mu <<= 32 - movq %rdx, %rcx + shldq $32, %rdx, %rcx shldq $32, %rbp, %rdx shldq $32, %rsi, %rbp shldq $32, %rax, %rsi - shrq $32, %rcx shlq $32, %rax - # a += (mu << 32) << 64 - addq %rbp, %r11 - adcq %rdx, %r12 - adcq %rcx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - sbbq $0x00, %r8 # a -= (mu << 32) << 192 subq %rax, %r11 sbbq %rsi, %r12 @@ -40437,27 +57303,36 @@ _sp_256_mont_mul_avx2_4: sbbq %rdx, %r14 sbbq %rcx, %r15 adcq $0x00, %r8 - movq $0xffffffff, %rax + # a += (mu << 32) << 64 + subq %rax, %r9 + adcq %rsi, %r10 + adcq %rbp, %r11 + adcq %rdx, %r12 + adcq %rcx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi + movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - andq %r8, %rax + shrq $32, %rax # m[2] = 0 & mask = 0 andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 - sbbq $0x00, %r14 - sbbq %rsi, %r15 movq %r12, (%rdi) + sbbq $0x00, %r14 movq %r13, 8(%rdi) + sbbq %rsi, %r15 movq %r14, 16(%rdi) movq %r15, 24(%rdi) - popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx + popq %rbp repz retq #ifndef __APPLE__ .size sp_256_mont_mul_avx2_4,.-sp_256_mont_mul_avx2_4 @@ -40488,31 +57363,34 @@ _sp_256_mont_sqr_avx2_4: pushq %r14 pushq %r15 pushq %rbx - # A[0] * A[1] + xorq %r8, %r8 movq (%rsi), %rdx - movq 16(%rsi), %r15 - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %rcx + movq 16(%rsi), %rbx + movq 24(%rsi), %r15 + # A[0] * A[1] + mulxq %rcx, %r9, %r10 + # A[0] * A[2] + mulxq %rbx, %r8, %r11 + adoxq %r8, %r10 # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] + mulxq %r15, %r8, %r12 + movq %rcx, %rdx + adoxq %r8, %r11 + # A[1] * A[2] + mulxq %rbx, %r8, %rax movq %r15, %rdx - mulxq 8(%rsi), %rcx, %rbx - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - xorq %r15, %r15 - adoxq %rcx, %r11 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx + adcxq %r8, %r11 # A[1] * A[3] - movq 8(%rsi), %rdx + mulxq %rcx, %r8, %r13 + movq $0x00, %r15 + adoxq %rax, %r12 + adcxq %r8, %r12 + # A[2] * A[3] + mulxq %rbx, %r8, %r14 adoxq %r15, %r13 - mulxq 24(%rsi), %rax, %r8 - adcxq %rcx, %r10 - adoxq %r15, %r14 - adcxq %rbx, %r11 - adcxq %rax, %r12 adcxq %r8, %r13 + adoxq %r15, %r14 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 @@ -40545,52 +57423,44 @@ _sp_256_mont_sqr_avx2_4: # Start Reduction # mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 # - a[0] << 32 << 192 - # + (a[0] * 2) << 192 + # a[0]-a[3] + (a[0] * 2) << 192 movq %r8, %rax - movq %r11, %rdx - addq %r8, %rdx + leaq (%r11,%r8,2), %rdx movq %r9, %rsi - addq %r8, %rdx movq %r10, %rcx + movq %r10, %rbx # a[0]-a[2] << 32 shlq $32, %r8 - shldq $32, %rsi, %r10 + shldq $32, %rsi, %rbx shldq $32, %rax, %r9 # - a[0] << 32 << 192 subq %r8, %rdx # + a[0]-a[2] << 32 << 64 addq %r8, %rsi adcq %r9, %rcx - adcq %r10, %rdx + adcq %rbx, %rdx # a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xorq %rbx, %rbx # a += mu << 256 - xorq %r8, %r8 addq %rax, %r12 adcq %rsi, %r13 adcq %rcx, %r14 adcq %rdx, %r15 - sbbq $0x00, %r8 + sbbq %r8, %r8 # a += mu << 192 addq %rax, %r11 adcq %rsi, %r12 + movq %rsi, %r9 
adcq %rcx, %r13 adcq %rdx, %r14 adcq $0x00, %r15 sbbq $0x00, %r8 # mu <<= 32 - movq %rdx, %rbx + shldq $32, %rdx, %rbx shldq $32, %rcx, %rdx shldq $32, %rsi, %rcx shldq $32, %rax, %rsi - shrq $32, %rbx shlq $32, %rax - # a += (mu << 32) << 64 - addq %rcx, %r11 - adcq %rdx, %r12 - adcq %rbx, %r13 - adcq $0x00, %r14 - adcq $0x00, %r15 - sbbq $0x00, %r8 # a -= (mu << 32) << 192 subq %rax, %r11 sbbq %rsi, %r12 @@ -40598,19 +57468,28 @@ _sp_256_mont_sqr_avx2_4: sbbq %rdx, %r14 sbbq %rbx, %r15 adcq $0x00, %r8 - movq $0xffffffff, %rax + # a += (mu << 32) << 64 + subq %rax, %r9 + adcq %rsi, %r10 + adcq %rcx, %r11 + adcq %rdx, %r12 + adcq %rbx, %r13 + adcq $0x00, %r14 + adcq $0x00, %r15 + sbbq $0x00, %r8 movq $0xffffffff00000001, %rsi + movq %r8, %rax # mask m and sub from result if overflow # m[0] = -1 & mask = mask - andq %r8, %rax + shrq $32, %rax # m[2] = 0 & mask = 0 andq %r8, %rsi subq %r8, %r12 sbbq %rax, %r13 - sbbq $0x00, %r14 - sbbq %rsi, %r15 movq %r12, (%rdi) + sbbq $0x00, %r14 movq %r13, 8(%rdi) + sbbq %rsi, %r15 movq %r14, 16(%rdi) movq %r15, 24(%rdi) popq %rbx @@ -40648,7 +57527,6 @@ _sp_256_cond_sub_avx2_4: pushq %r13 pushq %r14 pushq %r15 - movq $0x00, %rax movq (%rdx), %r12 movq 8(%rdx), %r13 movq 16(%rdx), %r14 @@ -40669,7 +57547,7 @@ _sp_256_cond_sub_avx2_4: movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax popq %r15 popq %r14 popq %r13 @@ -40688,15 +57566,15 @@ _sp_256_cond_sub_avx2_4: */ #ifndef __APPLE__ .text -.globl sp_256_mont_reduce_avx2_4 -.type sp_256_mont_reduce_avx2_4,@function +.globl sp_256_mont_reduce_avx2_order_4 +.type sp_256_mont_reduce_avx2_order_4,@function .align 16 -sp_256_mont_reduce_avx2_4: +sp_256_mont_reduce_avx2_order_4: #else .section __TEXT,__text -.globl _sp_256_mont_reduce_avx2_4 +.globl _sp_256_mont_reduce_avx2_order_4 .p2align 4 -_sp_256_mont_reduce_avx2_4: +_sp_256_mont_reduce_avx2_order_4: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -40840,7 +57718,7 @@ _sp_256_mont_reduce_avx2_4: popq %r12 repz retq #ifndef __APPLE__ -.size sp_256_mont_reduce_avx2_4,.-sp_256_mont_reduce_avx2_4 +.size sp_256_mont_reduce_avx2_order_4,.-sp_256_mont_reduce_avx2_order_4 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 @@ -41401,7 +58279,6 @@ sp_256_sub_in_place_4: .p2align 4 _sp_256_sub_in_place_4: #endif /* __APPLE__ */ - xorq %rax, %rax movq (%rsi), %rdx movq 8(%rsi), %rcx movq 16(%rsi), %r8 @@ -41410,7 +58287,7 @@ _sp_256_sub_in_place_4: sbbq %rcx, 8(%rdi) sbbq %r8, 16(%rdi) sbbq %r9, 24(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_256_sub_in_place_4,.-sp_256_sub_in_place_4 @@ -41567,89 +58444,84 @@ sp_256_mont_mul_order_avx2_4: .p2align 4 _sp_256_mont_mul_order_avx2_4: #endif /* __APPLE__ */ - pushq %rbx + pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %rbp + pushq %rbx movq %rdx, %rbp - # A[0] * B[0] - movq (%rbp), %rdx - mulxq (%rsi), %r8, %r9 - # A[2] * B[0] - mulxq 16(%rsi), %r10, %r11 - # A[1] * B[0] - mulxq 8(%rsi), %rax, %rcx - xorq %r15, %r15 + movq (%rsi), %rdx + # A[0] * B[0] + mulxq (%rbp), %r8, %r9 + xorq %rbx, %rbx + # A[0] * B[1] + mulxq 8(%rbp), %rax, %r10 adcxq %rax, %r9 - # A[1] * B[3] - movq 24(%rbp), %rdx - mulxq 8(%rsi), %r12, %r13 - adcxq %rcx, %r10 - # A[0] * B[1] - movq 8(%rbp), %rdx - mulxq (%rsi), %rax, %rcx - adoxq %rax, %r9 - # A[2] * B[1] - mulxq 16(%rsi), %rax, %r14 - adoxq %rcx, %r10 - adcxq %rax, %r11 - # A[1] * B[2] - movq 16(%rbp), %rdx - mulxq 8(%rsi), %rax, %rcx - adcxq %r14, %r12 - adoxq %rax, %r11 - 
adcxq %r15, %r13 - adoxq %rcx, %r12 - # A[0] * B[2] - mulxq (%rsi), %rax, %rcx - adoxq %r15, %r13 - xorq %r14, %r14 + # A[0] * B[2] + mulxq 16(%rbp), %rax, %r11 adcxq %rax, %r10 - # A[1] * B[1] - movq 8(%rbp), %rdx - mulxq 8(%rsi), %rdx, %rax - adcxq %rcx, %r11 - adoxq %rdx, %r10 - # A[3] * B[1] - movq 8(%rbp), %rdx - adoxq %rax, %r11 - mulxq 24(%rsi), %rax, %rcx + # A[0] * B[3] + mulxq 24(%rbp), %rax, %r12 + adcxq %rax, %r11 + movq 8(%rsi), %rdx + adcxq %rbx, %r12 + # A[1] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r9 + # A[1] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r10 + adcxq %rax, %r10 + # A[1] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r11 + adcxq %rax, %r11 + # A[1] * B[3] + mulxq 24(%rbp), %rax, %r13 + adoxq %rcx, %r12 adcxq %rax, %r12 - # A[2] * B[2] - movq 16(%rbp), %rdx - mulxq 16(%rsi), %rdx, %rax - adcxq %rcx, %r13 - adoxq %rdx, %r12 - # A[3] * B[3] - movq 24(%rbp), %rdx - adoxq %rax, %r13 - mulxq 24(%rsi), %rax, %rcx - adoxq %r15, %r14 - adcxq %rax, %r14 - # A[0] * B[3] - mulxq (%rsi), %rdx, %rax - adcxq %rcx, %r15 - xorq %rcx, %rcx - adcxq %rdx, %r11 - # A[3] * B[0] + adoxq %rbx, %r13 + movq 16(%rsi), %rdx + adcxq %rbx, %r13 + # A[2] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r10 + # A[2] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r11 + adcxq %rax, %r11 + # A[2] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r12 + adcxq %rax, %r12 + # A[2] * B[3] + mulxq 24(%rbp), %rax, %r14 + adoxq %rcx, %r13 + adcxq %rax, %r13 + adoxq %rbx, %r14 movq 24(%rsi), %rdx + adcxq %rbx, %r14 + # A[3] * B[0] + mulxq (%rbp), %rax, %rcx + xorq %rbx, %rbx + adcxq %rax, %r11 + # A[3] * B[1] + mulxq 8(%rbp), %rax, %r15 + adoxq %rcx, %r12 adcxq %rax, %r12 - mulxq (%rbp), %rbx, %rax - adoxq %rbx, %r11 - adoxq %rax, %r12 - # A[3] * B[2] - mulxq 16(%rbp), %rdx, %rax - adcxq %rdx, %r13 - # A[2] * B[3] - movq 24(%rbp), %rdx + # A[3] * B[2] + mulxq 16(%rbp), %rax, %rcx + adoxq %r15, %r13 + adcxq %rax, %r13 + # A[3] * B[3] + mulxq 24(%rbp), %rax, %r15 + adoxq %rcx, %r14 adcxq %rax, %r14 - mulxq 16(%rsi), %rax, %rdx - adcxq %rcx, %r15 - adoxq %rax, %r13 - adoxq %rdx, %r14 - adoxq %rcx, %r15 + adoxq %rbx, %r15 + adcxq %rbx, %r15 # Start Reduction movq $0xccd1c8aaee00bc4f, %rbx # A[0] @@ -41767,12 +58639,12 @@ _sp_256_mont_mul_order_avx2_4: sbbq %rbp, %r15 movq %r14, 16(%rdi) movq %r15, 24(%rdi) - popq %rbp + popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 - popq %rbx + popq %rbp repz retq #ifndef __APPLE__ .size sp_256_mont_mul_order_avx2_4,.-sp_256_mont_mul_order_avx2_4 @@ -41802,31 +58674,34 @@ _sp_256_mont_sqr_order_avx2_4: pushq %r14 pushq %r15 pushq %rbx - # A[0] * A[1] + xorq %r8, %r8 movq (%rsi), %rdx - movq 16(%rsi), %r15 - mulxq 8(%rsi), %r9, %r10 + movq 8(%rsi), %rcx + movq 16(%rsi), %rbx + movq 24(%rsi), %r15 + # A[0] * A[1] + mulxq %rcx, %r9, %r10 + # A[0] * A[2] + mulxq %rbx, %r8, %r11 + adoxq %r8, %r10 # A[0] * A[3] - mulxq 24(%rsi), %r11, %r12 - # A[2] * A[1] + mulxq %r15, %r8, %r12 + movq %rcx, %rdx + adoxq %r8, %r11 + # A[1] * A[2] + mulxq %rbx, %r8, %rax movq %r15, %rdx - mulxq 8(%rsi), %rcx, %rbx - # A[2] * A[3] - mulxq 24(%rsi), %r13, %r14 - xorq %r15, %r15 - adoxq %rcx, %r11 - adoxq %rbx, %r12 - # A[2] * A[0] - mulxq (%rsi), %rcx, %rbx + adcxq %r8, %r11 # A[1] * A[3] - movq 8(%rsi), %rdx + mulxq %rcx, %r8, %r13 + movq $0x00, %r15 + adoxq %rax, %r12 + adcxq %r8, %r12 + # A[2] * A[3] + mulxq %rbx, %r8, %r14 adoxq %r15, %r13 - mulxq 24(%rsi), %rax, %r8 - adcxq %rcx, %r10 - adoxq %r15, %r14 - adcxq %rbx, %r11 - adcxq 
%rax, %r12 adcxq %r8, %r13 + adoxq %r15, %r14 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 @@ -43445,27 +60320,28 @@ sp_384_add_6: .p2align 4 _sp_384_add_6: #endif /* __APPLE__ */ - # Add - movq (%rsi), %rcx + pushq %r12 xorq %rax, %rax - addq (%rdx), %rcx + movq (%rsi), %rcx movq 8(%rsi), %r8 - movq %rcx, (%rdi) + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + addq (%rdx), %rcx adcq 8(%rdx), %r8 - movq 16(%rsi), %rcx + adcq 16(%rdx), %r9 + adcq 24(%rdx), %r10 + adcq 32(%rdx), %r11 + adcq 40(%rdx), %r12 + movq %rcx, (%rdi) movq %r8, 8(%rdi) - adcq 16(%rdx), %rcx - movq 24(%rsi), %r8 - movq %rcx, 16(%rdi) - adcq 24(%rdx), %r8 - movq 32(%rsi), %rcx - movq %r8, 24(%rdi) - adcq 32(%rdx), %rcx - movq 40(%rsi), %r8 - movq %rcx, 32(%rdi) - adcq 40(%rdx), %r8 - movq %r8, 40(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) adcq $0x00, %rax + popq %r12 repz retq #ifndef __APPLE__ .size sp_384_add_6,.-sp_384_add_6 @@ -43508,7 +60384,7 @@ _sp_384_sub_6: movq %r10, 24(%rdi) movq %r11, 32(%rdi) movq %r12, 40(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax popq %r12 repz retq #ifndef __APPLE__ @@ -43582,7 +60458,6 @@ sp_384_cond_sub_6: _sp_384_cond_sub_6: #endif /* __APPLE__ */ subq $48, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -43625,7 +60500,7 @@ _sp_384_cond_sub_6: sbbq %rdx, %r9 movq %r8, 32(%rdi) movq %r9, 40(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $48, %rsp repz retq #ifndef __APPLE__ @@ -44005,116 +60880,375 @@ _sp_384_cmp_6: #ifndef __APPLE__ .size sp_384_cmp_6,.-sp_384_cmp_6 #endif /* __APPLE__ */ -/* Add a to a into r. (r = a + a) +/* Add two Montgomery form numbers (r = a + b % m). * - * r A single precision integer. - * a A single precision integer. + * r Result of addition. + * a First number to add in Montgomery form. + * b Second number to add in Montgomery form. + * m Modulus (prime). 
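
Editor's note: sp_384_cond_sub_6 above, like its 256- and 521-bit counterparts touched in this diff, subtracts `b & mask` from `a`, so the subtraction is always executed and only its effect depends on the mask; the borrow is handed back widened to 0/all-ones. A small C sketch of that contract, again using the __int128 extension (the function name and run-time word count are illustrative):

#include <stdint.h>

/* r = a - (b & mask) over n 64-bit words, mask being 0 or all ones;
 * the borrow is returned widened to 0 / all ones, as the
 * sp_*_cond_sub_* routines do. */
static uint64_t cond_sub_ct(uint64_t* r, const uint64_t* a,
                            const uint64_t* b, uint64_t mask, int n)
{
    unsigned __int128 t;
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < n; i++) {
        t = (unsigned __int128)a[i] - (b[i] & mask) - borrow;
        r[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    return 0 - borrow;      /* 0 if no borrow, all ones otherwise */
}

Callers pass the mask produced by an earlier comparison or reduction step, so at the instruction level nothing is conditional.
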
*/ #ifndef __APPLE__ .text -.globl sp_384_dbl_6 -.type sp_384_dbl_6,@function +.globl sp_384_mont_add_6 +.type sp_384_mont_add_6,@function .align 16 -sp_384_dbl_6: +sp_384_mont_add_6: #else .section __TEXT,__text -.globl _sp_384_dbl_6 +.globl _sp_384_mont_add_6 .p2align 4 -_sp_384_dbl_6: +_sp_384_mont_add_6: #endif /* __APPLE__ */ - movq (%rsi), %rdx - xorq %rax, %rax - addq %rdx, %rdx + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rax movq 8(%rsi), %rcx - movq %rdx, (%rdi) - adcq %rcx, %rcx - movq 16(%rsi), %rdx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq 32(%rsi), %r10 + movq 40(%rsi), %r11 + addq (%rdx), %rax + movq $0xffffffff, %r12 + adcq 8(%rdx), %rcx + movq $0xffffffff00000000, %r13 + adcq 16(%rdx), %r8 + movq $0xfffffffffffffffe, %r14 + adcq 24(%rdx), %r9 + adcq 32(%rdx), %r10 + adcq 40(%rdx), %r11 + sbbq %rsi, %rsi + andq %rsi, %r12 + andq %rsi, %r13 + andq %rsi, %r14 + subq %r12, %rax + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %rsi, %r9 + sbbq %rsi, %r10 + sbbq %rsi, %r11 + adcq $0x00, %rsi + andq %rsi, %r12 + andq %rsi, %r13 + andq %rsi, %r14 + subq %r12, %rax + sbbq %r13, %rcx + movq %rax, (%rdi) + sbbq %r14, %r8 movq %rcx, 8(%rdi) - adcq %rdx, %rdx - movq 24(%rsi), %rcx - movq %rdx, 16(%rdi) - adcq %rcx, %rcx - movq 32(%rsi), %rdx - movq %rcx, 24(%rdi) - adcq %rdx, %rdx - movq 40(%rsi), %rcx - movq %rdx, 32(%rdi) - adcq %rcx, %rcx - movq %rcx, 40(%rdi) - adcq $0x00, %rax + sbbq %rsi, %r9 + movq %r8, 16(%rdi) + sbbq %rsi, %r10 + movq %r9, 24(%rdi) + sbbq %rsi, %r11 + movq %r10, 32(%rdi) + movq %r11, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 repz retq #ifndef __APPLE__ -.size sp_384_dbl_6,.-sp_384_dbl_6 +.size sp_384_mont_add_6,.-sp_384_mont_add_6 #endif /* __APPLE__ */ -/* Conditionally add a and b using the mask m. - * m is -1 to add and 0 when not. +/* Double a Montgomery form number (r = a + a % m). * - * r A single precision number representing conditional add result. - * a A single precision number to add with. - * b A single precision number to add. - * m Mask value to apply. + * r Result of doubling. + * a Number to double in Montgomery form. + * m Modulus (prime). 
*/ #ifndef __APPLE__ .text -.globl sp_384_cond_add_6 -.type sp_384_cond_add_6,@function +.globl sp_384_mont_dbl_6 +.type sp_384_mont_dbl_6,@function .align 16 -sp_384_cond_add_6: +sp_384_mont_dbl_6: #else .section __TEXT,__text -.globl _sp_384_cond_add_6 +.globl _sp_384_mont_dbl_6 .p2align 4 -_sp_384_cond_add_6: +_sp_384_mont_dbl_6: #endif /* __APPLE__ */ - subq $48, %rsp - movq $0x00, %rax - movq (%rdx), %r8 - movq 8(%rdx), %r9 - andq %rcx, %r8 - andq %rcx, %r9 - movq %r8, (%rsp) - movq %r9, 8(%rsp) - movq 16(%rdx), %r8 - movq 24(%rdx), %r9 - andq %rcx, %r8 - andq %rcx, %r9 - movq %r8, 16(%rsp) - movq %r9, 24(%rsp) - movq 32(%rdx), %r8 - movq 40(%rdx), %r9 - andq %rcx, %r8 - andq %rcx, %r9 - movq %r8, 32(%rsp) - movq %r9, 40(%rsp) - movq (%rsi), %r8 - movq (%rsp), %rdx - addq %rdx, %r8 - movq 8(%rsi), %r9 - movq 8(%rsp), %rdx - adcq %rdx, %r9 - movq %r8, (%rdi) - movq 16(%rsi), %r8 - movq 16(%rsp), %rdx - adcq %rdx, %r8 - movq %r9, 8(%rdi) - movq 24(%rsi), %r9 - movq 24(%rsp), %rdx - adcq %rdx, %r9 - movq %r8, 16(%rdi) - movq 32(%rsi), %r8 - movq 32(%rsp), %rdx - adcq %rdx, %r8 - movq %r9, 24(%rdi) - movq 40(%rsi), %r9 - movq 40(%rsp), %rdx - adcq %rdx, %r9 - movq %r8, 32(%rdi) - movq %r9, 40(%rdi) - adcq $0x00, %rax - addq $48, %rsp + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + movq 32(%rsi), %r9 + movq 40(%rsi), %r10 + addq %rdx, %rdx + movq $0xffffffff, %r11 + adcq %rax, %rax + movq $0xffffffff00000000, %r12 + adcq %rcx, %rcx + movq $0xfffffffffffffffe, %r13 + adcq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + adcq $0x00, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + movq %rax, 8(%rdi) + sbbq %r14, %r8 + movq %rcx, 16(%rdi) + sbbq %r14, %r9 + movq %r8, 24(%rdi) + sbbq %r14, %r10 + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 repz retq #ifndef __APPLE__ -.size sp_384_cond_add_6,.-sp_384_cond_add_6 +.size sp_384_mont_dbl_6,.-sp_384_mont_dbl_6 +#endif /* __APPLE__ */ +/* Double a Montgomery form number (r = a + a % m). + * + * r Result of doubling. + * a Number to double in Montgomery form. + * m Modulus (prime). 
+ */ +#ifndef __APPLE__ +.text +.globl sp_384_mont_tpl_6 +.type sp_384_mont_tpl_6,@function +.align 16 +sp_384_mont_tpl_6: +#else +.section __TEXT,__text +.globl _sp_384_mont_tpl_6 +.p2align 4 +_sp_384_mont_tpl_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + movq 32(%rsi), %r9 + movq 40(%rsi), %r10 + addq %rdx, %rdx + movq $0xffffffff, %r11 + adcq %rax, %rax + movq $0xffffffff00000000, %r12 + adcq %rcx, %rcx + movq $0xfffffffffffffffe, %r13 + adcq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + adcq $0x00, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + addq (%rsi), %rdx + movq $0xffffffff, %r11 + adcq 8(%rsi), %rax + movq $0xffffffff00000000, %r12 + adcq 16(%rsi), %rcx + movq $0xfffffffffffffffe, %r13 + adcq 24(%rsi), %r8 + adcq 32(%rsi), %r9 + adcq 40(%rsi), %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + adcq $0x00, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + movq %rax, 8(%rdi) + sbbq %r14, %r8 + movq %rcx, 16(%rdi) + sbbq %r14, %r9 + movq %r8, 24(%rdi) + sbbq %r14, %r10 + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mont_tpl_6,.-sp_384_mont_tpl_6 +#endif /* __APPLE__ */ +/* Subtract two Montgomery form numbers (r = a - b % m). + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). + */ +#ifndef __APPLE__ +.text +.globl sp_384_mont_sub_6 +.type sp_384_mont_sub_6,@function +.align 16 +sp_384_mont_sub_6: +#else +.section __TEXT,__text +.globl _sp_384_mont_sub_6 +.p2align 4 +_sp_384_mont_sub_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq 32(%rsi), %r10 + movq 40(%rsi), %r11 + subq (%rdx), %rax + movq $0xffffffff, %r12 + sbbq 8(%rdx), %rcx + movq $0xffffffff00000000, %r13 + sbbq 16(%rdx), %r8 + movq $0xfffffffffffffffe, %r14 + sbbq 24(%rdx), %r9 + sbbq 32(%rdx), %r10 + sbbq 40(%rdx), %r11 + sbbq %rsi, %rsi + andq %rsi, %r12 + andq %rsi, %r13 + andq %rsi, %r14 + addq %r12, %rax + adcq %r13, %rcx + adcq %r14, %r8 + adcq %rsi, %r9 + adcq %rsi, %r10 + adcq %rsi, %r11 + adcq $0x00, %rsi + andq %rsi, %r12 + andq %rsi, %r13 + andq %rsi, %r14 + addq %r12, %rax + adcq %r13, %rcx + movq %rax, (%rdi) + adcq %r14, %r8 + movq %rcx, 8(%rdi) + adcq %rsi, %r9 + movq %r8, 16(%rdi) + adcq %rsi, %r10 + movq %r9, 24(%rdi) + adcq %rsi, %r11 + movq %r10, 32(%rdi) + movq %r11, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mont_sub_6,.-sp_384_mont_sub_6 +#endif /* __APPLE__ */ +/* Subtract two Montgomery form numbers (r = a - b % m). + * + * b is less than the modulus. + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). 
+ */ +#ifndef __APPLE__ +.text +.globl sp_384_mont_sub_lower_6 +.type sp_384_mont_sub_lower_6,@function +.align 16 +sp_384_mont_sub_lower_6: +#else +.section __TEXT,__text +.globl _sp_384_mont_sub_lower_6 +.p2align 4 +_sp_384_mont_sub_lower_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rax + movq 8(%rsi), %rcx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq 32(%rsi), %r10 + movq 40(%rsi), %r11 + subq (%rdx), %rax + movq $0xffffffff, %r12 + sbbq 8(%rdx), %rcx + movq $0xffffffff00000000, %r13 + sbbq 16(%rdx), %r8 + movq $0xfffffffffffffffe, %r14 + sbbq 24(%rdx), %r9 + sbbq 32(%rdx), %r10 + sbbq 40(%rdx), %r11 + sbbq %rsi, %rsi + andq %rsi, %r12 + andq %rsi, %r13 + andq %rsi, %r14 + addq %r12, %rax + adcq %r13, %rcx + movq %rax, (%rdi) + adcq %r14, %r8 + movq %rcx, 8(%rdi) + adcq %rsi, %r9 + movq %r8, 16(%rdi) + adcq %rsi, %r10 + movq %r9, 24(%rdi) + adcq %rsi, %r11 + movq %r10, 32(%rdi) + movq %r11, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mont_sub_lower_6,.-sp_384_mont_sub_lower_6 #endif /* __APPLE__ */ /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * @@ -44193,6 +61327,158 @@ _sp_384_div2_6: #ifndef __APPLE__ .size sp_384_div2_6,.-sp_384_div2_6 #endif /* __APPLE__ */ +/* Double a Montgomery form number (r = a + a % m). + * + * a is less than m. + * + * r Result of doubling. + * a Number to double in Montgomery form. + * m Modulus (prime). + */ +#ifndef __APPLE__ +.text +.globl sp_384_mont_dbl_lower_6 +.type sp_384_mont_dbl_lower_6,@function +.align 16 +sp_384_mont_dbl_lower_6: +#else +.section __TEXT,__text +.globl _sp_384_mont_dbl_lower_6 +.p2align 4 +_sp_384_mont_dbl_lower_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + movq 32(%rsi), %r9 + movq 40(%rsi), %r10 + addq %rdx, %rdx + movq $0xffffffff, %r11 + adcq %rax, %rax + movq $0xffffffff00000000, %r12 + adcq %rcx, %rcx + movq $0xfffffffffffffffe, %r13 + adcq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + movq %rax, 8(%rdi) + sbbq %r14, %r8 + movq %rcx, 16(%rdi) + sbbq %r14, %r9 + movq %r8, 24(%rdi) + sbbq %r14, %r10 + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mont_dbl_lower_6,.-sp_384_mont_dbl_lower_6 +#endif /* __APPLE__ */ +/* Double a Montgomery form number (r = a + a % m). + * + * a is less than m. + * + * r Result of doubling. + * a Number to double in Montgomery form. + * m Modulus (prime). 
+ */ +#ifndef __APPLE__ +.text +.globl sp_384_mont_tpl_lower_6 +.type sp_384_mont_tpl_lower_6,@function +.align 16 +sp_384_mont_tpl_lower_6: +#else +.section __TEXT,__text +.globl _sp_384_mont_tpl_lower_6 +.p2align 4 +_sp_384_mont_tpl_lower_6: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq (%rsi), %rdx + movq 8(%rsi), %rax + movq 16(%rsi), %rcx + movq 24(%rsi), %r8 + movq 32(%rsi), %r9 + movq 40(%rsi), %r10 + addq %rdx, %rdx + movq $0xffffffff, %r11 + adcq %rax, %rax + movq $0xffffffff00000000, %r12 + adcq %rcx, %rcx + movq $0xfffffffffffffffe, %r13 + adcq %r8, %r8 + adcq %r9, %r9 + adcq %r10, %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + addq (%rsi), %rdx + movq $0xffffffff, %r11 + adcq 8(%rsi), %rax + movq $0xffffffff00000000, %r12 + adcq 16(%rsi), %rcx + movq $0xfffffffffffffffe, %r13 + adcq 24(%rsi), %r8 + adcq 32(%rsi), %r9 + adcq 40(%rsi), %r10 + sbbq %r14, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + sbbq %r13, %rcx + sbbq %r14, %r8 + sbbq %r14, %r9 + sbbq %r14, %r10 + adcq $0x00, %r14 + andq %r14, %r11 + andq %r14, %r12 + andq %r14, %r13 + subq %r11, %rdx + sbbq %r12, %rax + movq %rdx, (%rdi) + sbbq %r13, %rcx + movq %rax, 8(%rdi) + sbbq %r14, %r8 + movq %rcx, 16(%rdi) + sbbq %r14, %r9 + movq %r8, 24(%rdi) + sbbq %r14, %r10 + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_384_mont_tpl_lower_6,.-sp_384_mont_tpl_lower_6 +#endif /* __APPLE__ */ #ifndef WC_NO_CACHE_RESISTANT /* Touch each possible point that could be being copied. * @@ -44714,7 +62000,6 @@ sp_384_cond_sub_avx2_6: .p2align 4 _sp_384_cond_sub_avx2_6: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -44745,7 +62030,7 @@ _sp_384_cond_sub_avx2_6: movq %r9, 32(%rdi) sbbq %r8, %r10 movq %r10, 40(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_384_cond_sub_avx2_6,.-sp_384_cond_sub_avx2_6 @@ -45390,7 +62675,6 @@ sp_384_sub_in_place_6: .p2align 4 _sp_384_sub_in_place_6: #endif /* __APPLE__ */ - xorq %rax, %rax movq (%rsi), %rdx movq 8(%rsi), %rcx movq 16(%rsi), %r8 @@ -45403,7 +62687,7 @@ _sp_384_sub_in_place_6: sbbq %r9, 24(%rdi) sbbq %r10, 32(%rdi) sbbq %r11, 40(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_384_sub_in_place_6,.-sp_384_sub_in_place_6 @@ -47786,7 +65070,6 @@ sp_521_sub_9: _sp_521_sub_9: #endif /* __APPLE__ */ movq (%rsi), %rcx - xorq %rax, %rax subq (%rdx), %rcx movq 8(%rsi), %r8 movq %rcx, (%rdi) @@ -47813,7 +65096,7 @@ _sp_521_sub_9: movq %r8, 56(%rdi) sbbq 64(%rdx), %rcx movq %rcx, 64(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_521_sub_9,.-sp_521_sub_9 @@ -49055,7 +66338,6 @@ sp_521_cond_sub_9: _sp_521_cond_sub_9: #endif /* __APPLE__ */ subq $0x48, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -49119,7 +66401,7 @@ _sp_521_cond_sub_9: sbbq %rdx, %r8 movq %r9, 56(%rdi) movq %r8, 64(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x48, %rsp repz retq #ifndef __APPLE__ @@ -49142,6 +66424,84 @@ sp_521_mont_reduce_9: .globl _sp_521_mont_reduce_9 .p2align 4 _sp_521_mont_reduce_9: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + pushq %r14 + movq 64(%rdi), %rdx + movq 72(%rdi), %rax + movq 80(%rdi), %rcx + movq %rdx, %r14 + andq 
$0x1ff, %r14 + movq 88(%rdi), %r8 + movq 96(%rdi), %r9 + movq 104(%rdi), %r10 + movq 112(%rdi), %r11 + movq 120(%rdi), %r12 + movq 128(%rdi), %r13 + shrdq $9, %rax, %rdx + shrdq $9, %rcx, %rax + shrdq $9, %r8, %rcx + shrdq $9, %r9, %r8 + shrdq $9, %r10, %r9 + shrdq $9, %r11, %r10 + shrdq $9, %r12, %r11 + shrdq $9, %r13, %r12 + shrq $9, %r13 + addq (%rdi), %rdx + adcq 8(%rdi), %rax + adcq 16(%rdi), %rcx + adcq 24(%rdi), %r8 + adcq 32(%rdi), %r9 + adcq 40(%rdi), %r10 + adcq 48(%rdi), %r11 + adcq 56(%rdi), %r12 + adcq %r13, %r14 + movq %r14, %r13 + shrq $9, %r14 + andq $0x1ff, %r13 + addq %r14, %rdx + adcq $0x00, %rax + adcq $0x00, %rcx + adcq $0x00, %r8 + adcq $0x00, %r9 + adcq $0x00, %r10 + adcq $0x00, %r11 + adcq $0x00, %r12 + adcq $0x00, %r13 + movq %rdx, (%rdi) + movq %rax, 8(%rdi) + movq %rcx, 16(%rdi) + movq %r8, 24(%rdi) + movq %r9, 32(%rdi) + movq %r10, 40(%rdi) + movq %r11, 48(%rdi) + movq %r12, 56(%rdi) + movq %r13, 64(%rdi) + popq %r14 + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size sp_521_mont_reduce_9,.-sp_521_mont_reduce_9 +#endif /* __APPLE__ */ +/* Reduce the number back to 521 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +#ifndef __APPLE__ +.text +.globl sp_521_mont_reduce_order_9 +.type sp_521_mont_reduce_order_9,@function +.align 16 +sp_521_mont_reduce_order_9: +#else +.section __TEXT,__text +.globl _sp_521_mont_reduce_order_9 +.p2align 4 +_sp_521_mont_reduce_order_9: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -49153,14 +66513,14 @@ _sp_521_mont_reduce_9: movq $9, %r8 movq (%rdi), %r13 movq 8(%rdi), %r14 -L_521_mont_reduce_9_loop: +L_521_mont_reduce_order_9_loop: # mu = a[i] * mp movq %r13, %r11 imulq %rcx, %r11 cmpq $0x01, %r8 - jne L_521_mont_reduce_9_nomask + jne L_521_mont_reduce_order_9_nomask andq $0x1ff, %r11 -L_521_mont_reduce_9_nomask: +L_521_mont_reduce_order_9_nomask: # a[i+0] += m[0] * mu movq %r11, %rax xorq %r10, %r10 @@ -49251,7 +66611,7 @@ L_521_mont_reduce_9_nomask: # i -= 1 addq $8, %rdi decq %r8 - jnz L_521_mont_reduce_9_loop + jnz L_521_mont_reduce_order_9_loop movq %r13, (%rdi) movq %r14, 8(%rdi) movq %rdi, %rcx @@ -49309,7 +66669,7 @@ L_521_mont_reduce_9_nomask: popq %r12 repz retq #ifndef __APPLE__ -.size sp_521_mont_reduce_9,.-sp_521_mont_reduce_9 +.size sp_521_mont_reduce_order_9,.-sp_521_mont_reduce_order_9 #endif /* __APPLE__ */ /* Add two Montgomery form numbers (r = a + b % m). 
* @@ -50990,7 +68350,6 @@ sp_521_cond_sub_avx2_9: .p2align 4 _sp_521_cond_sub_avx2_9: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -51036,7 +68395,7 @@ _sp_521_cond_sub_avx2_9: movq %r9, 56(%rdi) sbbq %r8, %r10 movq %r10, 64(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_521_cond_sub_avx2_9,.-sp_521_cond_sub_avx2_9 @@ -51051,15 +68410,15 @@ _sp_521_cond_sub_avx2_9: */ #ifndef __APPLE__ .text -.globl sp_521_mont_reduce_avx2_9 -.type sp_521_mont_reduce_avx2_9,@function +.globl sp_521_mont_reduce_order_avx2_9 +.type sp_521_mont_reduce_order_avx2_9,@function .align 16 -sp_521_mont_reduce_avx2_9: +sp_521_mont_reduce_order_avx2_9: #else .section __TEXT,__text -.globl _sp_521_mont_reduce_avx2_9 +.globl _sp_521_mont_reduce_order_avx2_9 .p2align 4 -_sp_521_mont_reduce_avx2_9: +_sp_521_mont_reduce_order_avx2_9: #endif /* __APPLE__ */ pushq %r12 pushq %r13 @@ -51070,22 +68429,18 @@ _sp_521_mont_reduce_avx2_9: movq %rdx, %r8 xorq %rbp, %rbp # i = 9 - movq $9, %r9 + movq $8, %r9 movq (%rdi), %r12 movq 8(%rdi), %r13 movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $32, %rdi xorq %rbp, %rbp -L_521_mont_reduce_avx2_9_loop: +L_521_mont_reduce_order_avx2_9_loop: # mu = a[i] * mp movq %r12, %rdx movq %r12, %r10 imulq %r8, %rdx - cmpq $0x01, %r9 - jne L_521_mont_reduce_avx2_9_nomask - andq $0x1ff, %rdx -L_521_mont_reduce_avx2_9_nomask: xorq %rbx, %rbx # a[i+0] += m[0] * mu mulxq (%rsi), %rax, %rcx @@ -51142,19 +68497,140 @@ L_521_mont_reduce_avx2_9_nomask: movq %r11, 40(%rdi) adoxq %rbx, %rbp adcxq %rbx, %rbp + # mu = a[i] * mp + movq %r12, %rdx + movq %r12, %r11 + imulq %r8, %rdx + xorq %rbx, %rbx + # a[i+0] += m[0] * mu + mulxq (%rsi), %rax, %rcx + movq %r13, %r12 + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq %r11, -24(%rdi) + # a[i+1] += m[1] * mu + mulxq 8(%rsi), %rax, %rcx + movq %r14, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + # a[i+2] += m[2] * mu + mulxq 16(%rsi), %rax, %rcx + movq %r15, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + # a[i+3] += m[3] * mu + mulxq 24(%rsi), %rax, %rcx + movq 8(%rdi), %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + # a[i+4] += m[4] * mu + mulxq 32(%rsi), %rax, %rcx + movq 16(%rdi), %r10 + adcxq %rax, %r15 + adoxq %rcx, %r10 + # a[i+5] += m[5] * mu + mulxq 40(%rsi), %rax, %rcx + movq 24(%rdi), %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 16(%rdi) + # a[i+6] += m[6] * mu + mulxq 48(%rsi), %rax, %rcx + movq 32(%rdi), %r10 + adcxq %rax, %r11 + adoxq %rcx, %r10 + movq %r11, 24(%rdi) + # a[i+7] += m[7] * mu + mulxq 56(%rsi), %rax, %rcx + movq 40(%rdi), %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 32(%rdi) + # a[i+8] += m[8] * mu + mulxq 64(%rsi), %rax, %rcx + movq 48(%rdi), %r10 + adcxq %rax, %r11 + adoxq %rcx, %r10 + movq %r11, 40(%rdi) + adcxq %rbp, %r10 + movq %rbx, %rbp + movq %r10, 48(%rdi) + adoxq %rbx, %rbp + adcxq %rbx, %rbp + # a += 2 + addq $16, %rdi + # i -= 2 + subq $2, %r9 + jnz L_521_mont_reduce_order_avx2_9_loop + # mu = a[i] * mp + movq %r12, %rdx + movq %r12, %r10 + imulq %r8, %rdx + andq $0x1ff, %rdx + xorq %rbx, %rbx + # a[i+0] += m[0] * mu + mulxq (%rsi), %rax, %rcx + movq %r13, %r12 + adcxq %rax, %r10 + adoxq %rcx, %r12 + movq %r10, -32(%rdi) + # a[i+1] += m[1] * mu + mulxq 8(%rsi), %rax, %rcx + movq %r14, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + # a[i+2] += m[2] * mu + mulxq 16(%rsi), %rax, %rcx + movq %r15, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + # a[i+3] += m[3] * mu + mulxq 24(%rsi), %rax, %rcx + movq (%rdi), %r15 + adcxq %rax, %r14 + 
adoxq %rcx, %r15 + # a[i+4] += m[4] * mu + mulxq 32(%rsi), %rax, %rcx + movq 8(%rdi), %r11 + adcxq %rax, %r15 + adoxq %rcx, %r11 + # a[i+5] += m[5] * mu + mulxq 40(%rsi), %rax, %rcx + movq 16(%rdi), %r10 + adcxq %rax, %r11 + adoxq %rcx, %r10 + movq %r11, 8(%rdi) + # a[i+6] += m[6] * mu + mulxq 48(%rsi), %rax, %rcx + movq 24(%rdi), %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 16(%rdi) + # a[i+7] += m[7] * mu + mulxq 56(%rsi), %rax, %rcx + movq 32(%rdi), %r10 + adcxq %rax, %r11 + adoxq %rcx, %r10 + movq %r11, 24(%rdi) + # a[i+8] += m[8] * mu + mulxq 64(%rsi), %rax, %rcx + movq 40(%rdi), %r11 + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq %r10, 32(%rdi) + adcxq %rbp, %r11 + movq %rbx, %rbp + movq %r11, 40(%rdi) + adoxq %rbx, %rbp # a += 1 addq $8, %rdi - # i -= 1 - subq $0x01, %r9 - jnz L_521_mont_reduce_avx2_9_loop movq %r12, -32(%rdi) movq %r13, -24(%rdi) movq %r14, -16(%rdi) movq %r15, -8(%rdi) subq $32, %rdi - movq %rdi, %r8 + leaq -8(%rdi), %r8 subq $0x48, %rdi - subq $8, %r8 movq (%r8), %r10 movq 8(%r8), %r12 movq 16(%r8), %r13 @@ -51241,7 +68717,7 @@ L_521_mont_reduce_avx2_9_nomask: popq %r12 repz retq #ifndef __APPLE__ -.size sp_521_mont_reduce_avx2_9,.-sp_521_mont_reduce_avx2_9 +.size sp_521_mont_reduce_order_avx2_9,.-sp_521_mont_reduce_order_avx2_9 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ #ifdef HAVE_INTEL_AVX2 @@ -52213,7 +69689,6 @@ sp_521_sub_in_place_9: _sp_521_sub_in_place_9: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -52240,7 +69715,7 @@ _sp_521_sub_in_place_9: movq %rcx, 56(%rdi) sbbq 64(%rsi), %rdx movq %rdx, 64(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_521_sub_in_place_9,.-sp_521_sub_in_place_9 @@ -52580,6 +70055,10 @@ L_521_mod_inv_9_div2_mod_no_add: movq %r9, 56(%rdi) shrq $0x01, %rax movq %rax, 64(%rdi) + repz retq +#ifndef __APPLE__ +.size sp_521_div2_mod_9,.-sp_521_div2_mod_9 +#endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sp_521_num_bits_9 @@ -58245,7 +75724,6 @@ sp_1024_sub_in_place_16: _sp_1024_sub_in_place_16: #endif /* __APPLE__ */ movq (%rdi), %rdx - xorq %rax, %rax subq (%rsi), %rdx movq 8(%rdi), %rcx movq %rdx, (%rdi) @@ -58293,7 +75771,7 @@ _sp_1024_sub_in_place_16: movq %rdx, 112(%rdi) sbbq 120(%rsi), %rcx movq %rcx, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_1024_sub_in_place_16,.-sp_1024_sub_in_place_16 @@ -58319,7 +75797,6 @@ sp_1024_cond_sub_16: _sp_1024_cond_sub_16: #endif /* __APPLE__ */ subq $0x80, %rsp - movq $0x00, %rax movq (%rdx), %r8 movq 8(%rdx), %r9 andq %rcx, %r8 @@ -58432,7 +75909,7 @@ _sp_1024_cond_sub_16: sbbq %rdx, %r9 movq %r8, 112(%rdi) movq %r9, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax addq $0x80, %rsp repz retq #ifndef __APPLE__ @@ -58459,7 +75936,6 @@ sp_1024_cond_sub_avx2_16: .p2align 4 _sp_1024_cond_sub_avx2_16: #endif /* __APPLE__ */ - movq $0x00, %rax movq (%rdx), %r10 movq (%rsi), %r8 pextq %rcx, %r10, %r10 @@ -58540,7 +76016,7 @@ _sp_1024_cond_sub_avx2_16: movq %r10, 112(%rdi) sbbq %r9, %r8 movq %r8, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq #ifndef __APPLE__ .size sp_1024_cond_sub_avx2_16,.-sp_1024_cond_sub_avx2_16 @@ -60332,7 +77808,6 @@ sp_1024_sub_16: _sp_1024_sub_16: #endif /* __APPLE__ */ movq (%rsi), %rcx - xorq %rax, %rax subq (%rdx), %rcx movq 8(%rsi), %r8 movq %rcx, (%rdi) @@ -60380,7 +77855,7 @@ _sp_1024_sub_16: movq %rcx, 112(%rdi) sbbq 120(%rdx), %r8 movq %r8, 120(%rdi) - sbbq $0x00, %rax + sbbq %rax, %rax repz retq 
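+/* Note on the return value used throughout these sub/cond_sub routines:
+ * the result is now produced with a single "sbbq %rax, %rax" after the
+ * borrow chain instead of zeroing %rax first and ending with
+ * "sbbq $0x00, %rax".  Both forms yield 0 when there was no borrow and
+ * all ones when there was (rax - rax - CF = -CF), so the same 0 / -1
+ * mask is returned with one instruction less.  A hedged C model of what
+ * such a multi-precision subtract returns (illustrative name only, using
+ * the GCC/Clang __int128 extension):
+ *
+ *   #include <stdint.h>
+ *
+ *   // r = a - b over n 64-bit limbs; returns 0 if a >= b, otherwise the
+ *   // all-ones borrow mask, matching the final "sbbq %rax, %rax".
+ *   static uint64_t sub_n_mask(uint64_t* r, const uint64_t* a,
+ *                              const uint64_t* b, int n)
+ *   {
+ *       uint64_t borrow = 0;
+ *       for (int i = 0; i < n; i++) {
+ *           unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
+ *           r[i] = (uint64_t)t;
+ *           borrow = (uint64_t)(t >> 64) & 1;  // 1 when this limb wrapped
+ *       }
+ *       return (uint64_t)0 - borrow;           // 0 or 0xffff...ffff
+ *   }
+ */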
#ifndef __APPLE__ .size sp_1024_sub_16,.-sp_1024_sub_16 diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index b991cfbda..a46b83bd3 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -3777,7 +3777,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_2048_sub_in_place_32 PROC mov r8, QWORD PTR [rcx] - xor rax, rax sub r8, QWORD PTR [rdx] mov r9, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 @@ -3873,7 +3872,7 @@ sp_2048_sub_in_place_32 PROC mov QWORD PTR [rcx+240], r8 sbb r9, QWORD PTR [rdx+248] mov QWORD PTR [rcx+248], r9 - sbb rax, 0 + sbb rax, rax ret sp_2048_sub_in_place_32 ENDP _text ENDS @@ -7497,1192 +7496,1038 @@ L_end_2048_sqr_avx2_16: sp_2048_sqr_avx2_16 ENDP _text ENDS ENDIF -; /* Add a to a into r. (r = a + a) -; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_2048_dbl_16 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r8 - adc r9, r9 - mov QWORD PTR [rcx+120], r9 - adc rax, 0 - ret -sp_2048_dbl_16 ENDP -_text ENDS ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
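+; *
+; * With the operand split as a = ah*B + al, where B = 2^(64*16) = 2^1024
+; * for this 32-limb square, the identity used is
+; *   a^2 = ah^2*B^2 + (ah^2 + al^2 - (al - ah)^2)*B + al^2
+; * because 2*ah*al = ah^2 + al^2 - (al - ah)^2.  That is why the code below
+; * squares only ah, al and |al - ah| (the latter obtained via the branch-free
+; * "Cond Negate" step) and then combines the three squares with additions
+; * and subtractions.  A hedged, self-contained C check of the identity on
+; * small values (illustrative only, not part of the generated code):
+; *
+; *   #include <stdint.h>
+; *   #include <assert.h>
+; *
+; *   int main(void)
+; *   {
+; *       uint64_t ah = 0x12345678u, al = 0x9abcdef0u, B = 1ull << 32;
+; *       unsigned __int128 a  = (unsigned __int128)ah * B + al;
+; *       unsigned __int128 d  = (al >= ah) ? (al - ah) : (ah - al);
+; *       unsigned __int128 hh = (unsigned __int128)ah * ah;
+; *       unsigned __int128 ll = (unsigned __int128)al * al;
+; *       unsigned __int128 sq = hh*B*B + (hh + ll - d*d)*B + ll;
+; *       assert(sq == a * a);  // B = 2^32 here only so the check fits 128 bits
+; *       return 0;
+; *   }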
; */ _text SEGMENT READONLY PARA sp_2048_sqr_32 PROC - push r12 - sub rsp, 664 - mov QWORD PTR [rsp+640], rcx - mov QWORD PTR [rsp+648], rdx - lea r10, QWORD PTR [rsp+512] + sub rsp, 272 + mov QWORD PTR [rsp+256], rcx + mov QWORD PTR [rsp+264], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+128] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] + mov QWORD PTR [r10+120], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD 
PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+120], r8 - adc r9, 0 - mov QWORD PTR [rsp+656], r9 mov rdx, r10 mov rcx, rsp call sp_2048_sqr_16 - mov rdx, QWORD PTR [rsp+648] - lea rcx, QWORD PTR [rsp+256] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] add rdx, 128 + add rcx, 256 call sp_2048_sqr_16 - mov rdx, QWORD PTR [rsp+648] - mov rcx, QWORD PTR [rsp+640] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] call sp_2048_sqr_16 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+648] - mov rcx, QWORD PTR [rsp+640] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] ENDIF - mov r12, QWORD PTR [rsp+656] - lea r10, QWORD PTR [rsp+512] - mov r9, r12 - neg r12 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+256], rax - mov QWORD PTR [rcx+264], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+272], rax - mov QWORD PTR [rcx+280], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+288], rax - mov QWORD PTR [rcx+296], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+304], rax - mov QWORD PTR [rcx+312], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+320], rax - mov QWORD PTR [rcx+328], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+336], rax - mov QWORD PTR [rcx+344], r8 - mov rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r10+104] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+352], rax - mov QWORD PTR [rcx+360], r8 - mov rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r10+120] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+368], rax - mov QWORD PTR [rcx+376], r8 - mov rax, QWORD PTR [rcx+256] - add rax, rax - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, r8 - mov QWORD PTR [rcx+376], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+256] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov 
QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov QWORD PTR [r10+248], r8 + mov rdx, QWORD PTR [rsp+256] + lea r10, QWORD PTR [rsp+128] + add rdx, 384 + mov r9, 0 + mov r8, QWORD PTR [r10+-128] + sub r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD 
PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov QWORD PTR [r10+120], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] 
- mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov QWORD PTR [r10+248], r8 + sub rdx, 256 + mov r8, QWORD PTR [r10+-128] + sub r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] 
+ mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov QWORD PTR [r10+120], rax sbb r9, 0 - ; Add in place - mov rax, QWORD PTR [rcx+128] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [rcx+136] - mov QWORD PTR [rcx+128], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [rcx+144] - mov QWORD PTR [rcx+136], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [rcx+152] - mov QWORD PTR [rcx+144], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [rcx+160] - mov QWORD PTR [rcx+152], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [rcx+168] - mov QWORD PTR [rcx+160], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [rcx+176] - mov QWORD PTR [rcx+168], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [rcx+184] - mov QWORD PTR [rcx+176], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [rcx+192] - mov QWORD PTR [rcx+184], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [rcx+200] - mov QWORD PTR [rcx+192], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [rcx+208] - mov QWORD PTR [rcx+200], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [rcx+216] - mov QWORD PTR [rcx+208], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [rcx+224] - mov QWORD PTR [rcx+216], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [rcx+232] - mov QWORD PTR [rcx+224], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [rcx+240] - mov QWORD PTR [rcx+232], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [rcx+248] - mov QWORD PTR [rcx+240], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [rcx+256] - mov QWORD PTR [rcx+248], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR 
[rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [r10+248] - mov QWORD PTR [rcx+376], r8 - adc r9, 0 - mov QWORD PTR [rcx+384], r9 - ; Add in place - mov rax, QWORD PTR [rcx+256] - xor r9, r9 - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [rcx+384] - mov QWORD PTR [rcx+376], r8 - adc rax, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+384], rax - adc r9, 0 - ; Add to zero - mov rax, QWORD PTR [rdx+136] + mov rcx, QWORD PTR [rsp+256] + neg r9 + add rcx, 256 + mov r8, QWORD PTR [rcx+-128] + sub r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + 
mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov QWORD PTR [rcx+120], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+256] + add rcx, 384 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+392], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+400], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+408], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+416], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+424], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+432], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+440], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+448], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+456], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+464], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+472], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+480], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+488], rax + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+496], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov QWORD PTR [rcx+504], rax - add rsp, 664 - pop r12 + mov QWORD PTR [rcx+120], rax + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] + add rsp, 272 ret sp_2048_sqr_32 ENDP _text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
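+; *
+; * This AVX2 variant uses the same Karatsuba split as sp_2048_sqr_32 above:
+; * it squares ah, al and |al - ah|, where |al - ah| comes from the
+; * "Cond Negate" sequence (every limb is XORed with the borrow mask from
+; * al - ah and the borrow is added back, i.e. a two's-complement negate
+; * applied only when al < ah).  A hedged C model of that step, with
+; * illustrative names that are not part of the generated code:
+; *
+; *   #include <stdint.h>
+; *
+; *   // t holds al - ah computed with borrow propagation; mask is 0 when
+; *   // al >= ah and all ones otherwise.  Afterwards t holds |al - ah|.
+; *   static void cond_negate(uint64_t* t, uint64_t mask, int n)
+; *   {
+; *       uint64_t carry = mask & 1;            // add 1 only when negating
+; *       for (int i = 0; i < n; i++) {
+; *           unsigned __int128 s = (unsigned __int128)(t[i] ^ mask) + carry;
+; *           t[i]  = (uint64_t)s;
+; *           carry = (uint64_t)(s >> 64);
+; *       }
+; *   }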
; */ _text SEGMENT READONLY PARA sp_2048_sqr_avx2_32 PROC - push r12 - sub rsp, 664 - mov QWORD PTR [rsp+640], rcx - mov QWORD PTR [rsp+648], rdx - lea r10, QWORD PTR [rsp+512] + sub rsp, 272 + mov QWORD PTR [rsp+256], rcx + mov QWORD PTR [rsp+264], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+128] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] + mov QWORD PTR [r10+120], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, 
QWORD PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+120], r8 - adc r9, 0 - mov QWORD PTR [rsp+656], r9 mov rdx, r10 mov rcx, rsp call sp_2048_sqr_avx2_16 - mov rdx, QWORD PTR [rsp+648] - lea rcx, QWORD PTR [rsp+256] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] add rdx, 128 + add rcx, 256 call sp_2048_sqr_avx2_16 - mov rdx, QWORD PTR [rsp+648] - mov rcx, QWORD PTR [rsp+640] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] call sp_2048_sqr_avx2_16 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+648] - mov rcx, QWORD PTR [rsp+640] + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] ENDIF - mov r12, QWORD PTR [rsp+656] - lea r10, QWORD PTR [rsp+512] - mov r9, r12 - neg r12 - mov rax, QWORD PTR [r10] - pext rax, rax, r12 - add rax, rax - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [rcx+256], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [rcx+264], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [rcx+272], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [rcx+280], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [rcx+288], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [rcx+296], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [rcx+304], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [rcx+312], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [rcx+320], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [rcx+328], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [rcx+336], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [rcx+344], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [rcx+352], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [rcx+360], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [rcx+368], rax - pext r8, r8, r12 - adc r8, r8 - mov QWORD PTR [rcx+376], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+256] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD 
PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov QWORD PTR [r10+248], r8 + mov rdx, QWORD PTR [rsp+256] + lea r10, QWORD PTR [rsp+128] + add rdx, 384 + mov r9, 0 + mov r8, QWORD PTR [r10+-128] + sub r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD 
PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov QWORD PTR [r10+120], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, 
QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov QWORD PTR [r10+248], r8 + sub rdx, 256 + mov r8, QWORD PTR [r10+-128] + sub r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov QWORD PTR [r10+120], rax sbb r9, 0 - ; Add in place - mov rax, QWORD PTR [rcx+128] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [rcx+136] - mov QWORD PTR [rcx+128], rax 
- adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [rcx+144] - mov QWORD PTR [rcx+136], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [rcx+152] - mov QWORD PTR [rcx+144], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [rcx+160] - mov QWORD PTR [rcx+152], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [rcx+168] - mov QWORD PTR [rcx+160], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [rcx+176] - mov QWORD PTR [rcx+168], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [rcx+184] - mov QWORD PTR [rcx+176], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [rcx+192] - mov QWORD PTR [rcx+184], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [rcx+200] - mov QWORD PTR [rcx+192], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [rcx+208] - mov QWORD PTR [rcx+200], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [rcx+216] - mov QWORD PTR [rcx+208], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [rcx+224] - mov QWORD PTR [rcx+216], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [rcx+232] - mov QWORD PTR [rcx+224], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [rcx+240] - mov QWORD PTR [rcx+232], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [rcx+248] - mov QWORD PTR [rcx+240], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [rcx+256] - mov QWORD PTR [rcx+248], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [r10+248] - mov QWORD PTR [rcx+376], r8 - adc r9, 0 - mov QWORD PTR [rcx+384], r9 - ; Add in place - mov rax, QWORD PTR [rcx+256] - xor r9, r9 - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [rdx+56] 
- mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [rcx+384] - mov QWORD PTR [rcx+376], r8 - adc rax, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+384], rax - adc r9, 0 - ; Add to zero - mov rax, QWORD PTR [rdx+136] + mov rcx, QWORD PTR [rsp+256] + neg r9 + add rcx, 256 + mov r8, QWORD PTR [rcx+-128] + sub r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov 
rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov QWORD PTR [rcx+120], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+256] + add rcx, 384 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+392], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+400], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+408], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+416], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+424], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+432], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+440], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+448], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+456], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+464], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+472], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+480], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+488], rax + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+496], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov QWORD PTR [rcx+504], rax - add rsp, 664 - pop r12 + mov QWORD PTR [rcx+120], rax + mov rdx, QWORD PTR [rsp+264] + mov rcx, QWORD PTR [rsp+256] + add rsp, 272 ret sp_2048_sqr_avx2_32 ENDP _text ENDS @@ -8695,7 +8540,6 @@ ENDIF _text SEGMENT READONLY PARA sp_2048_sub_in_place_16 PROC mov r8, QWORD PTR [rcx] - xor rax, rax sub r8, QWORD PTR [rdx] mov r9, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 @@ -8743,7 +8587,7 @@ sp_2048_sub_in_place_16 PROC mov QWORD PTR [rcx+112], r8 sbb r9, QWORD PTR [rdx+120] mov QWORD PTR [rcx+120], r9 - sbb rax, 0 + sbb rax, rax ret sp_2048_sub_in_place_16 ENDP _text ENDS @@ -9026,7 +8870,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_2048_cond_sub_16 PROC sub rsp, 128 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -9139,7 +8982,7 @@ sp_2048_cond_sub_16 PROC sbb r11, r8 mov QWORD PTR [rcx+112], r10 mov QWORD PTR [rcx+120], r11 - sbb rax, 0 + sbb rax, rax add rsp, 128 ret sp_2048_cond_sub_16 ENDP @@ -9363,7 +9206,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_2048_cond_sub_avx2_16 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -9444,7 +9286,7 @@ sp_2048_cond_sub_avx2_16 PROC mov QWORD PTR [rcx+112], r12 sbb 
r10, r11 mov QWORD PTR [rcx+120], r10 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_2048_cond_sub_avx2_16 ENDP @@ -9870,6 +9712,1162 @@ sp_2048_cmp_16 PROC ret sp_2048_cmp_16 ENDP _text ENDS +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_2048_get_from_table_16 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + pxor xmm13, xmm13 + pshufd xmm11, xmm11, 0 + pshufd xmm10, xmm10, 0 + ; START: 0-7 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + 
paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + 
movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 
+ pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 0-7 + ; START: 8-15 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 64 + 
movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 64 + 
movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 
64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + ; END: 8-15 + ret +sp_2048_get_from_table_16 ENDP +_text ENDS +ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 2048 bits using Montgomery reduction. 
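Editorial note on the table-fetch routines added above: sp_2048_get_from_table_16 (and the _32 and AVX2 variants added later in this file) select one precomputed table entry without a secret-dependent memory access pattern. Every one of the 32 entries is loaded, AND-ed with an all-ones/all-zeros mask built by comparing the loop position against the requested index (pcmpeqd / vpcmpeqd), and OR-ed into the result, so the loads performed do not depend on the index. Below is a minimal C sketch of the same masking idea under the WC_NO_CACHE_RESISTANT-style assumption; the function and parameter names are illustrative only and are not part of this patch.

    #include <stdint.h>
    #include <string.h>

    /* Constant-time table fetch (sketch): read every one of the `count`
     * entries and mask all but entry `idx` to zero, so the memory access
     * pattern is independent of the secret index. Each table entry points
     * at `limbs` 64-bit words. */
    static void get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                                  size_t count, size_t limbs, size_t idx)
    {
        size_t i, j;

        memset(r, 0, limbs * sizeof(uint64_t));
        for (i = 0; i < count; i++) {
            /* all-ones when i == idx, all-zeros otherwise; the assembly
             * builds the equivalent mask branch-free with SIMD compares */
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);

            for (j = 0; j < limbs; j++) {
                r[j] |= table[i][j] & mask;
            }
        }
    }

In the _16 variant above this corresponds to count = 32 and limbs = 16 (each entry is read as two 64-byte halves); the _32 variants use the same pattern with larger entries.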
; * @@ -10202,6 +11200,542 @@ L_2048_mont_reduce_avx2_16_loop: sp_2048_mont_reduce_avx2_16 ENDP _text ENDS ENDIF +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_2048_get_from_table_avx2_16 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + vpxor ymm13, ymm13, ymm13 + vpermd ymm10, ymm13, ymm10 + vpermd ymm11, ymm13, ymm11 + ; START: 0-15 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 
+ vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + 
vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, 
ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, 
ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + ; END: 0-15 + ret +sp_2048_get_from_table_avx2_16 ENDP +_text ENDS +ENDIF ; /* Conditionally subtract b from a using the mask m. ; * m is -1 to subtract and 0 when not copying. ; * @@ -10213,7 +11747,6 @@ ENDIF _text SEGMENT READONLY PARA sp_2048_cond_sub_32 PROC sub rsp, 256 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -10438,7 +11971,7 @@ sp_2048_cond_sub_32 PROC sbb r11, r8 mov QWORD PTR [rcx+240], r10 mov QWORD PTR [rcx+248], r11 - sbb rax, 0 + sbb rax, rax add rsp, 256 ret sp_2048_cond_sub_32 ENDP @@ -10819,7 +12352,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_2048_sub_32 PROC mov r9, QWORD PTR [rdx] - xor rax, rax sub r9, QWORD PTR [r8] mov r10, QWORD PTR [rdx+8] mov QWORD PTR [rcx], r9 @@ -10915,7 +12447,7 @@ sp_2048_sub_32 PROC mov QWORD PTR [rcx+240], r9 sbb r10, QWORD PTR [r8+248] mov QWORD PTR [rcx+248], r10 - sbb rax, 0 + sbb rax, rax ret sp_2048_sub_32 ENDP _text ENDS @@ -11160,7 +12692,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_2048_cond_sub_avx2_32 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -11321,7 +12852,7 @@ sp_2048_cond_sub_avx2_32 PROC mov QWORD PTR [rcx+240], r10 sbb r11, r12 mov QWORD PTR [rcx+248], r11 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_2048_cond_sub_avx2_32 ENDP @@ -11602,6 +13133,4610 @@ sp_2048_cmp_32 PROC ret sp_2048_cmp_32 ENDP _text ENDS +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_2048_get_from_table_32 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + pxor xmm13, xmm13 + pshufd xmm11, xmm11, 0 + pshufd xmm10, xmm10, 0 + ; START: 0-7 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, 
xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd 
xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + 
movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 
+ pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 41 + mov r9, QWORD PTR [rdx+328] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 44 + mov r9, QWORD PTR [rdx+352] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 45 + 
mov r9, QWORD PTR [rdx+360] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 51 + mov r9, QWORD PTR [rdx+408] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 52 + mov r9, QWORD PTR [rdx+416] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu 
xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 60 + mov r9, QWORD PTR [rdx+480] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 0-7 + ; START: 8-15 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD 
PTR [rdx+8] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] 
+ add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR 
[rdx+168] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, 
QWORD PTR [rdx+248] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 41 + 
mov r9, QWORD PTR [rdx+328] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 44 + mov r9, QWORD PTR [rdx+352] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 45 + mov r9, QWORD PTR [rdx+360] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 
51 + mov r9, QWORD PTR [rdx+408] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 52 + mov r9, QWORD PTR [rdx+416] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 60 + mov r9, QWORD PTR [rdx+480] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; 
ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 8-15 + ; START: 16-23 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu 
xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + 
movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + 
movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd 
xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 41 + mov r9, QWORD PTR [rdx+328] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 44 + mov r9, QWORD PTR [rdx+352] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 45 + mov r9, QWORD PTR [rdx+360] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + add r9, 128 + movdqu xmm12, 
xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 51 + mov r9, QWORD PTR [rdx+408] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 52 + mov r9, QWORD PTR [rdx+416] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + add r9, 128 
+ movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 60 + mov r9, QWORD PTR [rdx+480] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 16-23 + ; START: 24-31 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + 
movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + 
movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu 
xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, 
[r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 41 + mov r9, QWORD PTR [rdx+328] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] 
+ movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 44 + mov r9, QWORD PTR [rdx+352] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 45 + mov r9, QWORD PTR [rdx+360] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 51 + mov r9, QWORD PTR [rdx+408] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + 
movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 52 + mov r9, QWORD PTR [rdx+416] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 60 + mov r9, QWORD PTR [rdx+480] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd 
xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + ; END: 24-31 + ret +sp_2048_get_from_table_32 ENDP +_text ENDS +ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 2048 bits using Montgomery reduction. ; * @@ -12009,6 +18144,2154 @@ L_2048_mont_reduce_avx2_32_loop: sp_2048_mont_reduce_avx2_32 ENDP _text ENDS ENDIF +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_2048_get_from_table_avx2_32 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + vpxor ymm13, ymm13, ymm13 + vpermd ymm10, ymm13, ymm10 + vpermd ymm11, ymm13, ymm11 + ; START: 0-15 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + vpcmpeqd ymm12, 
ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + vpcmpeqd ymm12, ymm13, 
ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + vpcmpeqd ymm12, 
ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + vpcmpeqd 
ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + 
vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 41 + mov r9, QWORD PTR [rdx+328] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 44 + mov r9, QWORD PTR 
[rdx+352] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 45 + mov r9, QWORD PTR [rdx+360] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 51 + mov r9, QWORD PTR [rdx+408] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 52 + mov r9, 
QWORD PTR [rdx+416] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 60 + 
mov r9, QWORD PTR [rdx+480] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 0-15 + ; START: 16-31 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, 
YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 
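The routines added in this hunk, sp_2048_get_from_table_32 and its AVX2 counterpart sp_2048_get_from_table_avx2_32, implement a cache-timing-resistant fetch from the 2048-bit Montgomery exponentiation window table: rather than loading only the selected precomputed entry, every one of the 64 entries is loaded, ANDed with a mask that is all-ones only when the running counter (xmm13/ymm13) equals the broadcast index (xmm10/ymm10), and ORed into the accumulator registers, so the memory access pattern is independent of the secret index. A minimal C sketch of the same select-by-mask idea follows; the function and parameter names are illustrative, not the wolfSSL API.

#include <stdint.h>
#include <string.h>

/* Constant-time table fetch: read every entry and keep only the one whose
 * position matches idx, so memory access does not depend on idx.
 * Illustrative sketch only -- not the generated assembly above. */
static void get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                              int entries, int words, int idx)
{
    int i, j;

    memset(r, 0, (size_t)words * sizeof(uint64_t));
    for (i = 0; i < entries; i++) {
        /* all-ones when i == idx, all-zeros otherwise */
        uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
        for (j = 0; j < words; j++) {
            r[j] |= table[i][j] & mask;
        }
    }
}

The generated assembly unrolls both loops completely and works on one slice of each entry per pass, 64 bytes at a time in the SSE2 version (xmm4-xmm7) and 128 bytes at a time in the AVX2 version (ymm4-ymm7), which is why the same 64-entry sequence repeats for each slice with a growing offset added to r9.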
+ ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor 
ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR 
[r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 32 + mov r9, QWORD PTR [rdx+256] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 33 + mov r9, QWORD PTR [rdx+264] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 34 + mov r9, QWORD PTR [rdx+272] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, 
YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 35 + mov r9, QWORD PTR [rdx+280] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 36 + mov r9, QWORD PTR [rdx+288] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 37 + mov r9, QWORD PTR [rdx+296] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 38 + mov r9, QWORD PTR [rdx+304] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 39 + mov r9, QWORD PTR [rdx+312] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 40 + mov r9, QWORD PTR [rdx+320] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 41 + mov r9, QWORD PTR [rdx+328] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, 
ymm13, ymm11 + ; ENTRY: 42 + mov r9, QWORD PTR [rdx+336] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 43 + mov r9, QWORD PTR [rdx+344] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 44 + mov r9, QWORD PTR [rdx+352] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 45 + mov r9, QWORD PTR [rdx+360] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 46 + mov r9, QWORD PTR [rdx+368] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 47 + mov r9, QWORD PTR [rdx+376] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 48 + mov r9, QWORD PTR [rdx+384] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 49 + mov r9, QWORD PTR [rdx+392] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, 
ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 50 + mov r9, QWORD PTR [rdx+400] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 51 + mov r9, QWORD PTR [rdx+408] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 52 + mov r9, QWORD PTR [rdx+416] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 53 + mov r9, QWORD PTR [rdx+424] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 54 + mov r9, QWORD PTR [rdx+432] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 55 + mov r9, QWORD PTR [rdx+440] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 56 + mov r9, QWORD PTR [rdx+448] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 57 + mov r9, QWORD PTR [rdx+456] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, 
YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 58 + mov r9, QWORD PTR [rdx+464] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 59 + mov r9, QWORD PTR [rdx+472] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 60 + mov r9, QWORD PTR [rdx+480] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 61 + mov r9, QWORD PTR [rdx+488] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 62 + mov r9, QWORD PTR [rdx+496] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 63 + mov r9, QWORD PTR [rdx+504] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + ; END: 16-31 + ret +sp_2048_get_from_table_avx2_32 ENDP +_text ENDS +ENDIF ; /* Conditionally add a and b using the mask m. ; * m is -1 to add and 0 when not. 
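The sp_2048_get_from_table_avx2_32 routine that closes above is a constant-time table lookup: every one of the 64 precomputed entries is loaded, the running counter in ymm13 is compared lane-wise against the wanted index in ymm10 with vpcmpeqd to produce an all-ones/all-zeros mask, the entry is masked with vpand, and the result is folded into the accumulators ymm4-ymm7 with vpor, so the load pattern never depends on the secret window index (this unrolled run covers words 16-31 of each entry, per the "; END: 16-31" marker). A minimal C sketch of the same idea using AVX2 intrinsics follows; the function name, table layout and limb counts are illustrative assumptions, not the library's API.

#include <immintrin.h>
#include <stdint.h>

/* Illustrative analogue (not wolfSSL's API) of the get_from_table pattern
 * above: scan every entry, build an all-ones mask only for the wanted
 * index, AND the entry with the mask and OR it into the accumulator.
 * `table` is assumed to be an array of pointers to big-number entries of
 * `limbs` 64-bit words, with `limbs` a multiple of 4. */
static void get_from_table_avx2(uint64_t* r, const uint64_t** table,
                                int entries, int limbs, int idx)
{
    const __m256i want = _mm256_set1_epi32(idx);   /* broadcast wanted index */
    const __m256i one  = _mm256_set1_epi32(1);

    for (int j = 0; j < limbs; j += 4) {           /* 4 x 64-bit limbs / YMM */
        __m256i acc = _mm256_setzero_si256();
        __m256i cnt = _mm256_setzero_si256();      /* running entry counter  */
        for (int i = 0; i < entries; i++) {
            /* all-ones when cnt == want, all-zeros otherwise (vpcmpeqd) */
            __m256i mask = _mm256_cmpeq_epi32(cnt, want);
            __m256i ent  = _mm256_loadu_si256((const __m256i*)(table[i] + j));
            acc = _mm256_or_si256(acc, _mm256_and_si256(ent, mask));
            cnt = _mm256_add_epi32(cnt, one);
        }
        _mm256_storeu_si256((__m256i*)(r + j), acc);
    }
}

Reading all entries and selecting with a mask trades memory bandwidth for a data-independent access pattern; the assembly additionally keeps four accumulator registers live so it pulls 16 limbs of every entry per pass instead of the single YMM register shown here.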
 ; *
@@ -14764,7 +23047,6 @@ _text ENDS
 _text SEGMENT READONLY PARA
 sp_3072_sub_in_place_24 PROC
         mov r8, QWORD PTR [rcx]
-        xor rax, rax
         sub r8, QWORD PTR [rdx]
         mov r9, QWORD PTR [rcx+8]
         mov QWORD PTR [rcx], r8
@@ -14836,7 +23118,7 @@ sp_3072_sub_in_place_24 PROC
         mov QWORD PTR [rcx+176], r8
         sbb r9, QWORD PTR [rdx+184]
         mov QWORD PTR [rcx+184], r9
-        sbb rax, 0
+        sbb rax, rax
         ret
 sp_3072_sub_in_place_24 ENDP
 _text ENDS
@@ -15998,7 +24280,6 @@ ENDIF
 _text SEGMENT READONLY PARA
 sp_3072_sub_in_place_48 PROC
         mov r8, QWORD PTR [rcx]
-        xor rax, rax
         sub r8, QWORD PTR [rdx]
         mov r9, QWORD PTR [rcx+8]
         mov QWORD PTR [rcx], r8
@@ -16142,7 +24423,7 @@ sp_3072_sub_in_place_48 PROC
         mov QWORD PTR [rcx+368], r8
         sbb r9, QWORD PTR [rdx+376]
         mov QWORD PTR [rcx+376], r9
-        sbb rax, 0
+        sbb rax, rax
         ret
 sp_3072_sub_in_place_48 ENDP
 _text ENDS
@@ -19579,2654 +27860,2306 @@ L_end_3072_sqr_avx2_12:
 sp_3072_sqr_avx2_12 ENDP
 _text ENDS
 ENDIF
-; /* Add a to a into r. (r = a + a)
-; *
-; * r A single precision integer.
-; * a A single precision integer.
-; */
-_text SEGMENT READONLY PARA
-sp_3072_dbl_12 PROC
-        mov r8, QWORD PTR [rdx]
-        xor rax, rax
-        add r8, r8
-        mov r9, QWORD PTR [rdx+8]
-        mov QWORD PTR [rcx], r8
-        adc r9, r9
-        mov r8, QWORD PTR [rdx+16]
-        mov QWORD PTR [rcx+8], r9
-        adc r8, r8
-        mov r9, QWORD PTR [rdx+24]
-        mov QWORD PTR [rcx+16], r8
-        adc r9, r9
-        mov r8, QWORD PTR [rdx+32]
-        mov QWORD PTR [rcx+24], r9
-        adc r8, r8
-        mov r9, QWORD PTR [rdx+40]
-        mov QWORD PTR [rcx+32], r8
-        adc r9, r9
-        mov r8, QWORD PTR [rdx+48]
-        mov QWORD PTR [rcx+40], r9
-        adc r8, r8
-        mov r9, QWORD PTR [rdx+56]
-        mov QWORD PTR [rcx+48], r8
-        adc r9, r9
-        mov r8, QWORD PTR [rdx+64]
-        mov QWORD PTR [rcx+56], r9
-        adc r8, r8
-        mov r9, QWORD PTR [rdx+72]
-        mov QWORD PTR [rcx+64], r8
-        adc r9, r9
-        mov r8, QWORD PTR [rdx+80]
-        mov QWORD PTR [rcx+72], r9
-        adc r8, r8
-        mov r9, QWORD PTR [rdx+88]
-        mov QWORD PTR [rcx+80], r8
-        adc r9, r9
-        mov QWORD PTR [rcx+88], r9
-        adc rax, 0
-        ret
-sp_3072_dbl_12 ENDP
-_text ENDS
 ; /* Square a and put result in r. (r = a * a)
 ; *
+; * Karatsuba: ah^2, al^2, (al - ah)^2
+; *
 ; * r A single precision integer.
 ; * a A single precision integer.
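Two related changes are visible in the hunks above. First, sp_3072_sub_in_place_24/_48 no longer pre-clear rax: the final "sbb rax, rax" materialises the borrow directly as 0 or all-ones, the standard x86 idiom for turning the carry flag into a mask, so the return value is unchanged but one instruction and the dependency on a pre-zeroed register disappear. Second, sp_3072_dbl_12 is deleted because the rewritten Karatsuba squaring below never forms al + ah (whose carry word previously had to be doubled and folded back in); it computes |al - ah| with a subtract followed by a borrow-mask conditional negate and squares that, relying on (al - ah)^2 = (ah - al)^2. A small standalone check of the identity the new code uses is sketched next; the 16-bit halves are an arbitrary choice so every term fits in uint64_t.

#include <stdint.h>
#include <stdio.h>

/* Standalone demo (not wolfSSL code) of the identity behind the rewritten
 * sp_3072_sqr_24 / sp_3072_sqr_avx2_24 ("Karatsuba: ah^2, al^2, (al - ah)^2").
 * With a = al + ah*B:
 *     a^2 = al^2 + (al^2 + ah^2 - (al - ah)^2)*B + ah^2*B^2
 * because 2*al*ah = al^2 + ah^2 - (al - ah)^2, and the sign of (al - ah)
 * does not matter.  The assembly works on 12-word (768-bit) halves and gets
 * |al - ah| via a subtract plus a conditional negate keyed off the borrow;
 * here 16-bit halves keep everything inside uint64_t. */
int main(void)
{
    uint64_t al = 0x9e37, ah = 0x79b9;            /* arbitrary 16-bit halves */
    uint64_t B  = (uint64_t)1 << 16;
    uint64_t a  = al + ah * B;

    uint64_t d   = (al > ah) ? (al - ah) : (ah - al);  /* |al - ah|          */
    uint64_t mid = al * al + ah * ah - d * d;          /* equals 2*al*ah     */
    uint64_t sq  = al * al + mid * B + ah * ah * B * B;

    printf("identity holds: %s\n", (sq == a * a) ? "yes" : "no");
    return 0;
}

Because the middle term is assembled as al^2 + ah^2 - (al - ah)^2, the doubling step and the extra carry word vanish, which is also why the stack frame of sp_3072_sqr_24 shrinks from "sub rsp, 504" to "sub rsp, 208" in the hunk that follows.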
; */ _text SEGMENT READONLY PARA sp_3072_sqr_24 PROC - push r12 - sub rsp, 504 - mov QWORD PTR [rsp+480], rcx - mov QWORD PTR [rsp+488], rdx - lea r10, QWORD PTR [rsp+384] + sub rsp, 208 + mov QWORD PTR [rsp+192], rcx + mov QWORD PTR [rsp+200], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+96] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] + mov QWORD PTR [r10+88], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+88], r8 - adc r9, 0 - mov QWORD PTR [rsp+496], r9 mov rdx, r10 mov rcx, rsp call sp_3072_sqr_12 - mov rdx, QWORD PTR [rsp+488] - lea rcx, QWORD PTR [rsp+192] + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] add rdx, 96 + add rcx, 192 call sp_3072_sqr_12 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] call sp_3072_sqr_12 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] ENDIF - mov r12, QWORD PTR [rsp+496] - mov r11, rcx - lea r10, QWORD PTR [rsp+384] - mov r9, r12 - neg r12 - add r11, 
192 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11], rax - mov QWORD PTR [r11+8], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+16], rax - mov QWORD PTR [r11+24], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+32], rax - mov QWORD PTR [r11+40], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+48], rax - mov QWORD PTR [r11+56], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+64], rax - mov QWORD PTR [r11+72], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+80], rax - mov QWORD PTR [r11+88], r8 - mov rax, QWORD PTR [r11] - add rax, rax - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, r8 - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, rax - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, r8 - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, rax - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, r8 - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, rax - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, r8 - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, rax - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, r8 - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, rax - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, r8 - mov QWORD PTR [r11+88], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+192] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - 
mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+184], r8 + mov rdx, QWORD PTR [rsp+192] + lea r10, QWORD PTR [rsp+96] + add rdx, 288 + mov r9, 0 + mov r8, QWORD PTR [r10+-96] + sub r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov QWORD PTR [r10+88], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, 
QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov QWORD PTR [r10+184], r8 + sub rdx, 192 + mov r8, QWORD PTR [r10+-96] + sub r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov QWORD PTR [r10+88], rax sbb r9, 0 - sub r11, 96 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, 
QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov QWORD PTR [r11+184], r8 - adc r9, 0 - mov QWORD PTR [rcx+288], r9 - ; Add in place - mov rax, QWORD PTR [r11+96] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [rdx+96] - mov QWORD PTR [r11+192], rax - ; Add to zero - mov rax, QWORD PTR [rdx+104] + mov rcx, QWORD PTR [rsp+192] + neg r9 + add rcx, 192 + mov r8, QWORD PTR [rcx+-96] + sub r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD 
PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov QWORD PTR [rcx+88], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+192] + add rcx, 288 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [r11+200], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r11+208], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [r11+216], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+136] - mov QWORD PTR [r11+224], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [r11+232], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [r11+240], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [r11+248], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [r11+256], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [r11+264], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [r11+272], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov QWORD PTR [r11+280], rax - add rsp, 504 - pop r12 + mov QWORD PTR [rcx+88], rax + mov rdx, QWORD PTR [rsp+200] + 
mov rcx, QWORD PTR [rsp+192] + add rsp, 208 ret sp_3072_sqr_24 ENDP _text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * -; * r A single precision integer. -; * a A single precision integer. -; */ -_text SEGMENT READONLY PARA -sp_3072_sqr_avx2_24 PROC - push r12 - sub rsp, 504 - mov QWORD PTR [rsp+480], rcx - mov QWORD PTR [rsp+488], rdx - lea r10, QWORD PTR [rsp+384] - lea r11, QWORD PTR [rdx+96] - ; Add - mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] - mov r8, QWORD PTR [rdx+8] - mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] - mov rax, QWORD PTR [rdx+16] - mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] - mov r8, QWORD PTR [rdx+24] - mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] - mov rax, QWORD PTR [rdx+32] - mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] - mov r8, QWORD PTR [rdx+40] - mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] - mov rax, QWORD PTR [rdx+48] - mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] - mov r8, QWORD PTR [rdx+56] - mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] - mov rax, QWORD PTR [rdx+64] - mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] - mov r8, QWORD PTR [rdx+72] - mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] - mov rax, QWORD PTR [rdx+80] - mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] - mov r8, QWORD PTR [rdx+88] - mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] - mov QWORD PTR [r10+88], r8 - adc r9, 0 - mov QWORD PTR [rsp+496], r9 - mov rdx, r10 - mov rcx, rsp - call sp_3072_sqr_avx2_12 - mov rdx, QWORD PTR [rsp+488] - lea rcx, QWORD PTR [rsp+192] - add rdx, 96 - call sp_3072_sqr_avx2_12 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] - call sp_3072_sqr_avx2_12 -IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+488] - mov rcx, QWORD PTR [rsp+480] -ENDIF - mov r12, QWORD PTR [rsp+496] - mov r11, rcx - lea r10, QWORD PTR [rsp+384] - mov r9, r12 - neg r12 - add r11, 192 - mov rax, QWORD PTR [r10] - pext rax, rax, r12 - add rax, rax - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r11], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r11+8], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r11+16], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r11+24], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r11+32], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r11+40], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r11+48], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r11+56], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r11+64], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r11+72], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r11+80], rax - pext r8, r8, r12 - adc r8, r8 - mov QWORD PTR [r11+88], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+192] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR 
[rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov 
QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov QWORD PTR [r10+184], r8 - sbb r9, 0 - sub r11, 96 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov QWORD PTR [r11+184], r8 - adc r9, 0 - mov QWORD PTR [rcx+288], r9 - ; Add in place - mov rax, QWORD PTR [r11+96] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD 
PTR [rdx+80] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [rdx+96] - mov QWORD PTR [r11+192], rax - ; Add to zero - mov rax, QWORD PTR [rdx+104] - adc rax, 0 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [r11+200], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+120] - mov QWORD PTR [r11+208], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [r11+216], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+136] - mov QWORD PTR [r11+224], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [r11+232], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+152] - mov QWORD PTR [r11+240], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [r11+248], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+168] - mov QWORD PTR [r11+256], r8 - adc rax, 0 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [r11+264], rax - adc r8, 0 - mov rax, QWORD PTR [rdx+184] - mov QWORD PTR [r11+272], r8 - adc rax, 0 - mov QWORD PTR [r11+280], rax - add rsp, 504 - pop r12 - ret -sp_3072_sqr_avx2_24 ENDP -_text ENDS -ENDIF -; /* Add a to a into r. (r = a + a) +; * Karatsuba: ah^2, al^2, (al - ah)^2 ; * ; * r A single precision integer. ; * a A single precision integer. ; */ _text SEGMENT READONLY PARA -sp_3072_dbl_24 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] +sp_3072_sqr_avx2_24 PROC + sub rsp, 208 + mov QWORD PTR [rsp+192], rcx + mov QWORD PTR [rsp+200], rdx + mov r9, 0 + mov r10, rsp + lea r11, QWORD PTR [rdx+96] + mov rax, QWORD PTR [rdx] + sub rax, QWORD PTR [r11] + mov r8, QWORD PTR [rdx+8] + mov QWORD PTR [r10], rax + sbb r8, QWORD PTR [r11+8] + mov rax, QWORD PTR [rdx+16] + mov QWORD PTR [r10+8], r8 + sbb rax, QWORD PTR [r11+16] + mov r8, QWORD PTR [rdx+24] + mov QWORD PTR [r10+16], rax + sbb r8, QWORD PTR [r11+24] + mov rax, QWORD PTR [rdx+32] + mov QWORD PTR [r10+24], r8 + sbb rax, QWORD PTR [r11+32] + mov r8, QWORD PTR [rdx+40] + mov QWORD PTR [r10+32], rax + sbb r8, QWORD PTR [r11+40] + mov rax, QWORD PTR [rdx+48] + mov QWORD PTR [r10+40], r8 + sbb rax, QWORD PTR [r11+48] + mov r8, QWORD PTR [rdx+56] + mov QWORD PTR [r10+48], rax + sbb r8, QWORD PTR [r11+56] + mov rax, QWORD PTR [rdx+64] + mov QWORD PTR [r10+56], r8 + sbb rax, QWORD PTR [r11+64] + mov r8, QWORD PTR [rdx+72] + mov QWORD PTR [r10+64], rax + sbb r8, QWORD PTR [r11+72] + mov rax, QWORD PTR [rdx+80] + mov QWORD PTR [r10+72], r8 + sbb rax, QWORD PTR [r11+80] + mov r8, QWORD PTR [rdx+88] + mov QWORD PTR [r10+80], rax + sbb r8, QWORD PTR [r11+88] + mov QWORD PTR [r10+88], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc 
r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov QWORD PTR [r10+88], r8 + mov rdx, r10 + mov rcx, rsp + call sp_3072_sqr_avx2_12 + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] + add rdx, 96 + add rcx, 192 + call sp_3072_sqr_avx2_12 + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] + call sp_3072_sqr_avx2_12 +IFDEF _WIN64 + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] +ENDIF + mov rdx, QWORD PTR [rsp+192] + lea r10, QWORD PTR [rsp+96] + add rdx, 288 + mov r9, 0 + mov r8, QWORD PTR [r10+-96] + sub r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov QWORD PTR [r10+88], rax + sbb r9, 0 + sub rdx, 192 + mov r8, QWORD PTR [r10+-96] + sub r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR 
[r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov QWORD PTR [r10+88], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+192] + neg r9 + add rcx, 192 + mov r8, QWORD PTR [rcx+-96] + sub r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR 
[r10+48] + mov rax, QWORD PTR [rcx+56] mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r8 - adc r9, r9 - mov QWORD PTR [rcx+184], r9 + sbb rax, QWORD PTR [r10+88] + mov QWORD PTR [rcx+88], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+192] + add rcx, 288 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + adc r8, 0 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + adc rax, 0 + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + adc r8, 0 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + adc rax, 0 + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + adc r8, 0 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + adc rax, 0 + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + adc r8, 0 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + adc rax, 0 + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + adc r8, 0 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + adc rax, 0 + mov QWORD PTR [rcx+88], rax + mov rdx, QWORD PTR [rsp+200] + mov rcx, QWORD PTR [rsp+192] + add rsp, 208 ret -sp_3072_dbl_24 ENDP +sp_3072_sqr_avx2_24 ENDP _text ENDS +ENDIF ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
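Before the sp_3072_sqr_48 body below, it helps to spell out what the regenerated code is doing. The old version added the two halves (spilling the extra carry word to the stack) and later fixed the result up with a masked doubling pass; the new version uses the absolute-difference form of Karatsuba squaring named in the header comment: with a = al + ah*B^k and B = 2^64 (k = 24 limbs for 3072 bits), a^2 = al^2 + (al^2 + ah^2 - (al - ah)^2)*B^k + ah^2*B^(2k). Because (al - ah)^2 equals (ah - al)^2, only |al - ah| is needed, which is what the subtraction followed by the "; Cond Negate" block computes. The C below is purely an illustrative sketch, not wolfSSL code: sqr_karatsuba, sqr_half, add_n, sub_n, K and limb are made-up names, a schoolbook square stands in for sp_3072_sqr_24, and a compiler with unsigned __int128 is assumed.

#include <stdint.h>
#include <string.h>

#define K 24                         /* half size in 64-bit limbs (1536 bits) */
typedef uint64_t limb;

/* r[0..n-1] += b[0..n-1]; returns the carry out */
static limb add_n(limb *r, const limb *b, int n)
{
    limb c = 0;
    for (int i = 0; i < n; i++) {
        limb t  = b[i] + c;
        limb c1 = (t < c);           /* b[i] + c wrapped */
        r[i] += t;
        c = c1 + (r[i] < t);         /* r[i] + t wrapped */
    }
    return c;
}

/* r[0..n-1] -= b[0..n-1]; returns the borrow out */
static limb sub_n(limb *r, const limb *b, int n)
{
    limb bw = 0;
    for (int i = 0; i < n; i++) {
        limb t   = b[i] + bw;
        limb bw1 = (t < bw);
        bw = bw1 + (r[i] < t);
        r[i] -= t;
    }
    return bw;
}

/* schoolbook square: r[0..2n-1] = a[0..n-1]^2 (stands in for sp_3072_sqr_24) */
static void sqr_half(limb *r, const limb *a, int n)
{
    memset(r, 0, (size_t)(2 * n) * sizeof(limb));
    for (int i = 0; i < n; i++) {
        unsigned __int128 c = 0;
        for (int j = 0; j < n; j++) {
            unsigned __int128 t = (unsigned __int128)a[i] * a[j] + r[i + j] + c;
            r[i + j] = (limb)t;
            c = t >> 64;
        }
        r[i + n] = (limb)c;          /* this limb is still zero at this point */
    }
}

/* r[0..4K-1] = a[0..2K-1]^2 via three half-size squares */
static void sqr_karatsuba(limb r[4 * K], const limb a[2 * K])
{
    const limb *al = a, *ah = a + K;
    limb d[K], t[2 * K], m[2 * K + 1];

    /* d = |al - ah|: subtract, then conditionally negate with an XOR mask,
     * which is what the "; Cond Negate" blocks in the assembly do */
    memcpy(d, al, sizeof(d));
    limb neg  = sub_n(d, ah, K);     /* 1 if al < ah */
    limb mask = (limb)0 - neg;
    limb c    = neg;                 /* ~d + 1 completes the two's-complement negate */
    for (int i = 0; i < K; i++) {
        limb v = (d[i] ^ mask) + c;
        c = (v < c);
        d[i] = v;
    }

    sqr_half(r,         al, K);      /* r[0  .. 2K-1] = al^2        */
    sqr_half(r + 2 * K, ah, K);      /* r[2K .. 4K-1] = ah^2        */
    sqr_half(t,         d,  K);      /* t             = (al - ah)^2 */

    /* m = al^2 + ah^2 - (al - ah)^2 = 2*al*ah, never negative */
    memcpy(m, r, 2 * K * sizeof(limb));
    m[2 * K]  = add_n(m, r + 2 * K, 2 * K);
    m[2 * K] -= sub_n(m, t, 2 * K);

    /* r += m * B^K; folding the last small carry into the top limbs is the
     * "; Add in word" tail of the assembly */
    limb carry = add_n(r + K, m, 2 * K) + m[2 * K];
    for (int i = 3 * K; carry != 0 && i < 4 * K; i++) {
        r[i] += carry;
        carry = (r[i] < carry);
    }
}

The generated assembly avoids the extra m buffer by accumulating in place (scratch = (al - ah)^2 - ah^2 - al^2, then r[K..3K-1] -= scratch, with the borrow count tracked in r9), but the arithmetic is the same as the sketch above.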
; */ _text SEGMENT READONLY PARA sp_3072_sqr_48 PROC - push r12 - sub rsp, 984 - mov QWORD PTR [rsp+960], rcx - mov QWORD PTR [rsp+968], rdx - lea r10, QWORD PTR [rsp+768] + sub rsp, 400 + mov QWORD PTR [rsp+384], rcx + mov QWORD PTR [rsp+392], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+192] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] mov rax, QWORD PTR [rdx+128] mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] + sbb rax, QWORD PTR [r11+128] mov r8, QWORD PTR [rdx+136] mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] + sbb r8, QWORD PTR [r11+136] mov rax, QWORD PTR [rdx+144] mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] + sbb rax, QWORD PTR [r11+144] mov r8, QWORD PTR [rdx+152] mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] + sbb r8, QWORD PTR [r11+152] mov rax, QWORD PTR [rdx+160] mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] + sbb rax, QWORD PTR [r11+160] mov r8, QWORD PTR [rdx+168] mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] + sbb r8, QWORD PTR [r11+168] mov rax, QWORD PTR [rdx+176] mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] + sbb rax, QWORD PTR [r11+176] mov r8, QWORD PTR [rdx+184] mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] + sbb r8, QWORD PTR [r11+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR 
[r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+128] + setc r11b + mov QWORD PTR [r10+120], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+136] + setc r11b + mov QWORD PTR [r10+128], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+144] + setc r11b + mov QWORD PTR [r10+136], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+152] + setc r11b + mov QWORD PTR [r10+144], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+160] + setc r11b + mov QWORD PTR [r10+152], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+168] + setc r11b + mov QWORD PTR [r10+160], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+176] + setc r11b + mov QWORD PTR [r10+168], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+184] + setc r11b + mov QWORD PTR [r10+176], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+184], r8 - adc r9, 0 - mov QWORD PTR [rsp+976], r9 mov rdx, r10 mov rcx, rsp call sp_3072_sqr_24 - mov rdx, QWORD PTR [rsp+968] - lea rcx, QWORD PTR [rsp+384] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] add rdx, 192 + add rcx, 384 call sp_3072_sqr_24 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] call sp_3072_sqr_24 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] ENDIF - mov r12, QWORD PTR [rsp+976] - mov r11, rcx - lea r10, QWORD PTR [rsp+768] - mov r9, r12 - neg r12 - add r11, 384 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11], rax - mov QWORD PTR [r11+8], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+16], rax - mov QWORD PTR [r11+24], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+32], rax - mov QWORD PTR [r11+40], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+48], rax - mov QWORD PTR [r11+56], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+64], rax - mov QWORD PTR [r11+72], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+80], rax - mov QWORD PTR [r11+88], r8 - mov rax, QWORD PTR [r10+96] - mov r8, QWORD 
PTR [r10+104] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+96], rax - mov QWORD PTR [r11+104], r8 - mov rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r10+120] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+112], rax - mov QWORD PTR [r11+120], r8 - mov rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r10+136] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+128], rax - mov QWORD PTR [r11+136], r8 - mov rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r10+152] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+144], rax - mov QWORD PTR [r11+152], r8 - mov rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r10+168] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+160], rax - mov QWORD PTR [r11+168], r8 - mov rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r10+184] - and rax, r12 - and r8, r12 - mov QWORD PTR [r11+176], rax - mov QWORD PTR [r11+184], r8 - mov rax, QWORD PTR [r11] - add rax, rax - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, r8 - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, rax - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, r8 - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, rax - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, r8 - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, rax - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, r8 - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, rax - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, r8 - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, rax - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, r8 - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, rax - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, r8 - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, rax - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, r8 - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, rax - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, r8 - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, rax - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, r8 - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, rax - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, r8 - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, rax - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, r8 - mov QWORD PTR [r11+184], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+384] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR 
[r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov QWORD PTR [r10+376], r8 + mov rdx, QWORD PTR [rsp+384] + lea r10, QWORD PTR [rsp+192] + add rdx, 576 + 
mov r9, 0 + mov r8, QWORD PTR [r10+-192] + sub r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR 
[r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR 
[r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov QWORD PTR [r10+376], r8 + sub rdx, 384 + mov r8, QWORD PTR [r10+-192] + sub r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, 
QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], rax sbb r9, 0 - sub r11, 192 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR 
[r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [r11+208] - mov QWORD PTR [r11+200], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [r11+344] - mov QWORD PTR [r11+336], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [r10+376] - mov QWORD PTR [r11+376], r8 - adc r9, 0 - mov QWORD PTR [rcx+576], r9 - ; Add in place - mov rax, QWORD PTR [r11+192] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+208] - mov QWORD PTR 
[r11+200], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r11+344] - mov QWORD PTR [r11+336], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r11+384] - mov QWORD PTR [r11+376], r8 - adc rax, QWORD PTR [rdx+192] - mov QWORD PTR [r11+384], rax - ; Add to zero - mov rax, QWORD PTR [rdx+200] + mov rcx, QWORD PTR [rsp+384] + neg r9 + add rcx, 384 + mov r8, QWORD PTR [rcx+-192] + sub r8, QWORD PTR [r10+-192] + mov rax, QWORD PTR [rcx+-184] + mov QWORD PTR [rcx+-192], r8 + sbb rax, QWORD PTR [r10+-184] + mov r8, QWORD PTR [rcx+-176] + mov QWORD PTR [rcx+-184], rax + sbb r8, QWORD PTR [r10+-176] + mov rax, QWORD PTR [rcx+-168] + mov QWORD PTR [rcx+-176], r8 + sbb rax, QWORD PTR [r10+-168] + mov r8, QWORD PTR [rcx+-160] + mov QWORD PTR [rcx+-168], rax + sbb r8, QWORD PTR [r10+-160] + mov rax, QWORD PTR [rcx+-152] + mov QWORD PTR [rcx+-160], r8 + sbb rax, QWORD PTR [r10+-152] + mov r8, QWORD PTR [rcx+-144] + mov QWORD PTR [rcx+-152], rax + sbb r8, QWORD PTR [r10+-144] + mov rax, QWORD PTR [rcx+-136] + mov QWORD PTR [rcx+-144], r8 + sbb rax, QWORD PTR [r10+-136] + mov r8, QWORD PTR [rcx+-128] + mov QWORD PTR [rcx+-136], rax + sbb r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR 
[r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax + sbb r8, QWORD PTR [r10+128] + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 + sbb rax, QWORD PTR [r10+136] + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax + sbb r8, QWORD PTR [r10+144] + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 + sbb rax, QWORD PTR [r10+152] + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax + sbb r8, QWORD PTR [r10+160] + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 + sbb rax, QWORD PTR [r10+168] + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax + sbb r8, QWORD PTR [r10+176] + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 + sbb rax, QWORD PTR [r10+184] + mov QWORD PTR [rcx+184], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+384] + add rcx, 576 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [r11+392], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+216] - mov QWORD PTR [r11+400], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [r11+408], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR 
[rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+232] - mov QWORD PTR [r11+416], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [r11+424], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+248] - mov QWORD PTR [r11+432], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+256] - mov QWORD PTR [r11+440], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+264] - mov QWORD PTR [r11+448], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [r11+456], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR [r11+464], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [r11+472], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [r11+480], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [r11+488], rax + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [r11+496], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [r11+504], rax + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [r11+512], r8 + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [r11+520], rax + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [r11+528], r8 + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [r11+536], rax + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [r11+544], r8 + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [r11+552], rax + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [r11+560], r8 + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 adc rax, 0 - mov QWORD PTR [r11+568], rax - add rsp, 984 - pop r12 + mov QWORD PTR [rcx+184], rax + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] + add rsp, 400 ret sp_3072_sqr_48 ENDP _text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
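The AVX2 variant that follows is reorganized the same way: the same absolute-difference split with sp_3072_sqr_avx2_24 doing the three half-size squares, and the old PEXT-masked copy-and-double fix-up (visible in the removed lines) is gone, as is the r9 spill to a fixed stack slot such as [rsp+976]. What every variant keeps is the "; Add in word" tail, which folds the small borrow count left in r9 into the limbs above the middle term. A minimal stand-alone sketch of that fold (hypothetical helper name, uint64_t limbs assumed):

#include <stdint.h>

/* Ripple a single small word w into r[0..n-1]; in the routines here r points
 * at the limbs above the middle term (e.g. the top 24 limbs of the 96-limb
 * square) and w is the value accumulated in r9 by the preceding sbb chains. */
static void sp_add_word_sketch(uint64_t *r, int n, uint64_t w)
{
    for (int i = 0; i < n && w != 0; i++) {
        r[i] += w;
        w = (r[i] < w);              /* carry out (0 or 1) keeps propagating */
    }
}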
; */ _text SEGMENT READONLY PARA sp_3072_sqr_avx2_48 PROC - push r12 - sub rsp, 984 - mov QWORD PTR [rsp+960], rcx - mov QWORD PTR [rsp+968], rdx - lea r10, QWORD PTR [rsp+768] + sub rsp, 400 + mov QWORD PTR [rsp+384], rcx + mov QWORD PTR [rsp+392], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+192] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] mov rax, QWORD PTR [rdx+128] mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] + sbb rax, QWORD PTR [r11+128] mov r8, QWORD PTR [rdx+136] mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] + sbb r8, QWORD PTR [r11+136] mov rax, QWORD PTR [rdx+144] mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] + sbb rax, QWORD PTR [r11+144] mov r8, QWORD PTR [rdx+152] mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] + sbb r8, QWORD PTR [r11+152] mov rax, QWORD PTR [rdx+160] mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] + sbb rax, QWORD PTR [r11+160] mov r8, QWORD PTR [rdx+168] mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] + sbb r8, QWORD PTR [r11+168] mov rax, QWORD PTR [rdx+176] mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] + sbb rax, QWORD PTR [r11+176] mov r8, QWORD PTR [rdx+184] mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] + sbb r8, QWORD PTR [r11+184] + mov QWORD PTR [r10+184], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR 
[r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+128] + setc r11b + mov QWORD PTR [r10+120], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+136] + setc r11b + mov QWORD PTR [r10+128], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+144] + setc r11b + mov QWORD PTR [r10+136], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+152] + setc r11b + mov QWORD PTR [r10+144], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+160] + setc r11b + mov QWORD PTR [r10+152], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+168] + setc r11b + mov QWORD PTR [r10+160], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+176] + setc r11b + mov QWORD PTR [r10+168], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+184] + setc r11b + mov QWORD PTR [r10+176], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+184], r8 - adc r9, 0 - mov QWORD PTR [rsp+976], r9 mov rdx, r10 mov rcx, rsp call sp_3072_sqr_avx2_24 - mov rdx, QWORD PTR [rsp+968] - lea rcx, QWORD PTR [rsp+384] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] add rdx, 192 + add rcx, 384 call sp_3072_sqr_avx2_24 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] call sp_3072_sqr_avx2_24 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+968] - mov rcx, QWORD PTR [rsp+960] + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] ENDIF - mov r12, QWORD PTR [rsp+976] - mov r11, rcx - lea r10, QWORD PTR [rsp+768] - mov r9, r12 - neg r12 - add r11, 384 - mov rax, QWORD PTR [r10] - pext rax, rax, r12 - add rax, rax - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r11], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r11+8], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r11+16], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r11+24], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r11+32], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r11+40], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r11+48], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r11+56], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r11+64], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, 
QWORD PTR [r10+80] - mov QWORD PTR [r11+72], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r11+80], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r11+88], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r11+96], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r11+104], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r11+112], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r11+120], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r11+128], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r11+136], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r11+144], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r11+152], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r11+160], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r11+168], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r11+176], rax - pext r8, r8, r12 - adc r8, r8 - mov QWORD PTR [r11+184], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+384] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov 
QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov QWORD PTR [r10+376], r8 + mov rdx, QWORD PTR [rsp+384] + lea r10, QWORD PTR [rsp+192] + add rdx, 576 + mov r9, 0 + mov r8, QWORD PTR [r10+-192] + sub r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR 
[rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR 
[r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR 
[r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov QWORD PTR [r10+376], r8 + sub rdx, 384 + mov r8, QWORD PTR [r10+-192] + sub r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] 
+ mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov QWORD PTR [r10+184], rax sbb r9, 0 - sub r11, 192 - ; Add in place - mov rax, QWORD PTR [r11] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [r11+8] - mov QWORD PTR [r11], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [r11+16] - mov QWORD PTR [r11+8], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r11+24] - mov QWORD PTR [r11+16], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [r11+32] - mov QWORD PTR [r11+24], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r11+40] - mov QWORD PTR [r11+32], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [r11+48] - mov QWORD PTR [r11+40], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r11+56] - mov QWORD PTR [r11+48], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [r11+64] - mov QWORD PTR [r11+56], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r11+72] - mov QWORD PTR [r11+64], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [r11+80] - mov QWORD PTR [r11+72], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r11+88] - mov QWORD PTR [r11+80], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [r11+96] - mov QWORD PTR [r11+88], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r11+104] - mov QWORD PTR [r11+96], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [r11+112] - mov QWORD PTR [r11+104], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r11+120] - mov QWORD PTR [r11+112], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [r11+128] - mov QWORD PTR [r11+120], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r11+136] - mov QWORD PTR [r11+128], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [r11+144] - mov QWORD PTR [r11+136], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r11+152] - mov QWORD PTR [r11+144], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [r11+160] - mov QWORD PTR [r11+152], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r11+168] - mov QWORD PTR [r11+160], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [r11+176] - mov QWORD PTR [r11+168], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r11+184] - mov QWORD PTR [r11+176], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [r11+192] - mov QWORD PTR [r11+184], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [r11+208] 
- mov QWORD PTR [r11+200], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [r11+344] - mov QWORD PTR [r11+336], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [r10+376] - mov QWORD PTR [r11+376], r8 - adc r9, 0 - mov QWORD PTR [rcx+576], r9 - ; Add in place - mov rax, QWORD PTR [r11+192] - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r11+200] - mov QWORD PTR [r11+192], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r11+208] - mov QWORD PTR [r11+200], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r11+216] - mov QWORD PTR [r11+208], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r11+224] - mov QWORD PTR [r11+216], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r11+232] - mov QWORD PTR [r11+224], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r11+240] - mov QWORD PTR [r11+232], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r11+248] - mov QWORD PTR [r11+240], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r11+256] - mov QWORD PTR [r11+248], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r11+264] - mov QWORD PTR [r11+256], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r11+272] - mov QWORD PTR [r11+264], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r11+280] - mov QWORD PTR [r11+272], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r11+288] - mov QWORD PTR [r11+280], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r11+296] - mov QWORD PTR [r11+288], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r11+304] - mov QWORD PTR [r11+296], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r11+312] - mov QWORD PTR [r11+304], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r11+320] - mov QWORD PTR [r11+312], r8 - adc 
rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r11+328] - mov QWORD PTR [r11+320], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r11+336] - mov QWORD PTR [r11+328], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r11+344] - mov QWORD PTR [r11+336], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r11+352] - mov QWORD PTR [r11+344], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r11+360] - mov QWORD PTR [r11+352], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r11+368] - mov QWORD PTR [r11+360], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r11+376] - mov QWORD PTR [r11+368], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r11+384] - mov QWORD PTR [r11+376], r8 - adc rax, QWORD PTR [rdx+192] - mov QWORD PTR [r11+384], rax - ; Add to zero - mov rax, QWORD PTR [rdx+200] + mov rcx, QWORD PTR [rsp+384] + neg r9 + add rcx, 384 + mov r8, QWORD PTR [rcx+-192] + sub r8, QWORD PTR [r10+-192] + mov rax, QWORD PTR [rcx+-184] + mov QWORD PTR [rcx+-192], r8 + sbb rax, QWORD PTR [r10+-184] + mov r8, QWORD PTR [rcx+-176] + mov QWORD PTR [rcx+-184], rax + sbb r8, QWORD PTR [r10+-176] + mov rax, QWORD PTR [rcx+-168] + mov QWORD PTR [rcx+-176], r8 + sbb rax, QWORD PTR [r10+-168] + mov r8, QWORD PTR [rcx+-160] + mov QWORD PTR [rcx+-168], rax + sbb r8, QWORD PTR [r10+-160] + mov rax, QWORD PTR [rcx+-152] + mov QWORD PTR [rcx+-160], r8 + sbb rax, QWORD PTR [r10+-152] + mov r8, QWORD PTR [rcx+-144] + mov QWORD PTR [rcx+-152], rax + sbb r8, QWORD PTR [r10+-144] + mov rax, QWORD PTR [rcx+-136] + mov QWORD PTR [rcx+-144], r8 + sbb rax, QWORD PTR [r10+-136] + mov r8, QWORD PTR [rcx+-128] + mov QWORD PTR [rcx+-136], rax + sbb r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov 
QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax + sbb r8, QWORD PTR [r10+128] + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 + sbb rax, QWORD PTR [r10+136] + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax + sbb r8, QWORD PTR [r10+144] + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 + sbb rax, QWORD PTR [r10+152] + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax + sbb r8, QWORD PTR [r10+160] + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 + sbb rax, QWORD PTR [r10+168] + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax + sbb r8, QWORD PTR [r10+176] + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 + sbb rax, QWORD PTR [r10+184] + mov QWORD PTR [rcx+184], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+384] + add rcx, 576 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [r11+392], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+216] - mov QWORD PTR [r11+400], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [r11+408], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+232] - mov QWORD PTR [r11+416], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [r11+424], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+248] - mov QWORD PTR [r11+432], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+256] - mov QWORD PTR [r11+440], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+264] - mov QWORD PTR [r11+448], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [r11+456], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR [r11+464], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [r11+472], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [r11+480], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [r11+488], rax + mov r8, QWORD PTR [rcx+112] + mov 
QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [r11+496], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [r11+504], rax + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [r11+512], r8 + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [r11+520], rax + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [r11+528], r8 + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [r11+536], rax + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [r11+544], r8 + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [r11+552], rax + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [r11+560], r8 + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 adc rax, 0 - mov QWORD PTR [r11+568], rax - add rsp, 984 - pop r12 + mov QWORD PTR [rcx+184], rax + mov rdx, QWORD PTR [rsp+392] + mov rcx, QWORD PTR [rsp+384] + add rsp, 400 ret sp_3072_sqr_avx2_48 ENDP _text ENDS @@ -22638,7 +30571,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_3072_cond_sub_24 PROC sub rsp, 192 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -22807,7 +30739,7 @@ sp_3072_cond_sub_24 PROC sbb r11, r8 mov QWORD PTR [rcx+176], r10 mov QWORD PTR [rcx+184], r11 - sbb rax, 0 + sbb rax, rax add rsp, 192 ret sp_3072_cond_sub_24 ENDP @@ -23111,7 +31043,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_3072_cond_sub_avx2_24 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -23232,7 +31163,7 @@ sp_3072_cond_sub_avx2_24 PROC mov QWORD PTR [rcx+176], r11 sbb r12, r10 mov QWORD PTR [rcx+184], r12 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_3072_cond_sub_avx2_24 ENDP @@ -23834,6 +31765,1750 @@ sp_3072_cmp_24 PROC ret sp_3072_cmp_24 ENDP _text ENDS +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_3072_get_from_table_24 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + pxor xmm13, xmm13 + pshufd xmm11, xmm11, 0 + pshufd xmm10, xmm10, 0 + ; START: 0-7 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por 
xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + 
movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 
+ pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 0-7 + ; START: 8-15 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + 
movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, 
[r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu 
xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + 
movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 8-15 + ; START: 16-23 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand 
xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, 
xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, 
xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + ; END: 16-23 + ret +sp_3072_get_from_table_24 ENDP +_text ENDS +ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 3072 bits using Montgomery reduction. ; * @@ -24153,6 +33828,902 @@ L_3072_mont_reduce_avx2_24_loop: sp_3072_mont_reduce_avx2_24 ENDP _text ENDS ENDIF +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_3072_get_from_table_avx2_24 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + vpxor ymm13, ymm13, ymm13 + vpermd ymm10, ymm13, ymm10 + vpermd ymm11, ymm13, ymm11 + ; START: 0-15 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, 
ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + 
vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 17 + mov r9, QWORD PTR [rdx+136] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, 
ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, 
ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 0-15 + ; START: 16-23 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, 
ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 16 + mov r9, QWORD PTR [rdx+128] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 17 
+ mov r9, QWORD PTR [rdx+136] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 18 + mov r9, QWORD PTR [rdx+144] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 19 + mov r9, QWORD PTR [rdx+152] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 20 + mov r9, QWORD PTR [rdx+160] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 21 + mov r9, QWORD PTR [rdx+168] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 22 + mov r9, QWORD PTR [rdx+176] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 23 + mov r9, QWORD PTR [rdx+184] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 24 + mov r9, QWORD PTR [rdx+192] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 25 + mov r9, QWORD PTR [rdx+200] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 26 + mov r9, QWORD PTR [rdx+208] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 27 + mov r9, QWORD PTR [rdx+216] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 28 + mov r9, QWORD PTR [rdx+224] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 29 + mov r9, QWORD PTR [rdx+232] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + 
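
The sp_3072_get_from_table_24, sp_3072_get_from_table_avx2_24 and, further down, sp_3072_get_from_table_48 routines added under IFNDEF WC_NO_CACHE_RESISTANT all follow the same cache-timing-resistant pattern: every one of the 32 precomputed table entries is read, a compare against a running counter produces an all-ones/all-zero mask, and AND/OR accumulation keeps only the entry whose position equals the requested index, so the memory access sequence does not depend on that index. A minimal C sketch of the masked select, with an illustrative (non-wolfSSL) name and signature:

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time table read: r receives table[idx] without the access
     * pattern depending on idx. words is the entry size in 64-bit words
     * (24 or 48 in the generated routines); entries is 32 here. */
    static void sp_get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                                     size_t entries, size_t words, size_t idx)
    {
        size_t i, j;

        for (j = 0; j < words; j++)
            r[j] = 0;
        for (i = 0; i < entries; i++) {
            /* all ones only when this entry is the one requested */
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
            for (j = 0; j < words; j++)
                r[j] |= table[i][j] & mask;
        }
    }

In the generated MASM, the counter lives in xmm13/ymm13, pcmpeqd/vpcmpeqd builds the mask, and the pand/por (vpand/vpor) pairs perform the select 64 or 128 bytes of an entry at a time.
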
vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 30 + mov r9, QWORD PTR [rdx+240] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 31 + mov r9, QWORD PTR [rdx+248] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + ; END: 16-23 + ret +sp_3072_get_from_table_avx2_24 ENDP +_text ENDS +ENDIF ; /* Conditionally subtract b from a using the mask m. ; * m is -1 to subtract and 0 when not copying. ; * @@ -24164,7 +34735,6 @@ ENDIF _text SEGMENT READONLY PARA sp_3072_cond_sub_48 PROC sub rsp, 384 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -24501,7 +35071,7 @@ sp_3072_cond_sub_48 PROC sbb r11, r8 mov QWORD PTR [rcx+368], r10 mov QWORD PTR [rcx+376], r11 - sbb rax, 0 + sbb rax, rax add rsp, 384 ret sp_3072_cond_sub_48 ENDP @@ -25042,7 +35612,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_3072_sub_48 PROC mov r9, QWORD PTR [rdx] - xor rax, rax sub r9, QWORD PTR [r8] mov r10, QWORD PTR [rdx+8] mov QWORD PTR [rcx], r9 @@ -25186,7 +35755,7 @@ sp_3072_sub_48 PROC mov QWORD PTR [rcx+368], r9 sbb r10, QWORD PTR [r8+376] mov QWORD PTR [rcx+376], r10 - sbb rax, 0 + sbb rax, rax ret sp_3072_sub_48 ENDP _text ENDS @@ -25527,7 +36096,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_3072_cond_sub_avx2_48 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -25768,7 +36336,7 @@ sp_3072_cond_sub_avx2_48 PROC mov QWORD PTR [rcx+368], r11 sbb r12, r10 mov QWORD PTR [rcx+376], r12 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_3072_cond_sub_avx2_48 ENDP @@ -26177,6 +36745,1802 @@ sp_3072_cmp_48 PROC ret sp_3072_cmp_48 ENDP _text ENDS +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_3072_get_from_table_48 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + pxor xmm13, xmm13 + pshufd xmm11, xmm11, 0 + pshufd xmm10, xmm10, 0 + ; START: 0-7 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; 
ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu 
xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 0-7 + ; START: 8-15 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + 
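
For reference, a minimal C sketch of the masked conditional subtract handled by the sp_3072_cond_sub_48 / sp_3072_cond_sub_avx2_48 hunks earlier in this patch (the name and signature below are illustrative, not the wolfSSL functions themselves). Per the comment in the patch, m is -1 to subtract and 0 to copy a through unchanged; the routine returns the final borrow spread into a 0/-1 value:

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t sp_cond_sub_ct(uint64_t* r, const uint64_t* a,
                                   const uint64_t* b, size_t words, uint64_t m)
    {
        uint64_t borrow = 0;
        size_t i;

        for (i = 0; i < words; i++) {
            uint64_t bi = b[i] & m;       /* b[i] when m is all ones, else 0 */
            uint64_t t  = a[i] - bi;
            r[i] = t - borrow;
            borrow = (uint64_t)((a[i] < bi) | (t < borrow));
        }
        return (uint64_t)0 - borrow;      /* 0 or all ones, like sbb rax, rax */
    }

The changes to sp_3072_cond_sub_48, sp_3072_sub_48 and sp_3072_cond_sub_avx2_48 visible above drop the up-front mov rax, 0 / xor rax, rax and turn the trailing sbb rax, 0 into sbb rax, rax: both epilogues leave 0 or -1 in rax (rax - rax - CF is -CF), so the new form simply avoids pre-clearing the register.
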
por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 8-15 + ; START: 16-23 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, 
xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, 
xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 16-23 + ; START: 24-31 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand 
xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand 
xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 24-31 + ; START: 32-39 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd 
xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 32-39 + ; START: 40-47 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu 
xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu 
xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + ; END: 40-47 + ret +sp_3072_get_from_table_48 ENDP +_text ENDS +ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 3072 bits using Montgomery reduction. 
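The routine that just closed, sp_3072_get_from_table_48, and the AVX2 variant sp_3072_get_from_table_avx2_48 added in the next hunk under IFNDEF WC_NO_CACHE_RESISTANT, are cache-timing-resistant table reads: instead of dereferencing only the requested entry, every one of the 16 table pointers is loaded, compared against the wanted index (pcmpeqd / vpcmpeqd against a running counter) to build an all-ones or all-zero mask, masked with pand / vpand and merged with por / vpor, so the sequence of memory accesses does not depend on the index. The 48-digit entries are gathered in strips, with the per-entry base pointer advanced by 64 bytes per pass with SSE2 and 128 bytes per pass with AVX2, which is why the 16-entry loop repeats with growing offsets. A minimal C sketch of the selection pattern, with illustrative names rather than the actual routine:

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time selection of table[idx]: every entry is read and a
     * branch-free mask keeps only the wanted one, so the memory-access
     * pattern does not depend on the (possibly secret) index.  Sketch with
     * illustrative names; the assembly does the same thing 128 or 256 bits
     * at a time with pcmpeqd/pand/por (SSE2) or vpcmpeqd/vpand/vpor (AVX2). */
    static void get_from_table_ct(uint64_t* r, const uint64_t* const* table,
                                  size_t entries, size_t words, size_t idx)
    {
        size_t i, j;

        for (j = 0; j < words; j++)
            r[j] = 0;
        for (i = 0; i < entries; i++) {
            /* mask is all-ones only when i == idx (0 - 1 wraps to ~0) */
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
            for (j = 0; j < words; j++)
                r[j] |= table[i][j] & mask;
        }
    }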
; * @@ -26760,6 +39124,854 @@ L_3072_mont_reduce_avx2_48_loop: sp_3072_mont_reduce_avx2_48 ENDP _text ENDS ENDIF +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_3072_get_from_table_avx2_48 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + vpxor ymm13, ymm13, ymm13 + vpermd ymm10, ymm13, ymm10 + vpermd ymm11, ymm13, ymm11 + ; START: 0-15 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 
+ vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + 
vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 0-15 + ; START: 16-31 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, 
QWORD PTR [rdx+48] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, 
ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 16-31 + ; START: 32-47 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 
+ vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, 
ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + ; END: 32-47 + ret +sp_3072_get_from_table_avx2_48 ENDP +_text ENDS +ENDIF ; /* Conditionally add a and b using the mask m. ; * m is -1 to add and 0 when not. ; * @@ -27770,7 +40982,6 @@ ENDIF _text SEGMENT READONLY PARA sp_4096_sub_in_place_64 PROC mov r8, QWORD PTR [rcx] - xor rax, rax sub r8, QWORD PTR [rdx] mov r9, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 @@ -27962,7 +41173,7 @@ sp_4096_sub_in_place_64 PROC mov QWORD PTR [rcx+496], r8 sbb r9, QWORD PTR [rdx+504] mov QWORD PTR [rcx+504], r9 - sbb rax, 0 + sbb rax, rax ret sp_4096_sub_in_place_64 ENDP _text ENDS @@ -30740,2264 +43951,1966 @@ ENDIF sp_4096_mul_avx2_64 ENDP _text ENDS ENDIF -; /* Add a to a into r. (r = a + a) -; * -; * r A single precision integer. -; * a A single precision integer. 
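Also in this hunk, sp_4096_sub_in_place_64 loses the xor rax, rax at its entry and its tail changes from sbb rax, 0 to sbb rax, rax. After the borrow chain, sbb rax, rax yields -(CF) regardless of what rax previously held, so the return value is still 0 when no borrow occurred and all-ones when it did, just without keeping rax zeroed across the whole body. The same 0-or-all-ones convention drives the conditional negate in the rewritten sp_4096_sqr_64 below. A hedged C equivalent of such a subtract-in-place, with illustrative names:

    #include <stdint.h>

    /* In-place multi-word subtraction a -= b over n 64-bit words, returning
     * 0 when there is no final borrow and all-ones when a < b, matching the
     * 0 / ~0 convention of the "sbb rax, rax" tail.  Names and word count
     * are illustrative. */
    static uint64_t sub_in_place_n(uint64_t* a, const uint64_t* b, int n)
    {
        uint64_t borrow = 0;
        int i;

        for (i = 0; i < n; i++) {
            uint64_t t  = a[i] - b[i];
            uint64_t b1 = (t > a[i]);     /* borrow from a[i] - b[i] */
            uint64_t d  = t - borrow;
            uint64_t b2 = (d > t);        /* borrow from subtracting carry-in */
            a[i] = d;
            borrow = b1 | b2;
        }
        return (uint64_t)0 - borrow;      /* 0 or 0xFFFF...FFFF */
    }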
-; */ -_text SEGMENT READONLY PARA -sp_2048_dbl_32 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 - mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] - mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+48] - mov QWORD PTR [rcx+40], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+56] - mov QWORD PTR [rcx+48], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+64] - mov QWORD PTR [rcx+56], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+72] - mov QWORD PTR [rcx+64], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+80] - mov QWORD PTR [rcx+72], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+88] - mov QWORD PTR [rcx+80], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+96] - mov QWORD PTR [rcx+88], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+104] - mov QWORD PTR [rcx+96], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+112] - mov QWORD PTR [rcx+104], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+120] - mov QWORD PTR [rcx+112], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+128] - mov QWORD PTR [rcx+120], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+136] - mov QWORD PTR [rcx+128], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+144] - mov QWORD PTR [rcx+136], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+152] - mov QWORD PTR [rcx+144], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+160] - mov QWORD PTR [rcx+152], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+168] - mov QWORD PTR [rcx+160], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+176] - mov QWORD PTR [rcx+168], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+184] - mov QWORD PTR [rcx+176], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+192] - mov QWORD PTR [rcx+184], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+200] - mov QWORD PTR [rcx+192], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+208] - mov QWORD PTR [rcx+200], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+216] - mov QWORD PTR [rcx+208], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+224] - mov QWORD PTR [rcx+216], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+232] - mov QWORD PTR [rcx+224], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+240] - mov QWORD PTR [rcx+232], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+248] - mov QWORD PTR [rcx+240], r8 - adc r9, r9 - mov QWORD PTR [rcx+248], r9 - adc rax, 0 - ret -sp_2048_dbl_32 ENDP -_text ENDS ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
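The deleted sp_2048_dbl_32 helper and the new "Karatsuba: ah^2, al^2, (al - ah)^2" note belong to the same change: sp_4096_sqr_64 is restructured so the cross term of the square comes from the square of the difference of the halves rather than the square of their sum. Writing a = ah*2^n + al gives a^2 = ah^2*2^(2n) + 2*ah*al*2^n + al^2 with 2*ah*al = ah^2 + al^2 - (al - ah)^2. Only |al - ah| needs to be squared (the sign cannot affect a square), so no carry out of a half-width addition has to be tracked, which is presumably why the doubling helper becomes dead code and is dropped. The rewritten body that follows computes al - ah, turns the final borrow into a mask, conditionally negates to get the absolute value, and then calls sp_2048_sqr_32 three times. A small self-contained check of the identity, using one 64-bit word per half so the arithmetic fits in unsigned __int128 (assumes GCC or Clang):

    #include <stdint.h>
    #include <stdio.h>

    /* Checks the identity the rewritten sp_4096_sqr_64 relies on, with one
     * 64-bit word per half so everything fits in unsigned __int128:
     *   a = ah*2^64 + al
     *   a^2 = ah^2*2^128 + (ah^2 + al^2 - (al - ah)^2)*2^64 + al^2
     * In the real routine each half is 32 words and |al - ah| comes from a
     * masked (conditional) negate, but the algebra is the same.
     * Assumes a compiler providing unsigned __int128 (GCC or Clang). */
    int main(void)
    {
        uint64_t al = 0x0123456789abcdefULL;
        uint64_t ah = 0xfedcba9876543210ULL;

        /* |al - ah| via subtract plus conditional negate, as in the assembly */
        uint64_t d    = al - ah;
        uint64_t mask = (uint64_t)0 - (uint64_t)(al < ah);  /* 0 or ~0 */
        d = (d ^ mask) - mask;             /* two's-complement negate when mask is ~0 */

        unsigned __int128 al2 = (unsigned __int128)al * al;
        unsigned __int128 ah2 = (unsigned __int128)ah * ah;
        unsigned __int128 d2  = (unsigned __int128)d * d;

        unsigned __int128 mid = ah2 + al2 - d2;            /* should equal 2*ah*al */
        unsigned __int128 ref = (unsigned __int128)2 * ah * al;

        printf("2*ah*al via three squares: %s\n", (mid == ref) ? "matches" : "differs");
        return 0;
    }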
; */ _text SEGMENT READONLY PARA sp_4096_sqr_64 PROC - push r12 - sub rsp, 1304 - mov QWORD PTR [rsp+1280], rcx - mov QWORD PTR [rsp+1288], rdx - lea r10, QWORD PTR [rsp+1024] + sub rsp, 528 + mov QWORD PTR [rsp+512], rcx + mov QWORD PTR [rsp+520], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+256] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] mov rax, QWORD PTR [rdx+128] mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] + sbb rax, QWORD PTR [r11+128] mov r8, QWORD PTR [rdx+136] mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] + sbb r8, QWORD PTR [r11+136] mov rax, QWORD PTR [rdx+144] mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] + sbb rax, QWORD PTR [r11+144] mov r8, QWORD PTR [rdx+152] mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] + sbb r8, QWORD PTR [r11+152] mov rax, QWORD PTR [rdx+160] mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] + sbb rax, QWORD PTR [r11+160] mov r8, QWORD PTR [rdx+168] mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] + sbb r8, QWORD PTR [r11+168] mov rax, QWORD PTR [rdx+176] mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] + sbb rax, QWORD PTR [r11+176] mov r8, QWORD PTR [rdx+184] mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] + sbb r8, QWORD PTR [r11+184] mov rax, QWORD PTR [rdx+192] mov QWORD PTR [r10+184], r8 - adc rax, QWORD PTR [r11+192] + sbb rax, QWORD PTR [r11+192] mov r8, QWORD PTR [rdx+200] mov QWORD PTR [r10+192], rax - adc r8, QWORD PTR [r11+200] + sbb r8, QWORD PTR [r11+200] mov rax, QWORD PTR [rdx+208] mov QWORD PTR [r10+200], r8 - adc rax, QWORD PTR [r11+208] + sbb rax, QWORD PTR [r11+208] mov r8, QWORD PTR [rdx+216] mov QWORD PTR [r10+208], rax - adc r8, QWORD PTR [r11+216] + sbb r8, QWORD PTR [r11+216] mov rax, QWORD 
PTR [rdx+224] mov QWORD PTR [r10+216], r8 - adc rax, QWORD PTR [r11+224] + sbb rax, QWORD PTR [r11+224] mov r8, QWORD PTR [rdx+232] mov QWORD PTR [r10+224], rax - adc r8, QWORD PTR [r11+232] + sbb r8, QWORD PTR [r11+232] mov rax, QWORD PTR [rdx+240] mov QWORD PTR [r10+232], r8 - adc rax, QWORD PTR [r11+240] + sbb rax, QWORD PTR [r11+240] mov r8, QWORD PTR [rdx+248] mov QWORD PTR [r10+240], rax - adc r8, QWORD PTR [r11+248] + sbb r8, QWORD PTR [r11+248] + mov QWORD PTR [r10+248], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+128] + setc r11b + mov QWORD PTR [r10+120], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+136] + setc r11b + mov QWORD PTR [r10+128], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+144] + setc r11b + mov QWORD PTR [r10+136], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+152] + setc r11b + mov QWORD PTR [r10+144], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+160] + setc r11b + mov QWORD PTR [r10+152], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+168] + setc r11b + mov QWORD PTR [r10+160], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+176] + setc r11b + mov QWORD PTR [r10+168], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+184] + setc r11b + mov QWORD PTR [r10+176], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+192] + setc r11b + mov QWORD PTR [r10+184], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+200] + setc r11b + mov QWORD PTR [r10+192], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+208] + setc r11b + mov QWORD PTR [r10+200], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+216] + setc r11b + mov QWORD PTR [r10+208], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+224] + setc r11b + mov QWORD PTR [r10+216], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+232] + setc r11b + mov QWORD PTR [r10+224], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+240] + setc r11b + mov QWORD PTR 
[r10+232], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+248] + setc r11b + mov QWORD PTR [r10+240], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+248], r8 - adc r9, 0 - mov QWORD PTR [rsp+1296], r9 mov rdx, r10 mov rcx, rsp call sp_2048_sqr_32 - mov rdx, QWORD PTR [rsp+1288] - lea rcx, QWORD PTR [rsp+512] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] add rdx, 256 + add rcx, 512 call sp_2048_sqr_32 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] call sp_2048_sqr_32 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] ENDIF - mov r12, QWORD PTR [rsp+1296] - lea r10, QWORD PTR [rsp+1024] - mov r9, r12 - neg r12 - mov rax, QWORD PTR [r10] - mov r8, QWORD PTR [r10+8] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+512], rax - mov QWORD PTR [rcx+520], r8 - mov rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [r10+24] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+528], rax - mov QWORD PTR [rcx+536], r8 - mov rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [r10+40] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+544], rax - mov QWORD PTR [rcx+552], r8 - mov rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [r10+56] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+560], rax - mov QWORD PTR [rcx+568], r8 - mov rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [r10+72] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+576], rax - mov QWORD PTR [rcx+584], r8 - mov rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [r10+88] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+592], rax - mov QWORD PTR [rcx+600], r8 - mov rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [r10+104] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+608], rax - mov QWORD PTR [rcx+616], r8 - mov rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [r10+120] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+624], rax - mov QWORD PTR [rcx+632], r8 - mov rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [r10+136] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+640], rax - mov QWORD PTR [rcx+648], r8 - mov rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [r10+152] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+656], rax - mov QWORD PTR [rcx+664], r8 - mov rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [r10+168] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+672], rax - mov QWORD PTR [rcx+680], r8 - mov rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [r10+184] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+688], rax - mov QWORD PTR [rcx+696], r8 - mov rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [r10+200] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+704], rax - mov QWORD PTR [rcx+712], r8 - mov rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [r10+216] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+720], rax - mov QWORD PTR [rcx+728], r8 - mov rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [r10+232] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+736], rax - mov QWORD PTR [rcx+744], r8 - mov rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [r10+248] - and rax, r12 - and r8, r12 - mov QWORD PTR [rcx+752], rax - mov QWORD PTR [rcx+760], r8 - mov rax, QWORD PTR [rcx+512] - add rax, rax - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, 
rax - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, r8 - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, rax - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, r8 - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+512] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR 
[r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rdx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rdx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rdx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rdx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rdx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rdx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rdx+432] - mov r8, QWORD PTR [r10+440] - 
mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rdx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rdx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rdx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rdx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rdx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rdx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rdx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR [r10+488], r8 - sbb rax, QWORD PTR [rdx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rdx+504] - mov QWORD PTR [r10+504], r8 + mov rdx, QWORD PTR [rsp+512] + lea r10, QWORD PTR [rsp+256] + add rdx, 768 + mov r9, 0 + mov r8, QWORD PTR [r10+-256] + sub r8, QWORD PTR [rdx+-256] + mov rax, QWORD PTR [r10+-248] + mov QWORD PTR [r10+-256], r8 + sbb rax, QWORD PTR [rdx+-248] + mov r8, QWORD PTR [r10+-240] + mov QWORD PTR [r10+-248], rax + sbb r8, QWORD PTR [rdx+-240] + mov rax, QWORD PTR [r10+-232] + mov QWORD PTR [r10+-240], r8 + sbb rax, QWORD PTR [rdx+-232] + mov r8, QWORD PTR [r10+-224] + mov QWORD PTR [r10+-232], rax + sbb r8, QWORD PTR [rdx+-224] + mov rax, QWORD PTR [r10+-216] + mov QWORD PTR [r10+-224], r8 + sbb rax, QWORD PTR [rdx+-216] + mov r8, QWORD PTR [r10+-208] + mov QWORD PTR [r10+-216], rax + sbb r8, QWORD PTR [rdx+-208] + mov rax, QWORD PTR [r10+-200] + mov QWORD PTR [r10+-208], r8 + sbb rax, QWORD PTR [rdx+-200] + mov r8, QWORD PTR [r10+-192] + mov QWORD PTR [r10+-200], rax + sbb r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR 
[r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov r8, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], rax + sbb r8, QWORD PTR [rdx+192] + mov rax, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], r8 + sbb rax, QWORD PTR [rdx+200] + mov r8, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], rax + sbb r8, QWORD PTR [rdx+208] + mov rax, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], r8 + sbb rax, QWORD PTR [rdx+216] + mov r8, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], rax + sbb r8, QWORD PTR [rdx+224] + mov rax, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], r8 + sbb rax, QWORD PTR [rdx+232] + mov r8, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], rax + sbb r8, QWORD PTR [rdx+240] + mov rax, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], r8 + sbb rax, QWORD PTR [rdx+248] + mov QWORD PTR [r10+248], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov 
rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR 
[r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rcx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rcx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rcx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rcx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rcx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rcx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rcx+432] - mov r8, QWORD PTR [r10+440] - mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rcx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rcx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rcx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rcx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rcx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rcx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rcx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR [r10+488], r8 - sbb rax, QWORD PTR [rcx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rcx+504] - mov QWORD PTR [r10+504], r8 + sub rdx, 512 + mov r8, QWORD PTR [r10+-256] + sub r8, QWORD PTR [rdx+-256] + mov rax, QWORD PTR [r10+-248] + mov QWORD PTR [r10+-256], r8 + sbb rax, QWORD PTR [rdx+-248] + mov r8, QWORD PTR [r10+-240] + mov QWORD PTR [r10+-248], rax + sbb r8, QWORD PTR [rdx+-240] + mov rax, QWORD PTR [r10+-232] + mov QWORD PTR [r10+-240], r8 + sbb rax, QWORD PTR [rdx+-232] + mov r8, QWORD PTR [r10+-224] + mov QWORD PTR [r10+-232], rax + sbb r8, QWORD PTR [rdx+-224] + mov rax, QWORD PTR [r10+-216] + mov QWORD PTR [r10+-224], r8 + sbb rax, QWORD PTR [rdx+-216] + mov r8, QWORD PTR [r10+-208] + mov QWORD PTR [r10+-216], rax + sbb r8, QWORD PTR [rdx+-208] + mov rax, QWORD PTR [r10+-200] + mov QWORD PTR [r10+-208], r8 + sbb rax, QWORD PTR [rdx+-200] + mov r8, QWORD PTR [r10+-192] + mov QWORD PTR [r10+-200], rax + sbb r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + 
mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov r8, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], rax + 
sbb r8, QWORD PTR [rdx+192] + mov rax, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], r8 + sbb rax, QWORD PTR [rdx+200] + mov r8, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], rax + sbb r8, QWORD PTR [rdx+208] + mov rax, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], r8 + sbb rax, QWORD PTR [rdx+216] + mov r8, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], rax + sbb r8, QWORD PTR [rdx+224] + mov rax, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], r8 + sbb rax, QWORD PTR [rdx+232] + mov r8, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], rax + sbb r8, QWORD PTR [rdx+240] + mov rax, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], r8 + sbb rax, QWORD PTR [rdx+248] + mov QWORD PTR [r10+248], rax sbb r9, 0 - ; Add in place - mov rax, QWORD PTR [rcx+256] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, QWORD PTR [r10+88] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [rcx+384] - mov QWORD PTR [rcx+376], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [rcx+392] - mov QWORD PTR [rcx+384], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [rcx+400] - mov QWORD PTR [rcx+392], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [rcx+408] - mov QWORD PTR [rcx+400], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [rcx+416] - mov QWORD PTR [rcx+408], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [rcx+424] - mov QWORD PTR [rcx+416], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [rcx+432] - mov QWORD PTR [rcx+424], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [rcx+440] - mov QWORD PTR [rcx+432], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [rcx+448] - mov QWORD PTR [rcx+440], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [rcx+456] - mov QWORD PTR [rcx+448], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [rcx+464] - mov QWORD PTR [rcx+456], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [rcx+472] - mov QWORD PTR [rcx+464], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [rcx+480] - mov QWORD PTR [rcx+472], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [rcx+488] - mov QWORD PTR [rcx+480], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [rcx+496] - mov QWORD PTR [rcx+488], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [rcx+504] - mov 
QWORD PTR [rcx+496], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [rcx+512] - mov QWORD PTR [rcx+504], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [r10+376] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [r10+384] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [r10+392] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, QWORD PTR [r10+400] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [r10+408] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR [r10+416] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [r10+424] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [r10+432] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [r10+440] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD PTR [r10+448] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [r10+456] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [r10+464] - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [r10+472] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [r10+480] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [r10+488] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [r10+496] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [r10+504] - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - mov QWORD PTR [rcx+768], r9 - ; Add in place - mov rax, QWORD PTR [rcx+512] - xor r9, r9 - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR 
[rcx+544], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [rcx+768] - mov QWORD PTR [rcx+760], r8 - adc rax, QWORD PTR [rdx+256] - mov QWORD PTR [rcx+768], rax - adc r9, 0 - ; Add to zero - mov rax, QWORD PTR [rdx+264] + mov rcx, QWORD PTR [rsp+512] + neg r9 + add rcx, 512 + mov r8, QWORD PTR [rcx+-256] + sub r8, QWORD PTR [r10+-256] + mov rax, QWORD PTR [rcx+-248] + mov QWORD PTR [rcx+-256], r8 + sbb rax, QWORD PTR [r10+-248] + mov r8, QWORD PTR [rcx+-240] + mov QWORD PTR [rcx+-248], rax + sbb r8, QWORD PTR [r10+-240] + mov rax, QWORD PTR [rcx+-232] + mov QWORD PTR [rcx+-240], r8 + sbb rax, QWORD PTR [r10+-232] + mov r8, QWORD PTR [rcx+-224] + mov QWORD PTR [rcx+-232], rax + sbb r8, QWORD PTR [r10+-224] + mov rax, QWORD PTR [rcx+-216] + mov QWORD PTR [rcx+-224], r8 + sbb rax, QWORD PTR [r10+-216] + mov r8, QWORD PTR [rcx+-208] + mov QWORD PTR [rcx+-216], rax + sbb r8, QWORD PTR [r10+-208] + mov rax, QWORD PTR [rcx+-200] + mov QWORD PTR [rcx+-208], r8 + sbb rax, QWORD PTR [r10+-200] + mov r8, QWORD PTR [rcx+-192] + mov QWORD PTR [rcx+-200], rax + sbb r8, QWORD PTR [r10+-192] + mov rax, QWORD PTR [rcx+-184] + mov QWORD PTR [rcx+-192], r8 + sbb rax, QWORD PTR 
[r10+-184] + mov r8, QWORD PTR [rcx+-176] + mov QWORD PTR [rcx+-184], rax + sbb r8, QWORD PTR [r10+-176] + mov rax, QWORD PTR [rcx+-168] + mov QWORD PTR [rcx+-176], r8 + sbb rax, QWORD PTR [r10+-168] + mov r8, QWORD PTR [rcx+-160] + mov QWORD PTR [rcx+-168], rax + sbb r8, QWORD PTR [r10+-160] + mov rax, QWORD PTR [rcx+-152] + mov QWORD PTR [rcx+-160], r8 + sbb rax, QWORD PTR [r10+-152] + mov r8, QWORD PTR [rcx+-144] + mov QWORD PTR [rcx+-152], rax + sbb r8, QWORD PTR [r10+-144] + mov rax, QWORD PTR [rcx+-136] + mov QWORD PTR [rcx+-144], r8 + sbb rax, QWORD PTR [r10+-136] + mov r8, QWORD PTR [rcx+-128] + mov QWORD PTR [rcx+-136], rax + sbb r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax + sbb r8, QWORD PTR 
[r10+128] + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 + sbb rax, QWORD PTR [r10+136] + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax + sbb r8, QWORD PTR [r10+144] + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 + sbb rax, QWORD PTR [r10+152] + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax + sbb r8, QWORD PTR [r10+160] + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 + sbb rax, QWORD PTR [r10+168] + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax + sbb r8, QWORD PTR [r10+176] + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 + sbb rax, QWORD PTR [r10+184] + mov r8, QWORD PTR [rcx+192] + mov QWORD PTR [rcx+184], rax + sbb r8, QWORD PTR [r10+192] + mov rax, QWORD PTR [rcx+200] + mov QWORD PTR [rcx+192], r8 + sbb rax, QWORD PTR [r10+200] + mov r8, QWORD PTR [rcx+208] + mov QWORD PTR [rcx+200], rax + sbb r8, QWORD PTR [r10+208] + mov rax, QWORD PTR [rcx+216] + mov QWORD PTR [rcx+208], r8 + sbb rax, QWORD PTR [r10+216] + mov r8, QWORD PTR [rcx+224] + mov QWORD PTR [rcx+216], rax + sbb r8, QWORD PTR [r10+224] + mov rax, QWORD PTR [rcx+232] + mov QWORD PTR [rcx+224], r8 + sbb rax, QWORD PTR [r10+232] + mov r8, QWORD PTR [rcx+240] + mov QWORD PTR [rcx+232], rax + sbb r8, QWORD PTR [r10+240] + mov rax, QWORD PTR [rcx+248] + mov QWORD PTR [rcx+240], r8 + sbb rax, QWORD PTR [r10+248] + mov QWORD PTR [rcx+248], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+512] + add rcx, 768 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [rcx+776], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR [rcx+784], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [rcx+792], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [rcx+800], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [rcx+808], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [rcx+816], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [rcx+824], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [rcx+832], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [rcx+840], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [rcx+848], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [rcx+856], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [rcx+864], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [rcx+872], rax + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [rcx+880], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+384] - mov QWORD PTR [rcx+888], rax + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax adc r8, 
0 - mov rax, QWORD PTR [rdx+392] - mov QWORD PTR [rcx+896], r8 + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+400] - mov QWORD PTR [rcx+904], rax + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax adc r8, 0 - mov rax, QWORD PTR [rdx+408] - mov QWORD PTR [rcx+912], r8 + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+416] - mov QWORD PTR [rcx+920], rax + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax adc r8, 0 - mov rax, QWORD PTR [rdx+424] - mov QWORD PTR [rcx+928], r8 + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+432] - mov QWORD PTR [rcx+936], rax + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax adc r8, 0 - mov rax, QWORD PTR [rdx+440] - mov QWORD PTR [rcx+944], r8 + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+448] - mov QWORD PTR [rcx+952], rax + mov r8, QWORD PTR [rcx+192] + mov QWORD PTR [rcx+184], rax adc r8, 0 - mov rax, QWORD PTR [rdx+456] - mov QWORD PTR [rcx+960], r8 + mov rax, QWORD PTR [rcx+200] + mov QWORD PTR [rcx+192], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+464] - mov QWORD PTR [rcx+968], rax + mov r8, QWORD PTR [rcx+208] + mov QWORD PTR [rcx+200], rax adc r8, 0 - mov rax, QWORD PTR [rdx+472] - mov QWORD PTR [rcx+976], r8 + mov rax, QWORD PTR [rcx+216] + mov QWORD PTR [rcx+208], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+480] - mov QWORD PTR [rcx+984], rax + mov r8, QWORD PTR [rcx+224] + mov QWORD PTR [rcx+216], rax adc r8, 0 - mov rax, QWORD PTR [rdx+488] - mov QWORD PTR [rcx+992], r8 + mov rax, QWORD PTR [rcx+232] + mov QWORD PTR [rcx+224], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+496] - mov QWORD PTR [rcx+1000], rax + mov r8, QWORD PTR [rcx+240] + mov QWORD PTR [rcx+232], rax adc r8, 0 - mov rax, QWORD PTR [rdx+504] - mov QWORD PTR [rcx+1008], r8 + mov rax, QWORD PTR [rcx+248] + mov QWORD PTR [rcx+240], r8 adc rax, 0 - mov QWORD PTR [rcx+1016], rax - add rsp, 1304 - pop r12 + mov QWORD PTR [rcx+248], rax + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] + add rsp, 528 ret sp_4096_sqr_64 ENDP _text ENDS IFDEF HAVE_INTEL_AVX2 ; /* Square a and put result in r. (r = a * a) ; * +; * Karatsuba: ah^2, al^2, (al - ah)^2 +; * ; * r A single precision integer. ; * a A single precision integer. 
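The "Karatsuba: ah^2, al^2, (al - ah)^2" note added to this header summarizes the change to both sp_4096_sqr_64 and sp_4096_sqr_avx2_64: instead of forming al + ah (which needs an extra carry word and the pext-masked doubling pass that the removed lines implement), the rewritten code forms |al - ah| with a subtract plus conditional negate and recovers the cross term from 2*al*ah = ah^2 + al^2 - (al - ah)^2. Below is a minimal C sketch of that identity only, not wolfSSL code: the function name sqr_karatsuba32 is hypothetical, and it uses 16-bit halves of a 32-bit limb rather than the 2048-bit halves the assembly operates on.

#include <stdint.h>
#include <stdio.h>

/* Minimal sketch (not wolfSSL code): square a 32-bit value from its 16-bit
 * halves using the same identity the rewritten assembly relies on:
 *   2*ah*al = ah^2 + al^2 - (al - ah)^2
 * so only three squarings plus an absolute difference are needed. */
static uint64_t sqr_karatsuba32(uint32_t a)
{
    uint32_t ah = a >> 16;           /* high half */
    uint32_t al = a & 0xffffU;       /* low half  */
    /* |al - ah|, mirroring the asm's sub/sbb followed by a conditional
     * negate; the sign is irrelevant once the difference is squared. */
    uint32_t d  = (al >= ah) ? (al - ah) : (ah - al);

    uint64_t h = (uint64_t)ah * ah;          /* ah^2            */
    uint64_t l = (uint64_t)al * al;          /* al^2            */
    uint64_t m = h + l - (uint64_t)d * d;    /* 2*ah*al (>= 0)  */

    return (h << 32) + (m << 16) + l;        /* reassemble a^2  */
}

int main(void)
{
    uint32_t a = 0x89abcdefU;
    uint64_t got  = sqr_karatsuba32(a);
    uint64_t want = (uint64_t)a * a;
    printf("%s\n", (got == want) ? "match" : "mismatch");
    return (got == want) ? 0 : 1;
}

Because |al - ah| always fits in the half width, the third squaring needs no extra carry limb; that appears to be why the old pext/adc doubling pass and the carry word stored at [rsp+1296] are removed, and why the stack frame shrinks from 1304 to 528 bytes in the hunks above and below.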
; */ _text SEGMENT READONLY PARA sp_4096_sqr_avx2_64 PROC - push r12 - sub rsp, 1304 - mov QWORD PTR [rsp+1280], rcx - mov QWORD PTR [rsp+1288], rdx - lea r10, QWORD PTR [rsp+1024] + sub rsp, 528 + mov QWORD PTR [rsp+512], rcx + mov QWORD PTR [rsp+520], rdx + mov r9, 0 + mov r10, rsp lea r11, QWORD PTR [rdx+256] - ; Add mov rax, QWORD PTR [rdx] - xor r9, r9 - add rax, QWORD PTR [r11] + sub rax, QWORD PTR [r11] mov r8, QWORD PTR [rdx+8] mov QWORD PTR [r10], rax - adc r8, QWORD PTR [r11+8] + sbb r8, QWORD PTR [r11+8] mov rax, QWORD PTR [rdx+16] mov QWORD PTR [r10+8], r8 - adc rax, QWORD PTR [r11+16] + sbb rax, QWORD PTR [r11+16] mov r8, QWORD PTR [rdx+24] mov QWORD PTR [r10+16], rax - adc r8, QWORD PTR [r11+24] + sbb r8, QWORD PTR [r11+24] mov rax, QWORD PTR [rdx+32] mov QWORD PTR [r10+24], r8 - adc rax, QWORD PTR [r11+32] + sbb rax, QWORD PTR [r11+32] mov r8, QWORD PTR [rdx+40] mov QWORD PTR [r10+32], rax - adc r8, QWORD PTR [r11+40] + sbb r8, QWORD PTR [r11+40] mov rax, QWORD PTR [rdx+48] mov QWORD PTR [r10+40], r8 - adc rax, QWORD PTR [r11+48] + sbb rax, QWORD PTR [r11+48] mov r8, QWORD PTR [rdx+56] mov QWORD PTR [r10+48], rax - adc r8, QWORD PTR [r11+56] + sbb r8, QWORD PTR [r11+56] mov rax, QWORD PTR [rdx+64] mov QWORD PTR [r10+56], r8 - adc rax, QWORD PTR [r11+64] + sbb rax, QWORD PTR [r11+64] mov r8, QWORD PTR [rdx+72] mov QWORD PTR [r10+64], rax - adc r8, QWORD PTR [r11+72] + sbb r8, QWORD PTR [r11+72] mov rax, QWORD PTR [rdx+80] mov QWORD PTR [r10+72], r8 - adc rax, QWORD PTR [r11+80] + sbb rax, QWORD PTR [r11+80] mov r8, QWORD PTR [rdx+88] mov QWORD PTR [r10+80], rax - adc r8, QWORD PTR [r11+88] + sbb r8, QWORD PTR [r11+88] mov rax, QWORD PTR [rdx+96] mov QWORD PTR [r10+88], r8 - adc rax, QWORD PTR [r11+96] + sbb rax, QWORD PTR [r11+96] mov r8, QWORD PTR [rdx+104] mov QWORD PTR [r10+96], rax - adc r8, QWORD PTR [r11+104] + sbb r8, QWORD PTR [r11+104] mov rax, QWORD PTR [rdx+112] mov QWORD PTR [r10+104], r8 - adc rax, QWORD PTR [r11+112] + sbb rax, QWORD PTR [r11+112] mov r8, QWORD PTR [rdx+120] mov QWORD PTR [r10+112], rax - adc r8, QWORD PTR [r11+120] + sbb r8, QWORD PTR [r11+120] mov rax, QWORD PTR [rdx+128] mov QWORD PTR [r10+120], r8 - adc rax, QWORD PTR [r11+128] + sbb rax, QWORD PTR [r11+128] mov r8, QWORD PTR [rdx+136] mov QWORD PTR [r10+128], rax - adc r8, QWORD PTR [r11+136] + sbb r8, QWORD PTR [r11+136] mov rax, QWORD PTR [rdx+144] mov QWORD PTR [r10+136], r8 - adc rax, QWORD PTR [r11+144] + sbb rax, QWORD PTR [r11+144] mov r8, QWORD PTR [rdx+152] mov QWORD PTR [r10+144], rax - adc r8, QWORD PTR [r11+152] + sbb r8, QWORD PTR [r11+152] mov rax, QWORD PTR [rdx+160] mov QWORD PTR [r10+152], r8 - adc rax, QWORD PTR [r11+160] + sbb rax, QWORD PTR [r11+160] mov r8, QWORD PTR [rdx+168] mov QWORD PTR [r10+160], rax - adc r8, QWORD PTR [r11+168] + sbb r8, QWORD PTR [r11+168] mov rax, QWORD PTR [rdx+176] mov QWORD PTR [r10+168], r8 - adc rax, QWORD PTR [r11+176] + sbb rax, QWORD PTR [r11+176] mov r8, QWORD PTR [rdx+184] mov QWORD PTR [r10+176], rax - adc r8, QWORD PTR [r11+184] + sbb r8, QWORD PTR [r11+184] mov rax, QWORD PTR [rdx+192] mov QWORD PTR [r10+184], r8 - adc rax, QWORD PTR [r11+192] + sbb rax, QWORD PTR [r11+192] mov r8, QWORD PTR [rdx+200] mov QWORD PTR [r10+192], rax - adc r8, QWORD PTR [r11+200] + sbb r8, QWORD PTR [r11+200] mov rax, QWORD PTR [rdx+208] mov QWORD PTR [r10+200], r8 - adc rax, QWORD PTR [r11+208] + sbb rax, QWORD PTR [r11+208] mov r8, QWORD PTR [rdx+216] mov QWORD PTR [r10+208], rax - adc r8, QWORD PTR [r11+216] + sbb r8, QWORD PTR [r11+216] mov rax, 
QWORD PTR [rdx+224] mov QWORD PTR [r10+216], r8 - adc rax, QWORD PTR [r11+224] + sbb rax, QWORD PTR [r11+224] mov r8, QWORD PTR [rdx+232] mov QWORD PTR [r10+224], rax - adc r8, QWORD PTR [r11+232] + sbb r8, QWORD PTR [r11+232] mov rax, QWORD PTR [rdx+240] mov QWORD PTR [r10+232], r8 - adc rax, QWORD PTR [r11+240] + sbb rax, QWORD PTR [r11+240] mov r8, QWORD PTR [rdx+248] mov QWORD PTR [r10+240], rax - adc r8, QWORD PTR [r11+248] + sbb r8, QWORD PTR [r11+248] + mov QWORD PTR [r10+248], r8 + sbb r9, 0 + ; Cond Negate + mov rax, QWORD PTR [r10] + mov r11, r9 + xor rax, r9 + neg r11 + sub rax, r9 + mov r8, QWORD PTR [r10+8] + sbb r11, 0 + mov QWORD PTR [r10], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+16] + setc r11b + mov QWORD PTR [r10+8], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+24] + setc r11b + mov QWORD PTR [r10+16], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+32] + setc r11b + mov QWORD PTR [r10+24], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+40] + setc r11b + mov QWORD PTR [r10+32], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+48] + setc r11b + mov QWORD PTR [r10+40], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+56] + setc r11b + mov QWORD PTR [r10+48], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+64] + setc r11b + mov QWORD PTR [r10+56], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+72] + setc r11b + mov QWORD PTR [r10+64], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+80] + setc r11b + mov QWORD PTR [r10+72], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+88] + setc r11b + mov QWORD PTR [r10+80], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+96] + setc r11b + mov QWORD PTR [r10+88], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+104] + setc r11b + mov QWORD PTR [r10+96], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+112] + setc r11b + mov QWORD PTR [r10+104], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+120] + setc r11b + mov QWORD PTR [r10+112], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+128] + setc r11b + mov QWORD PTR [r10+120], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+136] + setc r11b + mov QWORD PTR [r10+128], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+144] + setc r11b + mov QWORD PTR [r10+136], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+152] + setc r11b + mov QWORD PTR [r10+144], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+160] + setc r11b + mov QWORD PTR [r10+152], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+168] + setc r11b + mov QWORD PTR [r10+160], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+176] + setc r11b + mov QWORD PTR [r10+168], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+184] + setc r11b + mov QWORD PTR [r10+176], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+192] + setc r11b + mov QWORD PTR [r10+184], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+200] + setc r11b + mov QWORD PTR [r10+192], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+208] + setc r11b + mov QWORD PTR [r10+200], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+216] + setc r11b + mov QWORD PTR [r10+208], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+224] + setc r11b + mov QWORD PTR [r10+216], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+232] + setc r11b + mov QWORD PTR [r10+224], rax + xor r8, r9 + add r8, r11 + mov rax, QWORD PTR [r10+240] + setc r11b + mov QWORD 
PTR [r10+232], r8 + xor rax, r9 + add rax, r11 + mov r8, QWORD PTR [r10+248] + setc r11b + mov QWORD PTR [r10+240], rax + xor r8, r9 + add r8, r11 mov QWORD PTR [r10+248], r8 - adc r9, 0 - mov QWORD PTR [rsp+1296], r9 mov rdx, r10 mov rcx, rsp call sp_2048_sqr_avx2_32 - mov rdx, QWORD PTR [rsp+1288] - lea rcx, QWORD PTR [rsp+512] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] add rdx, 256 + add rcx, 512 call sp_2048_sqr_avx2_32 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] call sp_2048_sqr_avx2_32 IFDEF _WIN64 - mov rdx, QWORD PTR [rsp+1288] - mov rcx, QWORD PTR [rsp+1280] + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] ENDIF - mov r12, QWORD PTR [rsp+1296] - lea r10, QWORD PTR [rsp+1024] - mov r9, r12 - neg r12 - mov rax, QWORD PTR [r10] - pext rax, rax, r12 - add rax, rax - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [rcx+512], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [rcx+520], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [rcx+528], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [rcx+536], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [rcx+544], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [rcx+552], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [rcx+560], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [rcx+568], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [rcx+576], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [rcx+584], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [rcx+592], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [rcx+600], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [rcx+608], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [rcx+616], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [rcx+624], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [rcx+632], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [rcx+640], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [rcx+648], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [rcx+656], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [rcx+664], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [rcx+672], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [rcx+680], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [rcx+688], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [rcx+696], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [rcx+704], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [rcx+712], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [rcx+720], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [rcx+728], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, 
QWORD PTR [r10+232] - mov QWORD PTR [rcx+736], rax - pext r8, r8, r12 - adc r8, r8 - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [rcx+744], r8 - pext rax, rax, r12 - adc rax, rax - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [rcx+752], rax - pext r8, r8, r12 - adc r8, r8 - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - lea rdx, QWORD PTR [rsp+512] - mov r10, rsp - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rdx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax - sbb r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rdx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rdx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rdx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rdx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, 
QWORD PTR [rdx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rdx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rdx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rdx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rdx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rdx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rdx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rdx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rdx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rdx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rdx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rdx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rdx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rdx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rdx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rdx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rdx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rdx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rdx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rdx+432] - mov r8, QWORD PTR [r10+440] - mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rdx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rdx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rdx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rdx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rdx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rdx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rdx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR [r10+488], r8 - sbb rax, QWORD PTR [rdx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rdx+504] - mov QWORD PTR [r10+504], r8 + mov rdx, QWORD PTR [rsp+512] + lea r10, QWORD PTR [rsp+256] + add rdx, 768 + mov r9, 0 + mov r8, QWORD PTR [r10+-256] + sub r8, QWORD PTR [rdx+-256] + mov rax, QWORD PTR [r10+-248] + mov QWORD PTR [r10+-256], r8 + sbb rax, QWORD PTR [rdx+-248] + mov r8, QWORD PTR [r10+-240] + mov QWORD PTR [r10+-248], rax + sbb r8, QWORD PTR [rdx+-240] + mov rax, QWORD PTR [r10+-232] + mov QWORD PTR [r10+-240], r8 + sbb rax, QWORD PTR [rdx+-232] + mov r8, QWORD PTR [r10+-224] + mov QWORD PTR [r10+-232], rax + sbb r8, QWORD PTR [rdx+-224] + mov rax, QWORD PTR [r10+-216] + mov QWORD PTR [r10+-224], r8 + sbb rax, QWORD PTR [rdx+-216] + mov r8, QWORD PTR [r10+-208] + mov QWORD PTR [r10+-216], rax + sbb r8, QWORD PTR [rdx+-208] + mov rax, QWORD PTR [r10+-200] + mov QWORD PTR [r10+-208], r8 + sbb rax, QWORD PTR [rdx+-200] + mov r8, QWORD PTR [r10+-192] + mov QWORD PTR [r10+-200], rax + sbb r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + 
mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov 
QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov r8, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], rax + sbb r8, QWORD PTR [rdx+192] + mov rax, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], r8 + sbb rax, QWORD PTR [rdx+200] + mov r8, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], rax + sbb r8, QWORD PTR [rdx+208] + mov rax, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], r8 + sbb rax, QWORD PTR [rdx+216] + mov r8, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], rax + sbb r8, QWORD PTR [rdx+224] + mov rax, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], r8 + sbb rax, QWORD PTR [rdx+232] + mov r8, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], rax + sbb r8, QWORD PTR [rdx+240] + mov rax, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], r8 + sbb rax, QWORD PTR [rdx+248] + mov QWORD PTR [r10+248], rax sbb r9, 0 - mov rax, QWORD PTR [r10] - sub rax, QWORD PTR [rcx] - mov r8, QWORD PTR [r10+8] - mov QWORD PTR [r10], rax - sbb r8, QWORD PTR [rcx+8] - mov rax, QWORD PTR [r10+16] - mov QWORD PTR [r10+8], r8 - sbb rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [r10+24] - mov QWORD PTR [r10+16], rax - sbb r8, QWORD PTR [rcx+24] - mov rax, QWORD PTR [r10+32] - mov QWORD PTR [r10+24], r8 - sbb rax, QWORD PTR [rcx+32] - mov r8, QWORD PTR [r10+40] - mov QWORD PTR [r10+32], rax - sbb r8, QWORD PTR [rcx+40] - mov rax, QWORD PTR [r10+48] - mov QWORD PTR [r10+40], r8 - sbb rax, QWORD PTR [rcx+48] - mov r8, QWORD PTR [r10+56] - mov QWORD PTR [r10+48], rax - sbb r8, QWORD PTR [rcx+56] - mov rax, QWORD PTR [r10+64] - mov QWORD PTR [r10+56], r8 - sbb rax, QWORD PTR [rcx+64] - mov r8, QWORD PTR [r10+72] - mov QWORD PTR [r10+64], rax - sbb r8, QWORD PTR [rcx+72] - mov rax, QWORD PTR [r10+80] - mov QWORD PTR [r10+72], r8 - sbb rax, QWORD PTR [rcx+80] - mov r8, QWORD PTR [r10+88] - mov QWORD PTR [r10+80], rax - sbb r8, QWORD PTR [rcx+88] - mov rax, QWORD PTR [r10+96] - mov QWORD PTR [r10+88], r8 - sbb rax, QWORD PTR [rcx+96] - mov r8, QWORD PTR [r10+104] - mov QWORD PTR [r10+96], rax - sbb r8, QWORD PTR [rcx+104] - mov rax, QWORD PTR [r10+112] - mov QWORD PTR [r10+104], r8 - sbb rax, QWORD PTR [rcx+112] - mov r8, QWORD PTR [r10+120] - mov QWORD PTR [r10+112], rax - sbb r8, QWORD PTR [rcx+120] - mov rax, QWORD PTR [r10+128] - mov QWORD PTR [r10+120], r8 - sbb rax, QWORD PTR [rcx+128] - mov r8, QWORD PTR [r10+136] - mov QWORD PTR [r10+128], rax - sbb r8, QWORD PTR [rcx+136] - mov rax, QWORD PTR [r10+144] - mov QWORD PTR [r10+136], r8 - sbb rax, QWORD PTR [rcx+144] - mov r8, QWORD PTR [r10+152] - mov QWORD PTR [r10+144], rax - sbb r8, QWORD PTR [rcx+152] - mov rax, QWORD PTR [r10+160] - mov QWORD PTR [r10+152], r8 - sbb rax, QWORD PTR [rcx+160] - mov r8, QWORD PTR [r10+168] - mov QWORD PTR [r10+160], rax - sbb r8, QWORD PTR [rcx+168] - mov rax, QWORD PTR [r10+176] - mov QWORD PTR [r10+168], r8 - sbb rax, QWORD PTR [rcx+176] - mov r8, QWORD PTR [r10+184] - mov QWORD PTR [r10+176], rax 
- sbb r8, QWORD PTR [rcx+184] - mov rax, QWORD PTR [r10+192] - mov QWORD PTR [r10+184], r8 - sbb rax, QWORD PTR [rcx+192] - mov r8, QWORD PTR [r10+200] - mov QWORD PTR [r10+192], rax - sbb r8, QWORD PTR [rcx+200] - mov rax, QWORD PTR [r10+208] - mov QWORD PTR [r10+200], r8 - sbb rax, QWORD PTR [rcx+208] - mov r8, QWORD PTR [r10+216] - mov QWORD PTR [r10+208], rax - sbb r8, QWORD PTR [rcx+216] - mov rax, QWORD PTR [r10+224] - mov QWORD PTR [r10+216], r8 - sbb rax, QWORD PTR [rcx+224] - mov r8, QWORD PTR [r10+232] - mov QWORD PTR [r10+224], rax - sbb r8, QWORD PTR [rcx+232] - mov rax, QWORD PTR [r10+240] - mov QWORD PTR [r10+232], r8 - sbb rax, QWORD PTR [rcx+240] - mov r8, QWORD PTR [r10+248] - mov QWORD PTR [r10+240], rax - sbb r8, QWORD PTR [rcx+248] - mov rax, QWORD PTR [r10+256] - mov QWORD PTR [r10+248], r8 - sbb rax, QWORD PTR [rcx+256] - mov r8, QWORD PTR [r10+264] - mov QWORD PTR [r10+256], rax - sbb r8, QWORD PTR [rcx+264] - mov rax, QWORD PTR [r10+272] - mov QWORD PTR [r10+264], r8 - sbb rax, QWORD PTR [rcx+272] - mov r8, QWORD PTR [r10+280] - mov QWORD PTR [r10+272], rax - sbb r8, QWORD PTR [rcx+280] - mov rax, QWORD PTR [r10+288] - mov QWORD PTR [r10+280], r8 - sbb rax, QWORD PTR [rcx+288] - mov r8, QWORD PTR [r10+296] - mov QWORD PTR [r10+288], rax - sbb r8, QWORD PTR [rcx+296] - mov rax, QWORD PTR [r10+304] - mov QWORD PTR [r10+296], r8 - sbb rax, QWORD PTR [rcx+304] - mov r8, QWORD PTR [r10+312] - mov QWORD PTR [r10+304], rax - sbb r8, QWORD PTR [rcx+312] - mov rax, QWORD PTR [r10+320] - mov QWORD PTR [r10+312], r8 - sbb rax, QWORD PTR [rcx+320] - mov r8, QWORD PTR [r10+328] - mov QWORD PTR [r10+320], rax - sbb r8, QWORD PTR [rcx+328] - mov rax, QWORD PTR [r10+336] - mov QWORD PTR [r10+328], r8 - sbb rax, QWORD PTR [rcx+336] - mov r8, QWORD PTR [r10+344] - mov QWORD PTR [r10+336], rax - sbb r8, QWORD PTR [rcx+344] - mov rax, QWORD PTR [r10+352] - mov QWORD PTR [r10+344], r8 - sbb rax, QWORD PTR [rcx+352] - mov r8, QWORD PTR [r10+360] - mov QWORD PTR [r10+352], rax - sbb r8, QWORD PTR [rcx+360] - mov rax, QWORD PTR [r10+368] - mov QWORD PTR [r10+360], r8 - sbb rax, QWORD PTR [rcx+368] - mov r8, QWORD PTR [r10+376] - mov QWORD PTR [r10+368], rax - sbb r8, QWORD PTR [rcx+376] - mov rax, QWORD PTR [r10+384] - mov QWORD PTR [r10+376], r8 - sbb rax, QWORD PTR [rcx+384] - mov r8, QWORD PTR [r10+392] - mov QWORD PTR [r10+384], rax - sbb r8, QWORD PTR [rcx+392] - mov rax, QWORD PTR [r10+400] - mov QWORD PTR [r10+392], r8 - sbb rax, QWORD PTR [rcx+400] - mov r8, QWORD PTR [r10+408] - mov QWORD PTR [r10+400], rax - sbb r8, QWORD PTR [rcx+408] - mov rax, QWORD PTR [r10+416] - mov QWORD PTR [r10+408], r8 - sbb rax, QWORD PTR [rcx+416] - mov r8, QWORD PTR [r10+424] - mov QWORD PTR [r10+416], rax - sbb r8, QWORD PTR [rcx+424] - mov rax, QWORD PTR [r10+432] - mov QWORD PTR [r10+424], r8 - sbb rax, QWORD PTR [rcx+432] - mov r8, QWORD PTR [r10+440] - mov QWORD PTR [r10+432], rax - sbb r8, QWORD PTR [rcx+440] - mov rax, QWORD PTR [r10+448] - mov QWORD PTR [r10+440], r8 - sbb rax, QWORD PTR [rcx+448] - mov r8, QWORD PTR [r10+456] - mov QWORD PTR [r10+448], rax - sbb r8, QWORD PTR [rcx+456] - mov rax, QWORD PTR [r10+464] - mov QWORD PTR [r10+456], r8 - sbb rax, QWORD PTR [rcx+464] - mov r8, QWORD PTR [r10+472] - mov QWORD PTR [r10+464], rax - sbb r8, QWORD PTR [rcx+472] - mov rax, QWORD PTR [r10+480] - mov QWORD PTR [r10+472], r8 - sbb rax, QWORD PTR [rcx+480] - mov r8, QWORD PTR [r10+488] - mov QWORD PTR [r10+480], rax - sbb r8, QWORD PTR [rcx+488] - mov rax, QWORD PTR [r10+496] - mov QWORD PTR 
[r10+488], r8 - sbb rax, QWORD PTR [rcx+496] - mov r8, QWORD PTR [r10+504] - mov QWORD PTR [r10+496], rax - sbb r8, QWORD PTR [rcx+504] - mov QWORD PTR [r10+504], r8 + sub rdx, 512 + mov r8, QWORD PTR [r10+-256] + sub r8, QWORD PTR [rdx+-256] + mov rax, QWORD PTR [r10+-248] + mov QWORD PTR [r10+-256], r8 + sbb rax, QWORD PTR [rdx+-248] + mov r8, QWORD PTR [r10+-240] + mov QWORD PTR [r10+-248], rax + sbb r8, QWORD PTR [rdx+-240] + mov rax, QWORD PTR [r10+-232] + mov QWORD PTR [r10+-240], r8 + sbb rax, QWORD PTR [rdx+-232] + mov r8, QWORD PTR [r10+-224] + mov QWORD PTR [r10+-232], rax + sbb r8, QWORD PTR [rdx+-224] + mov rax, QWORD PTR [r10+-216] + mov QWORD PTR [r10+-224], r8 + sbb rax, QWORD PTR [rdx+-216] + mov r8, QWORD PTR [r10+-208] + mov QWORD PTR [r10+-216], rax + sbb r8, QWORD PTR [rdx+-208] + mov rax, QWORD PTR [r10+-200] + mov QWORD PTR [r10+-208], r8 + sbb rax, QWORD PTR [rdx+-200] + mov r8, QWORD PTR [r10+-192] + mov QWORD PTR [r10+-200], rax + sbb r8, QWORD PTR [rdx+-192] + mov rax, QWORD PTR [r10+-184] + mov QWORD PTR [r10+-192], r8 + sbb rax, QWORD PTR [rdx+-184] + mov r8, QWORD PTR [r10+-176] + mov QWORD PTR [r10+-184], rax + sbb r8, QWORD PTR [rdx+-176] + mov rax, QWORD PTR [r10+-168] + mov QWORD PTR [r10+-176], r8 + sbb rax, QWORD PTR [rdx+-168] + mov r8, QWORD PTR [r10+-160] + mov QWORD PTR [r10+-168], rax + sbb r8, QWORD PTR [rdx+-160] + mov rax, QWORD PTR [r10+-152] + mov QWORD PTR [r10+-160], r8 + sbb rax, QWORD PTR [rdx+-152] + mov r8, QWORD PTR [r10+-144] + mov QWORD PTR [r10+-152], rax + sbb r8, QWORD PTR [rdx+-144] + mov rax, QWORD PTR [r10+-136] + mov QWORD PTR [r10+-144], r8 + sbb rax, QWORD PTR [rdx+-136] + mov r8, QWORD PTR [r10+-128] + mov QWORD PTR [r10+-136], rax + sbb r8, QWORD PTR [rdx+-128] + mov rax, QWORD PTR [r10+-120] + mov QWORD PTR [r10+-128], r8 + sbb rax, QWORD PTR [rdx+-120] + mov r8, QWORD PTR [r10+-112] + mov QWORD PTR [r10+-120], rax + sbb r8, QWORD PTR [rdx+-112] + mov rax, QWORD PTR [r10+-104] + mov QWORD PTR [r10+-112], r8 + sbb rax, QWORD PTR [rdx+-104] + mov r8, QWORD PTR [r10+-96] + mov QWORD PTR [r10+-104], rax + sbb r8, QWORD PTR [rdx+-96] + mov rax, QWORD PTR [r10+-88] + mov QWORD PTR [r10+-96], r8 + sbb rax, QWORD PTR [rdx+-88] + mov r8, QWORD PTR [r10+-80] + mov QWORD PTR [r10+-88], rax + sbb r8, QWORD PTR [rdx+-80] + mov rax, QWORD PTR [r10+-72] + mov QWORD PTR [r10+-80], r8 + sbb rax, QWORD PTR [rdx+-72] + mov r8, QWORD PTR [r10+-64] + mov QWORD PTR [r10+-72], rax + sbb r8, QWORD PTR [rdx+-64] + mov rax, QWORD PTR [r10+-56] + mov QWORD PTR [r10+-64], r8 + sbb rax, QWORD PTR [rdx+-56] + mov r8, QWORD PTR [r10+-48] + mov QWORD PTR [r10+-56], rax + sbb r8, QWORD PTR [rdx+-48] + mov rax, QWORD PTR [r10+-40] + mov QWORD PTR [r10+-48], r8 + sbb rax, QWORD PTR [rdx+-40] + mov r8, QWORD PTR [r10+-32] + mov QWORD PTR [r10+-40], rax + sbb r8, QWORD PTR [rdx+-32] + mov rax, QWORD PTR [r10+-24] + mov QWORD PTR [r10+-32], r8 + sbb rax, QWORD PTR [rdx+-24] + mov r8, QWORD PTR [r10+-16] + mov QWORD PTR [r10+-24], rax + sbb r8, QWORD PTR [rdx+-16] + mov rax, QWORD PTR [r10+-8] + mov QWORD PTR [r10+-16], r8 + sbb rax, QWORD PTR [rdx+-8] + mov r8, QWORD PTR [r10] + mov QWORD PTR [r10+-8], rax + sbb r8, QWORD PTR [rdx] + mov rax, QWORD PTR [r10+8] + mov QWORD PTR [r10], r8 + sbb rax, QWORD PTR [rdx+8] + mov r8, QWORD PTR [r10+16] + mov QWORD PTR [r10+8], rax + sbb r8, QWORD PTR [rdx+16] + mov rax, QWORD PTR [r10+24] + mov QWORD PTR [r10+16], r8 + sbb rax, QWORD PTR [rdx+24] + mov r8, QWORD PTR [r10+32] + mov QWORD PTR [r10+24], rax + sbb r8, QWORD 
PTR [rdx+32] + mov rax, QWORD PTR [r10+40] + mov QWORD PTR [r10+32], r8 + sbb rax, QWORD PTR [rdx+40] + mov r8, QWORD PTR [r10+48] + mov QWORD PTR [r10+40], rax + sbb r8, QWORD PTR [rdx+48] + mov rax, QWORD PTR [r10+56] + mov QWORD PTR [r10+48], r8 + sbb rax, QWORD PTR [rdx+56] + mov r8, QWORD PTR [r10+64] + mov QWORD PTR [r10+56], rax + sbb r8, QWORD PTR [rdx+64] + mov rax, QWORD PTR [r10+72] + mov QWORD PTR [r10+64], r8 + sbb rax, QWORD PTR [rdx+72] + mov r8, QWORD PTR [r10+80] + mov QWORD PTR [r10+72], rax + sbb r8, QWORD PTR [rdx+80] + mov rax, QWORD PTR [r10+88] + mov QWORD PTR [r10+80], r8 + sbb rax, QWORD PTR [rdx+88] + mov r8, QWORD PTR [r10+96] + mov QWORD PTR [r10+88], rax + sbb r8, QWORD PTR [rdx+96] + mov rax, QWORD PTR [r10+104] + mov QWORD PTR [r10+96], r8 + sbb rax, QWORD PTR [rdx+104] + mov r8, QWORD PTR [r10+112] + mov QWORD PTR [r10+104], rax + sbb r8, QWORD PTR [rdx+112] + mov rax, QWORD PTR [r10+120] + mov QWORD PTR [r10+112], r8 + sbb rax, QWORD PTR [rdx+120] + mov r8, QWORD PTR [r10+128] + mov QWORD PTR [r10+120], rax + sbb r8, QWORD PTR [rdx+128] + mov rax, QWORD PTR [r10+136] + mov QWORD PTR [r10+128], r8 + sbb rax, QWORD PTR [rdx+136] + mov r8, QWORD PTR [r10+144] + mov QWORD PTR [r10+136], rax + sbb r8, QWORD PTR [rdx+144] + mov rax, QWORD PTR [r10+152] + mov QWORD PTR [r10+144], r8 + sbb rax, QWORD PTR [rdx+152] + mov r8, QWORD PTR [r10+160] + mov QWORD PTR [r10+152], rax + sbb r8, QWORD PTR [rdx+160] + mov rax, QWORD PTR [r10+168] + mov QWORD PTR [r10+160], r8 + sbb rax, QWORD PTR [rdx+168] + mov r8, QWORD PTR [r10+176] + mov QWORD PTR [r10+168], rax + sbb r8, QWORD PTR [rdx+176] + mov rax, QWORD PTR [r10+184] + mov QWORD PTR [r10+176], r8 + sbb rax, QWORD PTR [rdx+184] + mov r8, QWORD PTR [r10+192] + mov QWORD PTR [r10+184], rax + sbb r8, QWORD PTR [rdx+192] + mov rax, QWORD PTR [r10+200] + mov QWORD PTR [r10+192], r8 + sbb rax, QWORD PTR [rdx+200] + mov r8, QWORD PTR [r10+208] + mov QWORD PTR [r10+200], rax + sbb r8, QWORD PTR [rdx+208] + mov rax, QWORD PTR [r10+216] + mov QWORD PTR [r10+208], r8 + sbb rax, QWORD PTR [rdx+216] + mov r8, QWORD PTR [r10+224] + mov QWORD PTR [r10+216], rax + sbb r8, QWORD PTR [rdx+224] + mov rax, QWORD PTR [r10+232] + mov QWORD PTR [r10+224], r8 + sbb rax, QWORD PTR [rdx+232] + mov r8, QWORD PTR [r10+240] + mov QWORD PTR [r10+232], rax + sbb r8, QWORD PTR [rdx+240] + mov rax, QWORD PTR [r10+248] + mov QWORD PTR [r10+240], r8 + sbb rax, QWORD PTR [rdx+248] + mov QWORD PTR [r10+248], rax sbb r9, 0 - ; Add in place - mov rax, QWORD PTR [rcx+256] - add rax, QWORD PTR [r10] - mov r8, QWORD PTR [rcx+264] - mov QWORD PTR [rcx+256], rax - adc r8, QWORD PTR [r10+8] - mov rax, QWORD PTR [rcx+272] - mov QWORD PTR [rcx+264], r8 - adc rax, QWORD PTR [r10+16] - mov r8, QWORD PTR [rcx+280] - mov QWORD PTR [rcx+272], rax - adc r8, QWORD PTR [r10+24] - mov rax, QWORD PTR [rcx+288] - mov QWORD PTR [rcx+280], r8 - adc rax, QWORD PTR [r10+32] - mov r8, QWORD PTR [rcx+296] - mov QWORD PTR [rcx+288], rax - adc r8, QWORD PTR [r10+40] - mov rax, QWORD PTR [rcx+304] - mov QWORD PTR [rcx+296], r8 - adc rax, QWORD PTR [r10+48] - mov r8, QWORD PTR [rcx+312] - mov QWORD PTR [rcx+304], rax - adc r8, QWORD PTR [r10+56] - mov rax, QWORD PTR [rcx+320] - mov QWORD PTR [rcx+312], r8 - adc rax, QWORD PTR [r10+64] - mov r8, QWORD PTR [rcx+328] - mov QWORD PTR [rcx+320], rax - adc r8, QWORD PTR [r10+72] - mov rax, QWORD PTR [rcx+336] - mov QWORD PTR [rcx+328], r8 - adc rax, QWORD PTR [r10+80] - mov r8, QWORD PTR [rcx+344] - mov QWORD PTR [rcx+336], rax - adc r8, 
QWORD PTR [r10+88] - mov rax, QWORD PTR [rcx+352] - mov QWORD PTR [rcx+344], r8 - adc rax, QWORD PTR [r10+96] - mov r8, QWORD PTR [rcx+360] - mov QWORD PTR [rcx+352], rax - adc r8, QWORD PTR [r10+104] - mov rax, QWORD PTR [rcx+368] - mov QWORD PTR [rcx+360], r8 - adc rax, QWORD PTR [r10+112] - mov r8, QWORD PTR [rcx+376] - mov QWORD PTR [rcx+368], rax - adc r8, QWORD PTR [r10+120] - mov rax, QWORD PTR [rcx+384] - mov QWORD PTR [rcx+376], r8 - adc rax, QWORD PTR [r10+128] - mov r8, QWORD PTR [rcx+392] - mov QWORD PTR [rcx+384], rax - adc r8, QWORD PTR [r10+136] - mov rax, QWORD PTR [rcx+400] - mov QWORD PTR [rcx+392], r8 - adc rax, QWORD PTR [r10+144] - mov r8, QWORD PTR [rcx+408] - mov QWORD PTR [rcx+400], rax - adc r8, QWORD PTR [r10+152] - mov rax, QWORD PTR [rcx+416] - mov QWORD PTR [rcx+408], r8 - adc rax, QWORD PTR [r10+160] - mov r8, QWORD PTR [rcx+424] - mov QWORD PTR [rcx+416], rax - adc r8, QWORD PTR [r10+168] - mov rax, QWORD PTR [rcx+432] - mov QWORD PTR [rcx+424], r8 - adc rax, QWORD PTR [r10+176] - mov r8, QWORD PTR [rcx+440] - mov QWORD PTR [rcx+432], rax - adc r8, QWORD PTR [r10+184] - mov rax, QWORD PTR [rcx+448] - mov QWORD PTR [rcx+440], r8 - adc rax, QWORD PTR [r10+192] - mov r8, QWORD PTR [rcx+456] - mov QWORD PTR [rcx+448], rax - adc r8, QWORD PTR [r10+200] - mov rax, QWORD PTR [rcx+464] - mov QWORD PTR [rcx+456], r8 - adc rax, QWORD PTR [r10+208] - mov r8, QWORD PTR [rcx+472] - mov QWORD PTR [rcx+464], rax - adc r8, QWORD PTR [r10+216] - mov rax, QWORD PTR [rcx+480] - mov QWORD PTR [rcx+472], r8 - adc rax, QWORD PTR [r10+224] - mov r8, QWORD PTR [rcx+488] - mov QWORD PTR [rcx+480], rax - adc r8, QWORD PTR [r10+232] - mov rax, QWORD PTR [rcx+496] - mov QWORD PTR [rcx+488], r8 - adc rax, QWORD PTR [r10+240] - mov r8, QWORD PTR [rcx+504] - mov QWORD PTR [rcx+496], rax - adc r8, QWORD PTR [r10+248] - mov rax, QWORD PTR [rcx+512] - mov QWORD PTR [rcx+504], r8 - adc rax, QWORD PTR [r10+256] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [r10+264] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [r10+272] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [r10+280] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [r10+288] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, QWORD PTR [r10+296] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [r10+304] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [r10+312] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [r10+320] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [r10+328] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [r10+336] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [r10+344] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [r10+352] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [r10+360] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [r10+368] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [r10+376] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [r10+384] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [r10+392] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], 
r8 - adc rax, QWORD PTR [r10+400] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [r10+408] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR [r10+416] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [r10+424] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [r10+432] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [r10+440] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD PTR [r10+448] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [r10+456] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [r10+464] - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [r10+472] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [r10+480] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [r10+488] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [r10+496] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [r10+504] - mov QWORD PTR [rcx+760], r8 - adc r9, 0 - mov QWORD PTR [rcx+768], r9 - ; Add in place - mov rax, QWORD PTR [rcx+512] - xor r9, r9 - add rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rcx+520] - mov QWORD PTR [rcx+512], rax - adc r8, QWORD PTR [rdx+8] - mov rax, QWORD PTR [rcx+528] - mov QWORD PTR [rcx+520], r8 - adc rax, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rcx+536] - mov QWORD PTR [rcx+528], rax - adc r8, QWORD PTR [rdx+24] - mov rax, QWORD PTR [rcx+544] - mov QWORD PTR [rcx+536], r8 - adc rax, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rcx+552] - mov QWORD PTR [rcx+544], rax - adc r8, QWORD PTR [rdx+40] - mov rax, QWORD PTR [rcx+560] - mov QWORD PTR [rcx+552], r8 - adc rax, QWORD PTR [rdx+48] - mov r8, QWORD PTR [rcx+568] - mov QWORD PTR [rcx+560], rax - adc r8, QWORD PTR [rdx+56] - mov rax, QWORD PTR [rcx+576] - mov QWORD PTR [rcx+568], r8 - adc rax, QWORD PTR [rdx+64] - mov r8, QWORD PTR [rcx+584] - mov QWORD PTR [rcx+576], rax - adc r8, QWORD PTR [rdx+72] - mov rax, QWORD PTR [rcx+592] - mov QWORD PTR [rcx+584], r8 - adc rax, QWORD PTR [rdx+80] - mov r8, QWORD PTR [rcx+600] - mov QWORD PTR [rcx+592], rax - adc r8, QWORD PTR [rdx+88] - mov rax, QWORD PTR [rcx+608] - mov QWORD PTR [rcx+600], r8 - adc rax, QWORD PTR [rdx+96] - mov r8, QWORD PTR [rcx+616] - mov QWORD PTR [rcx+608], rax - adc r8, QWORD PTR [rdx+104] - mov rax, QWORD PTR [rcx+624] - mov QWORD PTR [rcx+616], r8 - adc rax, QWORD PTR [rdx+112] - mov r8, QWORD PTR [rcx+632] - mov QWORD PTR [rcx+624], rax - adc r8, QWORD PTR [rdx+120] - mov rax, QWORD PTR [rcx+640] - mov QWORD PTR [rcx+632], r8 - adc rax, QWORD PTR [rdx+128] - mov r8, QWORD PTR [rcx+648] - mov QWORD PTR [rcx+640], rax - adc r8, QWORD PTR [rdx+136] - mov rax, QWORD PTR [rcx+656] - mov QWORD PTR [rcx+648], r8 - adc rax, QWORD PTR [rdx+144] - mov r8, QWORD PTR [rcx+664] - mov QWORD PTR [rcx+656], rax - adc r8, QWORD PTR [rdx+152] - mov rax, QWORD PTR [rcx+672] - mov QWORD PTR [rcx+664], r8 - adc rax, QWORD PTR [rdx+160] - mov r8, QWORD PTR [rcx+680] - mov QWORD PTR [rcx+672], rax - adc r8, QWORD PTR [rdx+168] - mov rax, QWORD PTR [rcx+688] - mov QWORD PTR [rcx+680], r8 - adc rax, QWORD PTR [rdx+176] - mov r8, QWORD PTR [rcx+696] - mov QWORD PTR [rcx+688], rax - adc r8, QWORD PTR [rdx+184] - mov rax, QWORD PTR [rcx+704] - mov QWORD PTR [rcx+696], r8 - adc rax, QWORD 
PTR [rdx+192] - mov r8, QWORD PTR [rcx+712] - mov QWORD PTR [rcx+704], rax - adc r8, QWORD PTR [rdx+200] - mov rax, QWORD PTR [rcx+720] - mov QWORD PTR [rcx+712], r8 - adc rax, QWORD PTR [rdx+208] - mov r8, QWORD PTR [rcx+728] - mov QWORD PTR [rcx+720], rax - adc r8, QWORD PTR [rdx+216] - mov rax, QWORD PTR [rcx+736] - mov QWORD PTR [rcx+728], r8 - adc rax, QWORD PTR [rdx+224] - mov r8, QWORD PTR [rcx+744] - mov QWORD PTR [rcx+736], rax - adc r8, QWORD PTR [rdx+232] - mov rax, QWORD PTR [rcx+752] - mov QWORD PTR [rcx+744], r8 - adc rax, QWORD PTR [rdx+240] - mov r8, QWORD PTR [rcx+760] - mov QWORD PTR [rcx+752], rax - adc r8, QWORD PTR [rdx+248] - mov rax, QWORD PTR [rcx+768] - mov QWORD PTR [rcx+760], r8 - adc rax, QWORD PTR [rdx+256] - mov QWORD PTR [rcx+768], rax - adc r9, 0 - ; Add to zero - mov rax, QWORD PTR [rdx+264] + mov rcx, QWORD PTR [rsp+512] + neg r9 + add rcx, 512 + mov r8, QWORD PTR [rcx+-256] + sub r8, QWORD PTR [r10+-256] + mov rax, QWORD PTR [rcx+-248] + mov QWORD PTR [rcx+-256], r8 + sbb rax, QWORD PTR [r10+-248] + mov r8, QWORD PTR [rcx+-240] + mov QWORD PTR [rcx+-248], rax + sbb r8, QWORD PTR [r10+-240] + mov rax, QWORD PTR [rcx+-232] + mov QWORD PTR [rcx+-240], r8 + sbb rax, QWORD PTR [r10+-232] + mov r8, QWORD PTR [rcx+-224] + mov QWORD PTR [rcx+-232], rax + sbb r8, QWORD PTR [r10+-224] + mov rax, QWORD PTR [rcx+-216] + mov QWORD PTR [rcx+-224], r8 + sbb rax, QWORD PTR [r10+-216] + mov r8, QWORD PTR [rcx+-208] + mov QWORD PTR [rcx+-216], rax + sbb r8, QWORD PTR [r10+-208] + mov rax, QWORD PTR [rcx+-200] + mov QWORD PTR [rcx+-208], r8 + sbb rax, QWORD PTR [r10+-200] + mov r8, QWORD PTR [rcx+-192] + mov QWORD PTR [rcx+-200], rax + sbb r8, QWORD PTR [r10+-192] + mov rax, QWORD PTR [rcx+-184] + mov QWORD PTR [rcx+-192], r8 + sbb rax, QWORD PTR [r10+-184] + mov r8, QWORD PTR [rcx+-176] + mov QWORD PTR [rcx+-184], rax + sbb r8, QWORD PTR [r10+-176] + mov rax, QWORD PTR [rcx+-168] + mov QWORD PTR [rcx+-176], r8 + sbb rax, QWORD PTR [r10+-168] + mov r8, QWORD PTR [rcx+-160] + mov QWORD PTR [rcx+-168], rax + sbb r8, QWORD PTR [r10+-160] + mov rax, QWORD PTR [rcx+-152] + mov QWORD PTR [rcx+-160], r8 + sbb rax, QWORD PTR [r10+-152] + mov r8, QWORD PTR [rcx+-144] + mov QWORD PTR [rcx+-152], rax + sbb r8, QWORD PTR [r10+-144] + mov rax, QWORD PTR [rcx+-136] + mov QWORD PTR [rcx+-144], r8 + sbb rax, QWORD PTR [r10+-136] + mov r8, QWORD PTR [rcx+-128] + mov QWORD PTR [rcx+-136], rax + sbb r8, QWORD PTR [r10+-128] + mov rax, QWORD PTR [rcx+-120] + mov QWORD PTR [rcx+-128], r8 + sbb rax, QWORD PTR [r10+-120] + mov r8, QWORD PTR [rcx+-112] + mov QWORD PTR [rcx+-120], rax + sbb r8, QWORD PTR [r10+-112] + mov rax, QWORD PTR [rcx+-104] + mov QWORD PTR [rcx+-112], r8 + sbb rax, QWORD PTR [r10+-104] + mov r8, QWORD PTR [rcx+-96] + mov QWORD PTR [rcx+-104], rax + sbb r8, QWORD PTR [r10+-96] + mov rax, QWORD PTR [rcx+-88] + mov QWORD PTR [rcx+-96], r8 + sbb rax, QWORD PTR [r10+-88] + mov r8, QWORD PTR [rcx+-80] + mov QWORD PTR [rcx+-88], rax + sbb r8, QWORD PTR [r10+-80] + mov rax, QWORD PTR [rcx+-72] + mov QWORD PTR [rcx+-80], r8 + sbb rax, QWORD PTR [r10+-72] + mov r8, QWORD PTR [rcx+-64] + mov QWORD PTR [rcx+-72], rax + sbb r8, QWORD PTR [r10+-64] + mov rax, QWORD PTR [rcx+-56] + mov QWORD PTR [rcx+-64], r8 + sbb rax, QWORD PTR [r10+-56] + mov r8, QWORD PTR [rcx+-48] + mov QWORD PTR [rcx+-56], rax + sbb r8, QWORD PTR [r10+-48] + mov rax, QWORD PTR [rcx+-40] + mov QWORD PTR [rcx+-48], r8 + sbb rax, QWORD PTR [r10+-40] + mov r8, QWORD PTR [rcx+-32] + mov QWORD PTR [rcx+-40], rax + sbb r8, 
QWORD PTR [r10+-32] + mov rax, QWORD PTR [rcx+-24] + mov QWORD PTR [rcx+-32], r8 + sbb rax, QWORD PTR [r10+-24] + mov r8, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-24], rax + sbb r8, QWORD PTR [r10+-16] + mov rax, QWORD PTR [rcx+-8] + mov QWORD PTR [rcx+-16], r8 + sbb rax, QWORD PTR [r10+-8] + mov r8, QWORD PTR [rcx] + mov QWORD PTR [rcx+-8], rax + sbb r8, QWORD PTR [r10] + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 + sbb rax, QWORD PTR [r10+8] + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax + sbb r8, QWORD PTR [r10+16] + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 + sbb rax, QWORD PTR [r10+24] + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax + sbb r8, QWORD PTR [r10+32] + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 + sbb rax, QWORD PTR [r10+40] + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax + sbb r8, QWORD PTR [r10+48] + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 + sbb rax, QWORD PTR [r10+56] + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax + sbb r8, QWORD PTR [r10+64] + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 + sbb rax, QWORD PTR [r10+72] + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax + sbb r8, QWORD PTR [r10+80] + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 + sbb rax, QWORD PTR [r10+88] + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax + sbb r8, QWORD PTR [r10+96] + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 + sbb rax, QWORD PTR [r10+104] + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax + sbb r8, QWORD PTR [r10+112] + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 + sbb rax, QWORD PTR [r10+120] + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax + sbb r8, QWORD PTR [r10+128] + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 + sbb rax, QWORD PTR [r10+136] + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax + sbb r8, QWORD PTR [r10+144] + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 + sbb rax, QWORD PTR [r10+152] + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax + sbb r8, QWORD PTR [r10+160] + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 + sbb rax, QWORD PTR [r10+168] + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax + sbb r8, QWORD PTR [r10+176] + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 + sbb rax, QWORD PTR [r10+184] + mov r8, QWORD PTR [rcx+192] + mov QWORD PTR [rcx+184], rax + sbb r8, QWORD PTR [r10+192] + mov rax, QWORD PTR [rcx+200] + mov QWORD PTR [rcx+192], r8 + sbb rax, QWORD PTR [r10+200] + mov r8, QWORD PTR [rcx+208] + mov QWORD PTR [rcx+200], rax + sbb r8, QWORD PTR [r10+208] + mov rax, QWORD PTR [rcx+216] + mov QWORD PTR [rcx+208], r8 + sbb rax, QWORD PTR [r10+216] + mov r8, QWORD PTR [rcx+224] + mov QWORD PTR [rcx+216], rax + sbb r8, QWORD PTR [r10+224] + mov rax, QWORD PTR [rcx+232] + mov QWORD PTR [rcx+224], r8 + sbb rax, QWORD PTR [r10+232] + mov r8, QWORD PTR [rcx+240] + mov QWORD PTR [rcx+232], rax + sbb r8, QWORD PTR [r10+240] + mov rax, QWORD PTR [rcx+248] + mov QWORD PTR [rcx+240], r8 + sbb rax, QWORD PTR [r10+248] + mov QWORD PTR [rcx+248], rax + sbb r9, 0 + mov rcx, QWORD PTR [rsp+512] + add rcx, 768 + ; Add in word + mov r8, QWORD PTR [rcx] + add r8, r9 + mov rax, QWORD PTR [rcx+8] + mov QWORD PTR [rcx], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+272] - mov QWORD PTR [rcx+776], rax + mov r8, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+8], rax adc r8, 0 - mov rax, QWORD PTR [rdx+280] - mov QWORD PTR 
[rcx+784], r8 + mov rax, QWORD PTR [rcx+24] + mov QWORD PTR [rcx+16], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+288] - mov QWORD PTR [rcx+792], rax + mov r8, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+24], rax adc r8, 0 - mov rax, QWORD PTR [rdx+296] - mov QWORD PTR [rcx+800], r8 + mov rax, QWORD PTR [rcx+40] + mov QWORD PTR [rcx+32], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+304] - mov QWORD PTR [rcx+808], rax + mov r8, QWORD PTR [rcx+48] + mov QWORD PTR [rcx+40], rax adc r8, 0 - mov rax, QWORD PTR [rdx+312] - mov QWORD PTR [rcx+816], r8 + mov rax, QWORD PTR [rcx+56] + mov QWORD PTR [rcx+48], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+320] - mov QWORD PTR [rcx+824], rax + mov r8, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+56], rax adc r8, 0 - mov rax, QWORD PTR [rdx+328] - mov QWORD PTR [rcx+832], r8 + mov rax, QWORD PTR [rcx+72] + mov QWORD PTR [rcx+64], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+336] - mov QWORD PTR [rcx+840], rax + mov r8, QWORD PTR [rcx+80] + mov QWORD PTR [rcx+72], rax adc r8, 0 - mov rax, QWORD PTR [rdx+344] - mov QWORD PTR [rcx+848], r8 + mov rax, QWORD PTR [rcx+88] + mov QWORD PTR [rcx+80], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+352] - mov QWORD PTR [rcx+856], rax + mov r8, QWORD PTR [rcx+96] + mov QWORD PTR [rcx+88], rax adc r8, 0 - mov rax, QWORD PTR [rdx+360] - mov QWORD PTR [rcx+864], r8 + mov rax, QWORD PTR [rcx+104] + mov QWORD PTR [rcx+96], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+368] - mov QWORD PTR [rcx+872], rax + mov r8, QWORD PTR [rcx+112] + mov QWORD PTR [rcx+104], rax adc r8, 0 - mov rax, QWORD PTR [rdx+376] - mov QWORD PTR [rcx+880], r8 + mov rax, QWORD PTR [rcx+120] + mov QWORD PTR [rcx+112], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+384] - mov QWORD PTR [rcx+888], rax + mov r8, QWORD PTR [rcx+128] + mov QWORD PTR [rcx+120], rax adc r8, 0 - mov rax, QWORD PTR [rdx+392] - mov QWORD PTR [rcx+896], r8 + mov rax, QWORD PTR [rcx+136] + mov QWORD PTR [rcx+128], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+400] - mov QWORD PTR [rcx+904], rax + mov r8, QWORD PTR [rcx+144] + mov QWORD PTR [rcx+136], rax adc r8, 0 - mov rax, QWORD PTR [rdx+408] - mov QWORD PTR [rcx+912], r8 + mov rax, QWORD PTR [rcx+152] + mov QWORD PTR [rcx+144], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+416] - mov QWORD PTR [rcx+920], rax + mov r8, QWORD PTR [rcx+160] + mov QWORD PTR [rcx+152], rax adc r8, 0 - mov rax, QWORD PTR [rdx+424] - mov QWORD PTR [rcx+928], r8 + mov rax, QWORD PTR [rcx+168] + mov QWORD PTR [rcx+160], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+432] - mov QWORD PTR [rcx+936], rax + mov r8, QWORD PTR [rcx+176] + mov QWORD PTR [rcx+168], rax adc r8, 0 - mov rax, QWORD PTR [rdx+440] - mov QWORD PTR [rcx+944], r8 + mov rax, QWORD PTR [rcx+184] + mov QWORD PTR [rcx+176], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+448] - mov QWORD PTR [rcx+952], rax + mov r8, QWORD PTR [rcx+192] + mov QWORD PTR [rcx+184], rax adc r8, 0 - mov rax, QWORD PTR [rdx+456] - mov QWORD PTR [rcx+960], r8 + mov rax, QWORD PTR [rcx+200] + mov QWORD PTR [rcx+192], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+464] - mov QWORD PTR [rcx+968], rax + mov r8, QWORD PTR [rcx+208] + mov QWORD PTR [rcx+200], rax adc r8, 0 - mov rax, QWORD PTR [rdx+472] - mov QWORD PTR [rcx+976], r8 + mov rax, QWORD PTR [rcx+216] + mov QWORD PTR [rcx+208], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+480] - mov QWORD PTR [rcx+984], rax + mov r8, QWORD PTR [rcx+224] + mov QWORD PTR [rcx+216], rax adc r8, 0 - mov rax, QWORD PTR [rdx+488] - mov QWORD PTR [rcx+992], r8 + mov rax, QWORD PTR [rcx+232] + mov QWORD PTR [rcx+224], r8 adc rax, 0 - mov r8, QWORD PTR [rdx+496] - mov QWORD 
PTR [rcx+1000], rax + mov r8, QWORD PTR [rcx+240] + mov QWORD PTR [rcx+232], rax adc r8, 0 - mov rax, QWORD PTR [rdx+504] - mov QWORD PTR [rcx+1008], r8 + mov rax, QWORD PTR [rcx+248] + mov QWORD PTR [rcx+240], r8 adc rax, 0 - mov QWORD PTR [rcx+1016], rax - add rsp, 1304 - pop r12 + mov QWORD PTR [rcx+248], rax + mov rdx, QWORD PTR [rsp+520] + mov rcx, QWORD PTR [rsp+512] + add rsp, 528 ret sp_4096_sqr_avx2_64 ENDP _text ENDS @@ -33537,7 +46450,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_4096_cond_sub_64 PROC sub rsp, 512 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -33986,7 +46898,7 @@ sp_4096_cond_sub_64 PROC sbb r11, r8 mov QWORD PTR [rcx+496], r10 mov QWORD PTR [rcx+504], r11 - sbb rax, 0 + sbb rax, rax add rsp, 512 ret sp_4096_cond_sub_64 ENDP @@ -34687,7 +47599,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_4096_sub_64 PROC mov r9, QWORD PTR [rdx] - xor rax, rax sub r9, QWORD PTR [r8] mov r10, QWORD PTR [rdx+8] mov QWORD PTR [rcx], r9 @@ -34879,7 +47790,7 @@ sp_4096_sub_64 PROC mov QWORD PTR [rcx+496], r9 sbb r10, QWORD PTR [r8+504] mov QWORD PTR [rcx+504], r10 - sbb rax, 0 + sbb rax, rax ret sp_4096_sub_64 ENDP _text ENDS @@ -35316,7 +48227,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_4096_cond_sub_avx2_64 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -35637,7 +48547,7 @@ sp_4096_cond_sub_avx2_64 PROC mov QWORD PTR [rcx+496], r12 sbb r10, r11 mov QWORD PTR [rcx+504], r10 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_4096_cond_sub_avx2_64 ENDP @@ -36174,6 +49084,2402 @@ sp_4096_cmp_64 PROC ret sp_4096_cmp_64 ENDP _text ENDS +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_4096_get_from_table_64 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + pxor xmm13, xmm13 + pshufd xmm11, xmm11, 0 + pshufd xmm10, xmm10, 0 + ; START: 0-7 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand 
xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + 
mov r9, QWORD PTR [rdx+120] + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 0-7 + ; START: 8-15 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + 
movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 64 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 8-15 + ; START: 16-23 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 
+ por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + 
por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 16-23 + ; START: 24-31 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 192 + movdqu xmm12, 
xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 192 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 192 + movdqu 
xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 24-31 + ; START: 32-39 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand 
xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 256 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 32-39 + ; START: 40-47 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + 
por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + 
por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 320 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 40-47 + ; START: 48-55 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd 
xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 384 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 384 + movdqu xmm12, xmm13 + 
pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + add rcx, 64 + ; END: 48-55 + ; START: 56-63 + pxor xmm13, xmm13 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + 
pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 448 + movdqu xmm12, xmm13 + pcmpeqd xmm12, xmm10 + movdqu xmm0, [r9] + movdqu xmm1, [r9+16] + movdqu xmm2, [r9+32] + movdqu xmm3, [r9+48] + pand xmm0, xmm12 + pand xmm1, xmm12 + pand xmm2, xmm12 + pand xmm3, xmm12 + por xmm4, xmm0 + por xmm5, xmm1 + por xmm6, xmm2 + por xmm7, xmm3 + paddd xmm13, xmm11 + movdqu [rcx], xmm4 + movdqu [rcx+16], xmm5 + movdqu [rcx+32], xmm6 + movdqu [rcx+48], xmm7 + ; END: 56-63 + ret +sp_4096_get_from_table_64 ENDP +_text ENDS +ENDIF IFDEF HAVE_INTEL_AVX2 ; /* Reduce the number back to 4096 bits using Montgomery reduction. 
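Note: the sp_4096_get_from_table_64 procedure that ends here (and its AVX2 counterpart added further below) is a cache-attack-resistant table lookup guarded by WC_NO_CACHE_RESISTANT: every one of the 16 table entries is loaded, masked against a comparison of a running counter with the secret index, and OR-ed into the result, so the memory access pattern does not depend on the index. A minimal C sketch of the same selection pattern is shown below; the type alias, function name, and prototype are illustrative assumptions, not the generated SP API.

    #include <stdint.h>

    typedef uint64_t sp_digit;  /* assumed 64-bit digit, as in this file */

    /* Select table[idx] into r without an index-dependent access pattern:
     * every entry is read, masked, and OR-ed in.  For the 4096-bit case
     * above, entries == 16 and digits == 64. */
    static void ct_table_select(sp_digit* r, const sp_digit* const* table,
                                int entries, int digits, int idx)
    {
        int i, j;
        for (j = 0; j < digits; j++)
            r[j] = 0;
        for (i = 0; i < entries; i++) {
            /* mask is all-ones only for the wanted entry, else zero */
            sp_digit mask = (sp_digit)0 - (sp_digit)(i == idx);
            for (j = 0; j < digits; j++)
                r[j] |= table[i][j] & mask;
        }
    }

The assembly fully unrolls this and walks each 4096-bit entry in 64-byte slices, which is why the same masked-load/OR sequence repeats for every ENTRY and every START/END slice.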
; * @@ -36933,6 +52239,1138 @@ L_4096_mont_reduce_avx2_64_loop: sp_4096_mont_reduce_avx2_64 ENDP _text ENDS ENDIF +IFNDEF WC_NO_CACHE_RESISTANT +_text SEGMENT READONLY PARA +sp_4096_get_from_table_avx2_64 PROC + mov rax, 1 + movd xmm10, r8 + movd xmm11, rax + vpxor ymm13, ymm13, ymm13 + vpermd ymm10, ymm13, ymm10 + vpermd ymm11, ymm13, ymm11 + ; START: 0-15 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, 
ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, 
ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 0-15 + ; START: 16-31 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov 
r9, QWORD PTR [rdx+48] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, 
ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 128 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 16-31 + ; START: 32-47 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 
+ vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, 
ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 256 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + add rcx, 128 + ; END: 32-47 + ; START: 48-63 + vpxor ymm13, ymm13, ymm13 + vpxor ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 + vpxor ymm6, ymm6, ymm6 + vpxor ymm7, ymm7, ymm7 + ; ENTRY: 0 + mov r9, QWORD PTR [rdx] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 1 + mov r9, QWORD PTR [rdx+8] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 2 + mov r9, QWORD PTR [rdx+16] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 3 + mov r9, QWORD PTR [rdx+24] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + 
vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 4 + mov r9, QWORD PTR [rdx+32] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 5 + mov r9, QWORD PTR [rdx+40] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 6 + mov r9, QWORD PTR [rdx+48] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 7 + mov r9, QWORD PTR [rdx+56] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 8 + mov r9, QWORD PTR [rdx+64] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 9 + mov r9, QWORD PTR [rdx+72] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 10 + mov r9, QWORD PTR [rdx+80] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 11 + mov r9, QWORD PTR [rdx+88] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + 
vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 12 + mov r9, QWORD PTR [rdx+96] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 13 + mov r9, QWORD PTR [rdx+104] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 14 + mov r9, QWORD PTR [rdx+112] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + ; ENTRY: 15 + mov r9, QWORD PTR [rdx+120] + add r9, 384 + vpcmpeqd ymm12, ymm13, ymm10 + vmovdqu ymm0, YMMWORD PTR [r9] + vmovdqu ymm1, YMMWORD PTR [r9+32] + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vpor ymm6, ymm6, ymm2 + vpor ymm7, ymm7, ymm3 + vpaddd ymm13, ymm13, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm4 + vmovdqu YMMWORD PTR [rcx+32], ymm5 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm7 + ; END: 48-63 + ret +sp_4096_get_from_table_avx2_64 ENDP +_text ENDS +ENDIF ; /* Conditionally add a and b using the mask m. ; * m is -1 to add and 0 when not. 
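The "conditionally add a and b using the mask m" comment above describes the masked add used after Montgomery reduction: m is all-ones (-1) to apply the addition and zero to leave a unchanged, so the same instruction sequence executes in both cases. A hedged C sketch of that behaviour follows; the name and prototype are illustrative only.

    #include <stdint.h>

    typedef uint64_t sp_digit;  /* assumed 64-bit digit */

    /* r = a + (b & m) with carry propagation; returns the carry out. */
    static sp_digit ct_cond_add(sp_digit* r, const sp_digit* a,
                                const sp_digit* b, int n, sp_digit m)
    {
        sp_digit carry = 0;
        int i;
        for (i = 0; i < n; i++) {
            sp_digit bm = b[i] & m;            /* b masked away when m == 0 */
            sp_digit t  = a[i] + bm;
            sp_digit c1 = (sp_digit)(t < a[i]); /* carry from first add */
            r[i]  = t + carry;
            carry = c1 | (sp_digit)(r[i] < t);  /* carry from either add */
        }
        return carry;
    }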
; * @@ -37708,92 +54146,87 @@ IFDEF HAVE_INTEL_AVX2 ; */ _text SEGMENT READONLY PARA sp_256_mul_avx2_4 PROC - push rbx + push rbp push r12 push r13 push r14 push r15 - push rbp push rdi push rsi + push rbx mov rbp, r8 - mov rdi, rdx - ; A[0] * B[0] - mov rdx, QWORD PTR [rbp] - mulx r9, r8, QWORD PTR [rdi] - ; A[2] * B[0] - mulx r11, r10, QWORD PTR [rdi+16] - ; A[1] * B[0] - mulx rsi, rax, QWORD PTR [rdi+8] - xor r15, r15 - adcx r9, rax - ; A[1] * B[3] - mov rdx, QWORD PTR [rbp+24] - mulx r13, r12, QWORD PTR [rdi+8] - adcx r10, rsi - ; A[0] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rsi, rax, QWORD PTR [rdi] - adox r9, rax - ; A[2] * B[1] - mulx r14, rax, QWORD PTR [rdi+16] + mov rax, rdx + mov rdx, QWORD PTR [rax] + ; A[0] * B[0] + mulx r9, r8, QWORD PTR [rbp] + xor rbx, rbx + ; A[0] * B[1] + mulx r10, rdi, QWORD PTR [rbp+8] + adcx r9, rdi + ; A[0] * B[2] + mulx r11, rdi, QWORD PTR [rbp+16] + adcx r10, rdi + ; A[0] * B[3] + mulx r12, rdi, QWORD PTR [rbp+24] + adcx r11, rdi + mov rdx, QWORD PTR [rax+8] + adcx r12, rbx + ; A[1] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r9, rdi + ; A[1] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] adox r10, rsi - adcx r11, rax - ; A[1] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rsi, rax, QWORD PTR [rdi+8] - adcx r12, r14 - adox r11, rax - adcx r13, r15 + adcx r10, rdi + ; A[1] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r11, r15 + adcx r11, rdi + ; A[1] * B[3] + mulx r13, rdi, QWORD PTR [rbp+24] adox r12, rsi - ; A[0] * B[2] - mulx rsi, rax, QWORD PTR [rdi] + adcx r12, rdi + adox r13, rbx + mov rdx, QWORD PTR [rax+16] + adcx r13, rbx + ; A[2] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r10, rdi + ; A[2] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r11, rsi + adcx r11, rdi + ; A[2] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r12, r15 + adcx r12, rdi + ; A[2] * B[3] + mulx r14, rdi, QWORD PTR [rbp+24] + adox r13, rsi + adcx r13, rdi + adox r14, rbx + mov rdx, QWORD PTR [rax+24] + adcx r14, rbx + ; A[3] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r11, rdi + ; A[3] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r12, rsi + adcx r12, rdi + ; A[3] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] adox r13, r15 - xor r14, r14 - adcx r10, rax - ; A[1] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rax, rdx, QWORD PTR [rdi+8] - adcx r11, rsi - adox r10, rdx - ; A[3] * B[1] - mov rdx, QWORD PTR [rbp+8] - adox r11, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adcx r12, rax - ; A[2] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rax, rdx, QWORD PTR [rdi+16] - adcx r13, rsi - adox r12, rdx - ; A[3] * B[3] - mov rdx, QWORD PTR [rbp+24] - adox r13, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adox r14, r15 - adcx r14, rax - ; A[0] * B[3] - mulx rax, rdx, QWORD PTR [rdi] - adcx r15, rsi - xor rsi, rsi - adcx r11, rdx - ; A[3] * B[0] - mov rdx, QWORD PTR [rdi+24] - adcx r12, rax - mulx rax, rbx, QWORD PTR [rbp] - adox r11, rbx - adox r12, rax - ; A[3] * B[2] - mulx rax, rdx, QWORD PTR [rbp+16] - adcx r13, rdx - ; A[2] * B[3] - mov rdx, QWORD PTR [rbp+24] - adcx r14, rax - mulx rdx, rax, QWORD PTR [rdi+16] - adcx r15, rsi - adox r13, rax - adox r14, rdx - adox r15, rsi + adcx r13, rdi + ; A[3] * B[3] + mulx r15, rdi, QWORD PTR [rbp+24] + adox r14, rsi + adcx r14, rdi + adox r15, rbx + adcx r15, rbx mov QWORD PTR [rcx], r8 mov QWORD PTR [rcx+8], r9 mov QWORD PTR [rcx+16], r10 @@ -37802,14 +54235,14 @@ sp_256_mul_avx2_4 PROC mov QWORD PTR [rcx+40], r13 mov QWORD PTR [rcx+48], r14 mov QWORD PTR [rcx+56], r15 + pop rbx pop rsi pop 
rdi - pop rbp pop r15 pop r14 pop r13 pop r12 - pop rbx + pop rbp ret sp_256_mul_avx2_4 ENDP _text ENDS @@ -37946,31 +54379,34 @@ sp_256_sqr_avx2_4 PROC push rsi push rbx mov rax, rdx - ; A[0] * A[1] + xor r8, r8 mov rdx, QWORD PTR [rax] - mov r15, QWORD PTR [rax+16] - mulx r10, r9, QWORD PTR [rax+8] + mov rsi, QWORD PTR [rax+8] + mov rbx, QWORD PTR [rax+16] + mov r15, QWORD PTR [rax+24] + ; A[0] * A[1] + mulx r10, r9, rsi + ; A[0] * A[2] + mulx r11, r8, rbx + adox r10, r8 ; A[0] * A[3] - mulx r12, r11, QWORD PTR [rax+24] - ; A[2] * A[1] + mulx r12, r8, r15 + mov rdx, rsi + adox r11, r8 + ; A[1] * A[2] + mulx rdi, r8, rbx mov rdx, r15 - mulx rbx, rsi, QWORD PTR [rax+8] - ; A[2] * A[3] - mulx r14, r13, QWORD PTR [rax+24] - xor r15, r15 - adox r11, rsi - adox r12, rbx - ; A[2] * A[0] - mulx rbx, rsi, QWORD PTR [rax] + adcx r11, r8 ; A[1] * A[3] - mov rdx, QWORD PTR [rax+8] + mulx r13, r8, rsi + mov r15, 0 + adox r12, rdi + adcx r12, r8 + ; A[2] * A[3] + mulx r14, r8, rbx adox r13, r15 - mulx r8, rdi, QWORD PTR [rax+24] - adcx r10, rsi - adox r14, r15 - adcx r11, rbx - adcx r12, rdi adcx r13, r8 + adox r14, r15 adcx r14, r15 ; Double with Carry Flag xor r15, r15 @@ -38027,21 +54463,22 @@ ENDIF ; */ _text SEGMENT READONLY PARA sp_256_add_4 PROC - ; Add - mov r9, QWORD PTR [rdx] + push r12 xor rax, rax - add r9, QWORD PTR [r8] + mov r9, QWORD PTR [rdx] mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 + mov r11, QWORD PTR [rdx+16] + mov r12, QWORD PTR [rdx+24] + add r9, QWORD PTR [r8] adc r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] + adc r11, QWORD PTR [r8+16] + adc r12, QWORD PTR [r8+24] + mov QWORD PTR [rcx], r9 mov QWORD PTR [rcx+8], r10 - adc r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - adc r10, QWORD PTR [r8+24] - mov QWORD PTR [rcx+24], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+24], r12 adc rax, 0 + pop r12 ret sp_256_add_4 ENDP _text ENDS @@ -38067,7 +54504,7 @@ sp_256_sub_4 PROC mov QWORD PTR [rcx+8], r10 mov QWORD PTR [rcx+16], r11 mov QWORD PTR [rcx+24], r12 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_256_sub_4 ENDP @@ -38220,52 +54657,44 @@ sp_256_mont_mul_4 PROC ; Start Reduction ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 ; - a[0] << 32 << 192 - ; + (a[0] * 2) << 192 + ; a[0]-a[3] + (a[0] * 2) << 192 mov rax, r11 - mov rdx, r14 - add rdx, r11 + lea rdx, QWORD PTR [r14+r11] mov r10, r12 - add rdx, r11 mov r8, r13 + mov r9, r13 ; a[0]-a[2] << 32 shl r11, 32 - shld r13, r10, 32 + shld r9, r10, 32 shld r12, rax, 32 ; - a[0] << 32 << 192 sub rdx, r11 ; + a[0]-a[2] << 32 << 64 add r10, r11 adc r8, r12 - adc rdx, r13 + adc rdx, r9 ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xor r9, r9 ; a += mu << 256 - xor r11, r11 add r15, rax adc rdi, r10 adc rsi, r8 adc rbx, rdx - sbb r11, 0 + sbb r11, r11 ; a += mu << 192 add r14, rax adc r15, r10 + mov r12, r10 adc rdi, r8 adc rsi, rdx adc rbx, 0 sbb r11, 0 ; mu <<= 32 - mov r9, rdx + shld r9, rdx, 32 shld rdx, r8, 32 shld r8, r10, 32 shld r10, rax, 32 - shr r9, 32 shl rax, 32 - ; a += (mu << 32) << 64 - add r14, r8 - adc r15, rdx - adc rdi, r9 - adc rsi, 0 - adc rbx, 0 - sbb r11, 0 ; a -= (mu << 32) << 192 sub r14, rax sbb r15, r10 @@ -38273,19 +54702,28 @@ sp_256_mont_mul_4 PROC sbb rsi, rdx sbb rbx, r9 adc r11, 0 - mov rax, 4294967295 + ; a += (mu << 32) << 64 + sub r12, rax + adc r13, r10 + adc r14, r8 + adc r15, rdx + adc rdi, r9 + adc rsi, 0 + adc rbx, 0 + sbb r11, 0 mov r10, 18446744069414584321 + mov rax, r11 ; mask m and sub from result if 
overflow ; m[0] = -1 & mask = mask - and rax, r11 + shr rax, 32 ; m[2] = 0 & mask = 0 and r10, r11 sub r15, r11 sbb rdi, rax - sbb rsi, 0 - sbb rbx, r10 mov QWORD PTR [rcx], r15 + sbb rsi, 0 mov QWORD PTR [rcx+8], rdi + sbb rbx, r10 mov QWORD PTR [rcx+16], rsi mov QWORD PTR [rcx+24], rbx pop rbx @@ -38395,52 +54833,44 @@ sp_256_mont_sqr_4 PROC ; Start Reduction ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 ; - a[0] << 32 << 192 - ; + (a[0] * 2) << 192 + ; a[0]-a[3] + (a[0] * 2) << 192 mov rax, r10 - mov rdx, r13 - add rdx, r10 + lea rdx, QWORD PTR [r13+r10] mov r8, r11 - add rdx, r10 mov rbx, r12 + mov r9, r12 ; a[0]-a[2] << 32 shl r10, 32 - shld r12, r8, 32 + shld r9, r8, 32 shld r11, rax, 32 ; - a[0] << 32 << 192 sub rdx, r10 ; + a[0]-a[2] << 32 << 64 add r8, r10 adc rbx, r11 - adc rdx, r12 + adc rdx, r9 ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xor r9, r9 ; a += mu << 256 - xor r10, r10 add r14, rax adc r15, r8 adc rdi, rbx adc rsi, rdx - sbb r10, 0 + sbb r10, r10 ; a += mu << 192 add r13, rax adc r14, r8 + mov r11, r8 adc r15, rbx adc rdi, rdx adc rsi, 0 sbb r10, 0 ; mu <<= 32 - mov r9, rdx + shld r9, rdx, 32 shld rdx, rbx, 32 shld rbx, r8, 32 shld r8, rax, 32 - shr r9, 32 shl rax, 32 - ; a += (mu << 32) << 64 - add r13, rbx - adc r14, rdx - adc r15, r9 - adc rdi, 0 - adc rsi, 0 - sbb r10, 0 ; a -= (mu << 32) << 192 sub r13, rax sbb r14, r8 @@ -38448,19 +54878,28 @@ sp_256_mont_sqr_4 PROC sbb rdi, rdx sbb rsi, r9 adc r10, 0 - mov rax, 4294967295 + ; a += (mu << 32) << 64 + sub r11, rax + adc r12, r8 + adc r13, rbx + adc r14, rdx + adc r15, r9 + adc rdi, 0 + adc rsi, 0 + sbb r10, 0 mov r8, 18446744069414584321 + mov rax, r10 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - and rax, r10 + shr rax, 32 ; m[2] = 0 & mask = 0 and r8, r10 sub r14, r10 sbb r15, rax - sbb rdi, 0 - sbb rsi, r8 mov QWORD PTR [rcx], r14 + sbb rdi, 0 mov QWORD PTR [rcx+8], r15 + sbb rsi, r8 mov QWORD PTR [rcx+16], rdi mov QWORD PTR [rcx+24], rsi pop rbx @@ -38540,7 +54979,6 @@ sp_256_cond_sub_4 PROC push r15 push rdi push rsi - mov rax, 0 mov r14, QWORD PTR [r8] mov r15, QWORD PTR [r8+8] mov rdi, QWORD PTR [r8+16] @@ -38561,7 +54999,7 @@ sp_256_cond_sub_4 PROC mov QWORD PTR [rcx+8], r11 mov QWORD PTR [rcx+16], r12 mov QWORD PTR [rcx+24], r13 - sbb rax, 0 + sbb rax, rax pop rsi pop rdi pop r15 @@ -38579,6 +55017,112 @@ _text ENDS ; */ _text SEGMENT READONLY PARA sp_256_mont_reduce_4 PROC + push rbx + push rsi + push r12 + push r13 + push r14 + push r15 + push rdi + mov r8, rcx + mov r9, QWORD PTR [r8] + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + mov r12, QWORD PTR [r8+24] + mov r13, QWORD PTR [r8+32] + mov r14, QWORD PTR [r8+40] + mov r15, QWORD PTR [r8+48] + mov rdi, QWORD PTR [r8+56] + ; Start Reduction + ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 + ; - a[0] << 32 << 192 + ; a[0]-a[3] + (a[0] * 2) << 192 + mov rax, r9 + lea rdx, QWORD PTR [r12+r9] + mov rbx, r10 + mov rcx, r11 + mov rsi, r11 + ; a[0]-a[2] << 32 + shl r9, 32 + shld rsi, rbx, 32 + shld r10, rax, 32 + ; - a[0] << 32 << 192 + sub rdx, r9 + ; + a[0]-a[2] << 32 << 64 + add rbx, r9 + adc rcx, r10 + adc rdx, rsi + ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xor rsi, rsi + ; a += mu << 256 + add r13, rax + adc r14, rbx + adc r15, rcx + adc rdi, rdx + sbb r9, r9 + ; a += mu << 192 + add r12, rax + adc r13, rbx + mov r10, rbx + adc r14, rcx + adc r15, rdx + adc rdi, 0 + sbb r9, 0 + ; mu <<= 32 + shld rsi, rdx, 32 + shld rdx, rcx, 32 + shld rcx, 
rbx, 32 + shld rbx, rax, 32 + shl rax, 32 + ; a -= (mu << 32) << 192 + sub r12, rax + sbb r13, rbx + sbb r14, rcx + sbb r15, rdx + sbb rdi, rsi + adc r9, 0 + ; a += (mu << 32) << 64 + sub r10, rax + adc r11, rbx + adc r12, rcx + adc r13, rdx + adc r14, rsi + adc r15, 0 + adc rdi, 0 + sbb r9, 0 + mov rbx, 18446744069414584321 + mov rax, r9 + ; mask m and sub from result if overflow + ; m[0] = -1 & mask = mask + shr rax, 32 + ; m[2] = 0 & mask = 0 + and rbx, r9 + sub r13, r9 + sbb r14, rax + mov QWORD PTR [r8], r13 + sbb r15, 0 + mov QWORD PTR [r8+8], r14 + sbb rdi, rbx + mov QWORD PTR [r8+16], r15 + mov QWORD PTR [r8+24], rdi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rbx + ret +sp_256_mont_reduce_4 ENDP +_text ENDS +; /* Reduce the number back to 256 bits using Montgomery reduction. +; * +; * a A single precision number to reduce in place. +; * m The single precision number representing the modulus. +; * mp The digit representing the negative inverse of m mod 2^n. +; */ +_text SEGMENT READONLY PARA +sp_256_mont_reduce_order_4 PROC push r12 push r13 push r14 @@ -38670,7 +55214,7 @@ L_mont_loop_4: pop r13 pop r12 ret -sp_256_mont_reduce_4 ENDP +sp_256_mont_reduce_order_4 ENDP _text ENDS ; /* Add two Montgomery form numbers (r = a + b % m). ; * @@ -38687,14 +55231,13 @@ sp_256_mont_add_4 PROC mov r9, QWORD PTR [rdx+8] mov r10, QWORD PTR [rdx+16] mov r11, QWORD PTR [rdx+24] - mov r12, 4294967295 - mov r13, 18446744069414584321 add rax, QWORD PTR [r8] + mov r12, 4294967295 adc r9, QWORD PTR [r8+8] + mov r13, 18446744069414584321 adc r10, QWORD PTR [r8+16] - mov rdx, 0 adc r11, QWORD PTR [r8+24] - sbb rdx, 0 + sbb rdx, rdx and r12, rdx and r13, rdx sub rax, rdx @@ -38731,14 +55274,13 @@ sp_256_mont_dbl_4 PROC mov r8, QWORD PTR [rdx+8] mov r9, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+24] - mov r11, 4294967295 - mov r12, 18446744069414584321 add rax, rax + mov r11, 4294967295 adc r8, r8 + mov r12, 18446744069414584321 adc r9, r9 - mov r13, 0 adc r10, r10 - sbb r13, 0 + sbb r13, r13 and r11, r13 and r12, r13 sub rax, r13 @@ -38775,14 +55317,13 @@ sp_256_mont_tpl_4 PROC mov r8, QWORD PTR [rdx+8] mov r9, QWORD PTR [rdx+16] mov r10, QWORD PTR [rdx+24] - mov r11, 4294967295 - mov r12, 18446744069414584321 add rax, rax + mov r11, 4294967295 adc r8, r8 + mov r12, 18446744069414584321 adc r9, r9 - mov r13, 0 adc r10, r10 - sbb r13, 0 + sbb r13, r13 and r11, r13 and r12, r13 sub rax, r13 @@ -38796,14 +55337,13 @@ sp_256_mont_tpl_4 PROC sbb r8, r11 sbb r9, 0 sbb r10, r12 - mov r11, 4294967295 - mov r12, 18446744069414584321 add rax, QWORD PTR [rdx] + mov r11, 4294967295 adc r8, QWORD PTR [rdx+8] + mov r12, 18446744069414584321 adc r9, QWORD PTR [rdx+16] - mov r13, 0 adc r10, QWORD PTR [rdx+24] - sbb r13, 0 + sbb r13, r13 and r11, r13 and r12, r13 sub rax, r13 @@ -38841,14 +55381,13 @@ sp_256_mont_sub_4 PROC mov r9, QWORD PTR [rdx+8] mov r10, QWORD PTR [rdx+16] mov r11, QWORD PTR [rdx+24] - mov r12, 4294967295 - mov r13, 18446744069414584321 sub rax, QWORD PTR [r8] + mov r12, 4294967295 sbb r9, QWORD PTR [r8+8] + mov r13, 18446744069414584321 sbb r10, QWORD PTR [r8+16] - mov rdx, 0 sbb r11, QWORD PTR [r8+24] - sbb rdx, 0 + sbb rdx, rdx and r12, rdx and r13, rdx add rax, rdx @@ -38871,6 +55410,45 @@ sp_256_mont_sub_4 PROC ret sp_256_mont_sub_4 ENDP _text ENDS +; /* Subtract two Montgomery form numbers (r = a - b % m). +; * +; * b is less than the modulus. +; * +; * r Result of subtration. +; * a Number to subtract from in Montgomery form. 
+; * b Number to subtract with in Montgomery form. +; * m Modulus (prime). +; */ +_text SEGMENT READONLY PARA +sp_256_mont_sub_lower_4 PROC + push r12 + push r13 + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + sub rax, QWORD PTR [r8] + mov r12, 4294967295 + sbb r9, QWORD PTR [r8+8] + mov r13, 18446744069414584321 + sbb r10, QWORD PTR [r8+16] + sbb r11, QWORD PTR [r8+24] + sbb rdx, rdx + and r12, rdx + and r13, rdx + add rax, rdx + adc r9, r12 + mov QWORD PTR [rcx], rax + adc r10, 0 + mov QWORD PTR [rcx+8], r9 + adc r11, r13 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop r13 + pop r12 + ret +sp_256_mont_sub_lower_4 ENDP +_text ENDS ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) ; * ; * r Result of division by 2. @@ -38911,6 +55489,193 @@ sp_256_div2_4 PROC ret sp_256_div2_4 ENDP _text ENDS +; /* Triple a Montgomery form number (r = a + a + a % m). +; * +; * a is less than m. +; * +; * r Result of Tripling. +; * a Number to triple in Montgomery form. +; * m Modulus (prime). +; */ +_text SEGMENT READONLY PARA +sp_256_mont_tpl_lower_4 PROC + push r12 + push r13 + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + add rax, rax + mov r11, 4294967295 + adc r8, r8 + mov r12, 18446744069414584321 + adc r9, r9 + adc r10, r10 + sbb r13, r13 + and r11, r13 + and r12, r13 + sub rax, r13 + sbb r8, r11 + sbb r9, 0 + sbb r10, r12 + add rax, QWORD PTR [rdx] + mov r11, 4294967295 + adc r8, QWORD PTR [rdx+8] + mov r12, 18446744069414584321 + adc r9, QWORD PTR [rdx+16] + adc r10, QWORD PTR [rdx+24] + sbb r13, r13 + and r11, r13 + and r12, r13 + sub rax, r13 + sbb r8, r11 + sbb r9, 0 + sbb r10, r12 + adc r13, 0 + and r11, r13 + and r12, r13 + sub rax, r13 + sbb r8, r11 + mov QWORD PTR [rcx], rax + sbb r9, 0 + mov QWORD PTR [rcx+8], r8 + sbb r10, r12 + mov QWORD PTR [rcx+16], r9 + mov QWORD PTR [rcx+24], r10 + pop r13 + pop r12 + ret +sp_256_mont_tpl_lower_4 ENDP +_text ENDS +; /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m). +; * +; * r Result of subtration. +; * a Number to subtract from in Montgomery form. +; * b Number to double and subtract with in Montgomery form. +; * m Modulus (prime). 
+; */ +_text SEGMENT READONLY PARA +sp_256_mont_sub_dbl_4 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + add r12, r12 + mov rdi, 4294967295 + adc r13, r13 + mov rsi, 18446744069414584321 + adc r14, r14 + adc r15, r15 + sbb r8, r8 + and rdi, r8 + and rsi, r8 + sub r12, r8 + sbb r13, rdi + sbb r14, 0 + sbb r15, rsi + adc r8, 0 + and rdi, r8 + and rsi, r8 + sub r12, r8 + sbb r13, rdi + sbb r14, 0 + sbb r15, rsi + sub rax, r12 + mov rdi, 4294967295 + sbb r9, r13 + mov rsi, 18446744069414584321 + sbb r10, r14 + sbb r11, r15 + sbb r8, r8 + and rdi, r8 + and rsi, r8 + add rax, r8 + adc r9, rdi + adc r10, 0 + adc r11, rsi + adc r8, 0 + and rdi, r8 + and rsi, r8 + add rax, r8 + adc r9, rdi + mov QWORD PTR [rcx], rax + adc r10, 0 + mov QWORD PTR [rcx+8], r9 + adc r11, rsi + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_256_mont_sub_dbl_4 ENDP +_text ENDS +; /* Two Montgomery numbers, subtract second from first and double. +; * (r = 2.(a - b) % m). +; * +; * b must have came from a mont_sub operation. +; * +; * r Result of subtration. +; * a Number to subtract from in Montgomery form. +; * b Number to subtract with in Montgomery form. +; * m Modulus (prime). +; */ +_text SEGMENT READONLY PARA +sp_256_mont_dbl_sub_4 PROC + push r12 + push r13 + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + sub rax, QWORD PTR [r8] + mov r12, 4294967295 + sbb r9, QWORD PTR [r8+8] + mov r13, 18446744069414584321 + sbb r10, QWORD PTR [r8+16] + sbb r11, QWORD PTR [r8+24] + sbb r8, r8 + and r12, r8 + and r13, r8 + add rax, r8 + adc r9, r12 + adc r10, 0 + adc r11, r13 + add rax, rax + mov r12, 4294967295 + adc r9, r9 + mov r13, 18446744069414584321 + adc r10, r10 + adc r11, r11 + sbb r8, r8 + and r12, r8 + and r13, r8 + sub rax, r8 + sbb r9, r12 + mov QWORD PTR [rcx], rax + sbb r10, 0 + mov QWORD PTR [rcx+8], r9 + sbb r11, r13 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop r13 + pop r12 + ret +sp_256_mont_dbl_sub_4 ENDP +_text ENDS IFNDEF WC_NO_CACHE_RESISTANT ; /* Touch each possible point that could be being copied. 
; * @@ -38993,9 +55758,9 @@ sp_256_get_point_33_avx2_4 PROC L_256_get_point_33_avx2_4_start: vpcmpeqd ymm6, ymm8, ymm7 vpaddd ymm8, ymm8, ymm9 - vmovupd ymm3, [rdx] - vmovupd ymm4, [rdx+64] - vmovupd ymm5, [rdx+128] + vmovupd ymm3, YMMWORD PTR [rdx] + vmovupd ymm4, YMMWORD PTR [rdx+64] + vmovupd ymm5, YMMWORD PTR [rdx+128] add rdx, 200 vpand ymm3, ymm3, ymm6 vpand ymm4, ymm4, ymm6 @@ -39025,171 +55790,167 @@ IFDEF HAVE_INTEL_AVX2 ; */ _text SEGMENT READONLY PARA sp_256_mont_mul_avx2_4 PROC - push rbx + push rbp push r12 push r13 push r14 push r15 - push rbp push rdi push rsi + push rbx mov rbp, r8 - mov rdi, rdx - ; A[0] * B[0] - mov rdx, QWORD PTR [rbp] - mulx r9, r8, QWORD PTR [rdi] - ; A[2] * B[0] - mulx r11, r10, QWORD PTR [rdi+16] - ; A[1] * B[0] - mulx rsi, rax, QWORD PTR [rdi+8] - xor r15, r15 - adcx r9, rax - ; A[1] * B[3] - mov rdx, QWORD PTR [rbp+24] - mulx r13, r12, QWORD PTR [rdi+8] - adcx r10, rsi - ; A[0] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rsi, rax, QWORD PTR [rdi] - adox r9, rax - ; A[2] * B[1] - mulx r14, rax, QWORD PTR [rdi+16] + mov rax, rdx + mov rdx, QWORD PTR [rax] + ; A[0] * B[0] + mulx r9, r8, QWORD PTR [rbp] + xor rbx, rbx + ; A[0] * B[1] + mulx r10, rdi, QWORD PTR [rbp+8] + adcx r9, rdi + ; A[0] * B[2] + mulx r11, rdi, QWORD PTR [rbp+16] + adcx r10, rdi + ; A[0] * B[3] + mulx r12, rdi, QWORD PTR [rbp+24] + adcx r11, rdi + mov rdx, QWORD PTR [rax+8] + adcx r12, rbx + ; A[1] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r9, rdi + ; A[1] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] adox r10, rsi - adcx r11, rax - ; A[1] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rsi, rax, QWORD PTR [rdi+8] - adcx r12, r14 - adox r11, rax - adcx r13, r15 + adcx r10, rdi + ; A[1] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r11, r15 + adcx r11, rdi + ; A[1] * B[3] + mulx r13, rdi, QWORD PTR [rbp+24] adox r12, rsi - ; A[0] * B[2] - mulx rsi, rax, QWORD PTR [rdi] + adcx r12, rdi + adox r13, rbx + mov rdx, QWORD PTR [rax+16] + adcx r13, rbx + ; A[2] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r10, rdi + ; A[2] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r11, rsi + adcx r11, rdi + ; A[2] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r12, r15 + adcx r12, rdi + ; A[2] * B[3] + mulx r14, rdi, QWORD PTR [rbp+24] + adox r13, rsi + adcx r13, rdi + adox r14, rbx + mov rdx, QWORD PTR [rax+24] + adcx r14, rbx + ; A[3] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r11, rdi + ; A[3] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r12, rsi + adcx r12, rdi + ; A[3] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] adox r13, r15 - xor r14, r14 - adcx r10, rax - ; A[1] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rax, rdx, QWORD PTR [rdi+8] - adcx r11, rsi - adox r10, rdx - ; A[3] * B[1] - mov rdx, QWORD PTR [rbp+8] - adox r11, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adcx r12, rax - ; A[2] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rax, rdx, QWORD PTR [rdi+16] - adcx r13, rsi - adox r12, rdx - ; A[3] * B[3] - mov rdx, QWORD PTR [rbp+24] - adox r13, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adox r14, r15 - adcx r14, rax - ; A[0] * B[3] - mulx rax, rdx, QWORD PTR [rdi] - adcx r15, rsi - xor rsi, rsi - adcx r11, rdx - ; A[3] * B[0] - mov rdx, QWORD PTR [rdi+24] - adcx r12, rax - mulx rax, rbx, QWORD PTR [rbp] - adox r11, rbx - adox r12, rax - ; A[3] * B[2] - mulx rax, rdx, QWORD PTR [rbp+16] - adcx r13, rdx - ; A[2] * B[3] - mov rdx, QWORD PTR [rbp+24] - adcx r14, rax - mulx rdx, rax, QWORD PTR [rdi+16] - adcx r15, rsi - adox r13, rax - adox 
r14, rdx - adox r15, rsi + adcx r13, rdi + ; A[3] * B[3] + mulx r15, rdi, QWORD PTR [rbp+24] + adox r14, rsi + adcx r14, rdi + adox r15, rbx + adcx r15, rbx ; Start Reduction ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 ; - a[0] << 32 << 192 - ; + (a[0] * 2) << 192 - mov rax, r8 - mov rdx, r11 - add rdx, r8 - mov rdi, r9 - add rdx, r8 + ; a[0]-a[3] + (a[0] * 2) << 192 + mov rdi, r8 + lea rdx, QWORD PTR [r11+r8] + mov rax, r9 mov rbp, r10 + mov rsi, r10 ; a[0]-a[2] << 32 shl r8, 32 - shld r10, rdi, 32 - shld r9, rax, 32 + shld rsi, rax, 32 + shld r9, rdi, 32 ; - a[0] << 32 << 192 sub rdx, r8 ; + a[0]-a[2] << 32 << 64 - add rdi, r8 + add rax, r8 adc rbp, r9 - adc rdx, r10 + adc rdx, rsi ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xor rsi, rsi ; a += mu << 256 - xor r8, r8 - add r12, rax - adc r13, rdi + add r12, rdi + adc r13, rax adc r14, rbp adc r15, rdx - sbb r8, 0 + sbb r8, r8 ; a += mu << 192 - add r11, rax - adc r12, rdi + add r11, rdi + adc r12, rax + mov r9, rax adc r13, rbp adc r14, rdx adc r15, 0 sbb r8, 0 ; mu <<= 32 - mov rsi, rdx + shld rsi, rdx, 32 shld rdx, rbp, 32 - shld rbp, rdi, 32 - shld rdi, rax, 32 - shr rsi, 32 - shl rax, 32 + shld rbp, rax, 32 + shld rax, rdi, 32 + shl rdi, 32 + ; a -= (mu << 32) << 192 + sub r11, rdi + sbb r12, rax + sbb r13, rbp + sbb r14, rdx + sbb r15, rsi + adc r8, 0 ; a += (mu << 32) << 64 - add r11, rbp + sub r9, rdi + adc r10, rax + adc r11, rbp adc r12, rdx adc r13, rsi adc r14, 0 adc r15, 0 sbb r8, 0 - ; a -= (mu << 32) << 192 - sub r11, rax - sbb r12, rdi - sbb r13, rbp - sbb r14, rdx - sbb r15, rsi - adc r8, 0 - mov rax, 4294967295 - mov rdi, 18446744069414584321 + mov rax, 18446744069414584321 + mov rdi, r8 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - and rax, r8 + shr rdi, 32 ; m[2] = 0 & mask = 0 - and rdi, r8 + and rax, r8 sub r12, r8 - sbb r13, rax - sbb r14, 0 - sbb r15, rdi + sbb r13, rdi mov QWORD PTR [rcx], r12 + sbb r14, 0 mov QWORD PTR [rcx+8], r13 + sbb r15, rax mov QWORD PTR [rcx+16], r14 mov QWORD PTR [rcx+24], r15 + pop rbx pop rsi pop rdi - pop rbp pop r15 pop r14 pop r13 pop r12 - pop rbx + pop rbp ret sp_256_mont_mul_avx2_4 ENDP _text ENDS @@ -39212,31 +55973,34 @@ sp_256_mont_sqr_avx2_4 PROC push rsi push rbx mov rax, rdx - ; A[0] * A[1] + xor r8, r8 mov rdx, QWORD PTR [rax] - mov r15, QWORD PTR [rax+16] - mulx r10, r9, QWORD PTR [rax+8] + mov rsi, QWORD PTR [rax+8] + mov rbx, QWORD PTR [rax+16] + mov r15, QWORD PTR [rax+24] + ; A[0] * A[1] + mulx r10, r9, rsi + ; A[0] * A[2] + mulx r11, r8, rbx + adox r10, r8 ; A[0] * A[3] - mulx r12, r11, QWORD PTR [rax+24] - ; A[2] * A[1] + mulx r12, r8, r15 + mov rdx, rsi + adox r11, r8 + ; A[1] * A[2] + mulx rdi, r8, rbx mov rdx, r15 - mulx rbx, rsi, QWORD PTR [rax+8] - ; A[2] * A[3] - mulx r14, r13, QWORD PTR [rax+24] - xor r15, r15 - adox r11, rsi - adox r12, rbx - ; A[2] * A[0] - mulx rbx, rsi, QWORD PTR [rax] + adcx r11, r8 ; A[1] * A[3] - mov rdx, QWORD PTR [rax+8] + mulx r13, r8, rsi + mov r15, 0 + adox r12, rdi + adcx r12, r8 + ; A[2] * A[3] + mulx r14, r8, rbx adox r13, r15 - mulx r8, rdi, QWORD PTR [rax+24] - adcx r10, rsi - adox r14, r15 - adcx r11, rbx - adcx r12, rdi adcx r13, r8 + adox r14, r15 adcx r14, r15 ; Double with Carry Flag xor r15, r15 @@ -39269,52 +56033,44 @@ sp_256_mont_sqr_avx2_4 PROC ; Start Reduction ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192 ; - a[0] << 32 << 192 - ; + (a[0] * 2) << 192 + ; a[0]-a[3] + (a[0] * 2) << 192 mov rdi, r8 - mov rdx, r11 - add rdx, r8 + lea rdx, 
QWORD PTR [r11+r8] mov rax, r9 - add rdx, r8 mov rsi, r10 + mov rbx, r10 ; a[0]-a[2] << 32 shl r8, 32 - shld r10, rax, 32 + shld rbx, rax, 32 shld r9, rdi, 32 ; - a[0] << 32 << 192 sub rdx, r8 ; + a[0]-a[2] << 32 << 64 add rax, r8 adc rsi, r9 - adc rdx, r10 + adc rdx, rbx ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu + xor rbx, rbx ; a += mu << 256 - xor r8, r8 add r12, rdi adc r13, rax adc r14, rsi adc r15, rdx - sbb r8, 0 + sbb r8, r8 ; a += mu << 192 add r11, rdi adc r12, rax + mov r9, rax adc r13, rsi adc r14, rdx adc r15, 0 sbb r8, 0 ; mu <<= 32 - mov rbx, rdx + shld rbx, rdx, 32 shld rdx, rsi, 32 shld rsi, rax, 32 shld rax, rdi, 32 - shr rbx, 32 shl rdi, 32 - ; a += (mu << 32) << 64 - add r11, rsi - adc r12, rdx - adc r13, rbx - adc r14, 0 - adc r15, 0 - sbb r8, 0 ; a -= (mu << 32) << 192 sub r11, rdi sbb r12, rax @@ -39322,19 +56078,28 @@ sp_256_mont_sqr_avx2_4 PROC sbb r14, rdx sbb r15, rbx adc r8, 0 - mov rdi, 4294967295 + ; a += (mu << 32) << 64 + sub r9, rdi + adc r10, rax + adc r11, rsi + adc r12, rdx + adc r13, rbx + adc r14, 0 + adc r15, 0 + sbb r8, 0 mov rax, 18446744069414584321 + mov rdi, r8 ; mask m and sub from result if overflow ; m[0] = -1 & mask = mask - and rdi, r8 + shr rdi, 32 ; m[2] = 0 & mask = 0 and rax, r8 sub r12, r8 sbb r13, rdi - sbb r14, 0 - sbb r15, rax mov QWORD PTR [rcx], r12 + sbb r14, 0 mov QWORD PTR [rcx+8], r13 + sbb r15, rax mov QWORD PTR [rcx+16], r14 mov QWORD PTR [rcx+24], r15 pop rbx @@ -39365,7 +56130,6 @@ sp_256_cond_sub_avx2_4 PROC push r15 push rdi push rsi - mov rax, 0 mov r14, QWORD PTR [r8] mov r15, QWORD PTR [r8+8] mov rdi, QWORD PTR [r8+16] @@ -39386,7 +56150,7 @@ sp_256_cond_sub_avx2_4 PROC mov QWORD PTR [rcx+8], r11 mov QWORD PTR [rcx+16], r12 mov QWORD PTR [rcx+24], r13 - sbb rax, 0 + sbb rax, rax pop rsi pop rdi pop r15 @@ -39405,7 +56169,7 @@ IFDEF HAVE_INTEL_AVX2 ; * mp The digit representing the negative inverse of m mod 2^n. 
; */ _text SEGMENT READONLY PARA -sp_256_mont_reduce_avx2_4 PROC +sp_256_mont_reduce_avx2_order_4 PROC push r12 push r13 push r14 @@ -39553,7 +56317,7 @@ sp_256_mont_reduce_avx2_4 PROC pop r13 pop r12 ret -sp_256_mont_reduce_avx2_4 ENDP +sp_256_mont_reduce_avx2_order_4 ENDP _text ENDS ENDIF IFDEF HAVE_INTEL_AVX2 @@ -39670,8 +56434,8 @@ sp_256_get_entry_64_avx2_4 PROC L_256_get_entry_64_avx2_4_start: vpcmpeqd ymm4, ymm6, ymm5 vpaddd ymm6, ymm6, ymm7 - vmovupd ymm2, [rdx] - vmovupd ymm3, [rdx+32] + vmovupd ymm2, YMMWORD PTR [rdx] + vmovupd ymm3, YMMWORD PTR [rdx+32] add rdx, 64 vpand ymm2, ymm2, ymm4 vpand ymm3, ymm3, ymm4 @@ -39758,8 +56522,8 @@ sp_256_get_entry_65_avx2_4 PROC L_256_get_entry_65_avx2_4_start: vpcmpeqd ymm4, ymm6, ymm5 vpaddd ymm6, ymm6, ymm7 - vmovupd ymm2, [rdx] - vmovupd ymm3, [rdx+32] + vmovupd ymm2, YMMWORD PTR [rdx] + vmovupd ymm3, YMMWORD PTR [rdx+32] add rdx, 64 vpand ymm2, ymm2, ymm4 vpand ymm3, ymm3, ymm4 @@ -40004,7 +56768,6 @@ ENDIF ; */ _text SEGMENT READONLY PARA sp_256_sub_in_place_4 PROC - xor rax, rax mov r8, QWORD PTR [rdx] mov r9, QWORD PTR [rdx+8] mov r10, QWORD PTR [rdx+16] @@ -40013,7 +56776,7 @@ sp_256_sub_in_place_4 PROC sbb QWORD PTR [rcx+8], r9 sbb QWORD PTR [rcx+16], r10 sbb QWORD PTR [rcx+24], r11 - sbb rax, 0 + sbb rax, rax ret sp_256_sub_in_place_4 ENDP _text ENDS @@ -40132,114 +56895,109 @@ IFDEF HAVE_INTEL_AVX2 ; */ _text SEGMENT READONLY PARA sp_256_mont_mul_order_avx2_4 PROC - push rbx + push rbp push r12 push r13 push r14 push r15 - push rbp push rdi push rsi + push rbx mov rbp, r8 - mov rdi, rdx - ; A[0] * B[0] - mov rdx, QWORD PTR [rbp] - mulx r9, r8, QWORD PTR [rdi] - ; A[2] * B[0] - mulx r11, r10, QWORD PTR [rdi+16] - ; A[1] * B[0] - mulx rsi, rax, QWORD PTR [rdi+8] - xor r15, r15 - adcx r9, rax - ; A[1] * B[3] - mov rdx, QWORD PTR [rbp+24] - mulx r13, r12, QWORD PTR [rdi+8] - adcx r10, rsi - ; A[0] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rsi, rax, QWORD PTR [rdi] - adox r9, rax - ; A[2] * B[1] - mulx r14, rax, QWORD PTR [rdi+16] + mov rax, rdx + mov rdx, QWORD PTR [rax] + ; A[0] * B[0] + mulx r9, r8, QWORD PTR [rbp] + xor rbx, rbx + ; A[0] * B[1] + mulx r10, rdi, QWORD PTR [rbp+8] + adcx r9, rdi + ; A[0] * B[2] + mulx r11, rdi, QWORD PTR [rbp+16] + adcx r10, rdi + ; A[0] * B[3] + mulx r12, rdi, QWORD PTR [rbp+24] + adcx r11, rdi + mov rdx, QWORD PTR [rax+8] + adcx r12, rbx + ; A[1] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r9, rdi + ; A[1] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] adox r10, rsi - adcx r11, rax - ; A[1] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rsi, rax, QWORD PTR [rdi+8] - adcx r12, r14 - adox r11, rax - adcx r13, r15 + adcx r10, rdi + ; A[1] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r11, r15 + adcx r11, rdi + ; A[1] * B[3] + mulx r13, rdi, QWORD PTR [rbp+24] adox r12, rsi - ; A[0] * B[2] - mulx rsi, rax, QWORD PTR [rdi] + adcx r12, rdi + adox r13, rbx + mov rdx, QWORD PTR [rax+16] + adcx r13, rbx + ; A[2] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r10, rdi + ; A[2] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r11, rsi + adcx r11, rdi + ; A[2] * B[2] + mulx rsi, rdi, QWORD PTR [rbp+16] + adox r12, r15 + adcx r12, rdi + ; A[2] * B[3] + mulx r14, rdi, QWORD PTR [rbp+24] + adox r13, rsi + adcx r13, rdi + adox r14, rbx + mov rdx, QWORD PTR [rax+24] + adcx r14, rbx + ; A[3] * B[0] + mulx rsi, rdi, QWORD PTR [rbp] + xor rbx, rbx + adcx r11, rdi + ; A[3] * B[1] + mulx r15, rdi, QWORD PTR [rbp+8] + adox r12, rsi + adcx r12, rdi + ; A[3] * B[2] + mulx rsi, rdi, QWORD PTR 
[rbp+16] adox r13, r15 - xor r14, r14 - adcx r10, rax - ; A[1] * B[1] - mov rdx, QWORD PTR [rbp+8] - mulx rax, rdx, QWORD PTR [rdi+8] - adcx r11, rsi - adox r10, rdx - ; A[3] * B[1] - mov rdx, QWORD PTR [rbp+8] - adox r11, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adcx r12, rax - ; A[2] * B[2] - mov rdx, QWORD PTR [rbp+16] - mulx rax, rdx, QWORD PTR [rdi+16] - adcx r13, rsi - adox r12, rdx - ; A[3] * B[3] - mov rdx, QWORD PTR [rbp+24] - adox r13, rax - mulx rsi, rax, QWORD PTR [rdi+24] - adox r14, r15 - adcx r14, rax - ; A[0] * B[3] - mulx rax, rdx, QWORD PTR [rdi] - adcx r15, rsi - xor rsi, rsi - adcx r11, rdx - ; A[3] * B[0] - mov rdx, QWORD PTR [rdi+24] - adcx r12, rax - mulx rax, rbx, QWORD PTR [rbp] - adox r11, rbx - adox r12, rax - ; A[3] * B[2] - mulx rax, rdx, QWORD PTR [rbp+16] - adcx r13, rdx - ; A[2] * B[3] - mov rdx, QWORD PTR [rbp+24] - adcx r14, rax - mulx rdx, rax, QWORD PTR [rdi+16] - adcx r15, rsi - adox r13, rax - adox r14, rdx - adox r15, rsi + adcx r13, rdi + ; A[3] * B[3] + mulx r15, rdi, QWORD PTR [rbp+24] + adox r14, rsi + adcx r14, rdi + adox r15, rbx + adcx r15, rbx ; Start Reduction mov rbx, 14758798090332847183 ; A[0] mov rdx, rbx imul rdx, r8 - mov rax, 17562291160714782033 + mov rdi, 17562291160714782033 xor rbp, rbp - mulx rdi, rsi, rax - mov rax, 13611842547513532036 + mulx rax, rsi, rdi + mov rdi, 13611842547513532036 adcx r8, rsi - adox r9, rdi - mulx rdi, rsi, rax - mov rax, 18446744073709551615 + adox r9, rax + mulx rax, rsi, rdi + mov rdi, 18446744073709551615 adcx r9, rsi - adox r10, rdi - mulx rdi, rsi, rax - mov rax, 18446744069414584320 + adox r10, rax + mulx rax, rsi, rdi + mov rdi, 18446744069414584320 adcx r10, rsi - adox r11, rdi - mulx rdi, rsi, rax + adox r11, rax + mulx rax, rsi, rdi adcx r11, rsi - adox r12, rdi + adox r12, rax adcx r12, rbp mov r8, rbp ; carry @@ -40248,23 +57006,23 @@ sp_256_mont_mul_order_avx2_4 PROC ; A[1] mov rdx, rbx imul rdx, r9 - mov rax, 17562291160714782033 + mov rdi, 17562291160714782033 xor rbp, rbp - mulx rdi, rsi, rax - mov rax, 13611842547513532036 + mulx rax, rsi, rdi + mov rdi, 13611842547513532036 adcx r9, rsi - adox r10, rdi - mulx rdi, rsi, rax - mov rax, 18446744073709551615 + adox r10, rax + mulx rax, rsi, rdi + mov rdi, 18446744073709551615 adcx r10, rsi - adox r11, rdi - mulx rdi, rsi, rax - mov rax, 18446744069414584320 + adox r11, rax + mulx rax, rsi, rdi + mov rdi, 18446744069414584320 adcx r11, rsi - adox r12, rdi - mulx rdi, rsi, rax + adox r12, rax + mulx rax, rsi, rdi adcx r12, rsi - adox r13, rdi + adox r13, rax adcx r13, r8 mov r8, rbp ; carry @@ -40273,23 +57031,23 @@ sp_256_mont_mul_order_avx2_4 PROC ; A[2] mov rdx, rbx imul rdx, r10 - mov rax, 17562291160714782033 + mov rdi, 17562291160714782033 xor rbp, rbp - mulx rdi, rsi, rax - mov rax, 13611842547513532036 + mulx rax, rsi, rdi + mov rdi, 13611842547513532036 adcx r10, rsi - adox r11, rdi - mulx rdi, rsi, rax - mov rax, 18446744073709551615 + adox r11, rax + mulx rax, rsi, rdi + mov rdi, 18446744073709551615 adcx r11, rsi - adox r12, rdi - mulx rdi, rsi, rax - mov rax, 18446744069414584320 + adox r12, rax + mulx rax, rsi, rdi + mov rdi, 18446744069414584320 adcx r12, rsi - adox r13, rdi - mulx rdi, rsi, rax + adox r13, rax + mulx rax, rsi, rdi adcx r13, rsi - adox r14, rdi + adox r14, rax adcx r14, r8 mov r8, rbp ; carry @@ -40298,36 +57056,36 @@ sp_256_mont_mul_order_avx2_4 PROC ; A[3] mov rdx, rbx imul rdx, r11 - mov rax, 17562291160714782033 + mov rdi, 17562291160714782033 xor rbp, rbp - mulx rdi, rsi, rax - mov rax, 
13611842547513532036 + mulx rax, rsi, rdi + mov rdi, 13611842547513532036 adcx r11, rsi - adox r12, rdi - mulx rdi, rsi, rax - mov rax, 18446744073709551615 + adox r12, rax + mulx rax, rsi, rdi + mov rdi, 18446744073709551615 adcx r12, rsi - adox r13, rdi - mulx rdi, rsi, rax - mov rax, 18446744069414584320 + adox r13, rax + mulx rax, rsi, rdi + mov rdi, 18446744069414584320 adcx r13, rsi - adox r14, rdi - mulx rdi, rsi, rax + adox r14, rax + mulx rax, rsi, rdi adcx r14, rsi - adox r15, rdi + adox r15, rax adcx r15, r8 mov r8, rbp ; carry adox r8, rbp adcx r8, rbp neg r8 - mov rax, 17562291160714782033 + mov rdi, 17562291160714782033 mov rbx, 13611842547513532036 - and rax, r8 + and rdi, r8 mov rbp, 18446744069414584320 and rbx, r8 and rbp, r8 - sub r12, rax + sub r12, rdi sbb r13, rbx mov QWORD PTR [rcx], r12 sbb r14, r8 @@ -40335,14 +57093,14 @@ sp_256_mont_mul_order_avx2_4 PROC sbb r15, rbp mov QWORD PTR [rcx+16], r14 mov QWORD PTR [rcx+24], r15 + pop rbx pop rsi pop rdi - pop rbp pop r15 pop r14 pop r13 pop r12 - pop rbx + pop rbp ret sp_256_mont_mul_order_avx2_4 ENDP _text ENDS @@ -40364,31 +57122,34 @@ sp_256_mont_sqr_order_avx2_4 PROC push rsi push rbx mov rax, rdx - ; A[0] * A[1] + xor r8, r8 mov rdx, QWORD PTR [rax] - mov r15, QWORD PTR [rax+16] - mulx r10, r9, QWORD PTR [rax+8] + mov rsi, QWORD PTR [rax+8] + mov rbx, QWORD PTR [rax+16] + mov r15, QWORD PTR [rax+24] + ; A[0] * A[1] + mulx r10, r9, rsi + ; A[0] * A[2] + mulx r11, r8, rbx + adox r10, r8 ; A[0] * A[3] - mulx r12, r11, QWORD PTR [rax+24] - ; A[2] * A[1] + mulx r12, r8, r15 + mov rdx, rsi + adox r11, r8 + ; A[1] * A[2] + mulx rdi, r8, rbx mov rdx, r15 - mulx rbx, rsi, QWORD PTR [rax+8] - ; A[2] * A[3] - mulx r14, r13, QWORD PTR [rax+24] - xor r15, r15 - adox r11, rsi - adox r12, rbx - ; A[2] * A[0] - mulx rbx, rsi, QWORD PTR [rax] + adcx r11, r8 ; A[1] * A[3] - mov rdx, QWORD PTR [rax+8] + mulx r13, r8, rsi + mov r15, 0 + adox r12, rdi + adcx r12, r8 + ; A[2] * A[3] + mulx r14, r8, rbx adox r13, r15 - mulx r8, rdi, QWORD PTR [rax+24] - adcx r10, rsi - adox r14, r15 - adcx r11, rbx - adcx r12, rdi adcx r13, r8 + adox r14, r15 adcx r14, r15 ; Double with Carry Flag xor r15, r15 @@ -40834,22 +57595,22 @@ sp_256_mod_inv_avx2_4 PROC mov r14, QWORD PTR [rdx+16] mov r15, QWORD PTR [rdx+24] mov rbx, ptr_L_sp256_mod_inv_avx2_4_order - vmovupd ymm6, [rbx] - vmovupd ymm7, [rbx+32] + vmovupd ymm6, YMMWORD PTR [rbx] + vmovupd ymm7, YMMWORD PTR [rbx+32] mov rbx, ptr_L_sp256_mod_inv_avx2_4_one - vmovupd ymm8, [rbx] + vmovupd ymm8, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_mask01111 - vmovupd ymm9, [rbx] + vmovupd ymm9, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_all_one - vmovupd ymm10, [rbx] + vmovupd ymm10, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_down_one_dword - vmovupd ymm11, [rbx] + vmovupd ymm11, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_neg - vmovupd ymm12, [rbx] + vmovupd ymm12, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_up_one_dword - vmovupd ymm13, [rbx] + vmovupd ymm13, YMMWORD PTR [rbx] mov rbx, ptr_L_sp256_mod_inv_avx2_4_mask26 - vmovupd ymm14, [rbx] + vmovupd ymm14, YMMWORD PTR [rbx] vpxor xmm0, xmm0, xmm0 vpxor xmm1, xmm1, xmm1 vmovdqu ymm2, ymm8 @@ -41934,27 +58695,32 @@ ENDIF ; */ _text SEGMENT READONLY PARA sp_384_add_6 PROC - ; Add - mov r9, QWORD PTR [rdx] + push r12 + push r13 + push r14 xor rax, rax - add r9, QWORD PTR [r8] + mov r9, QWORD PTR [rdx] mov r10, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r9 + mov r11, QWORD PTR [rdx+16] + mov r12, QWORD PTR 
[rdx+24] + mov r13, QWORD PTR [rdx+32] + mov r14, QWORD PTR [rdx+40] + add r9, QWORD PTR [r8] adc r10, QWORD PTR [r8+8] - mov r9, QWORD PTR [rdx+16] + adc r11, QWORD PTR [r8+16] + adc r12, QWORD PTR [r8+24] + adc r13, QWORD PTR [r8+32] + adc r14, QWORD PTR [r8+40] + mov QWORD PTR [rcx], r9 mov QWORD PTR [rcx+8], r10 - adc r9, QWORD PTR [r8+16] - mov r10, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r9 - adc r10, QWORD PTR [r8+24] - mov r9, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r10 - adc r9, QWORD PTR [r8+32] - mov r10, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r9 - adc r10, QWORD PTR [r8+40] - mov QWORD PTR [rcx+40], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+24], r12 + mov QWORD PTR [rcx+32], r13 + mov QWORD PTR [rcx+40], r14 adc rax, 0 + pop r14 + pop r13 + pop r12 ret sp_384_add_6 ENDP _text ENDS @@ -41988,7 +58754,7 @@ sp_384_sub_6 PROC mov QWORD PTR [rcx+24], r12 mov QWORD PTR [rcx+32], r13 mov QWORD PTR [rcx+40], r14 - sbb rax, 0 + sbb rax, rax pop r14 pop r13 pop r12 @@ -42046,7 +58812,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_384_cond_sub_6 PROC sub rsp, 48 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -42089,7 +58854,7 @@ sp_384_cond_sub_6 PROC sbb r11, r8 mov QWORD PTR [rcx+32], r10 mov QWORD PTR [rcx+40], r11 - sbb rax, 0 + sbb rax, rax add rsp, 48 ret sp_384_cond_sub_6 ENDP @@ -42441,94 +59206,340 @@ sp_384_cmp_6 PROC ret sp_384_cmp_6 ENDP _text ENDS -; /* Add a to a into r. (r = a + a) +; /* Add two Montgomery form numbers (r = a + b % m). ; * -; * r A single precision integer. -; * a A single precision integer. +; * r Result of addition. +; * a First number to add in Montgomery form. +; * b Second number to add in Montgomery form. +; * m Modulus (prime). ; */ _text SEGMENT READONLY PARA -sp_384_dbl_6 PROC - mov r8, QWORD PTR [rdx] - xor rax, rax - add r8, r8 +sp_384_mont_add_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] mov r9, QWORD PTR [rdx+8] - mov QWORD PTR [rcx], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + mov r12, QWORD PTR [rdx+32] + mov r13, QWORD PTR [rdx+40] + add rax, QWORD PTR [r8] + mov r14, 4294967295 + adc r9, QWORD PTR [r8+8] + mov r15, 18446744069414584320 + adc r10, QWORD PTR [r8+16] + mov rdi, 18446744073709551614 + adc r11, QWORD PTR [r8+24] + adc r12, QWORD PTR [r8+32] + adc r13, QWORD PTR [r8+40] + sbb rdx, rdx + and r14, rdx + and r15, rdx + and rdi, rdx + sub rax, r14 + sbb r9, r15 + sbb r10, rdi + sbb r11, rdx + sbb r12, rdx + sbb r13, rdx + adc rdx, 0 + and r14, rdx + and r15, rdx + and rdi, rdx + sub rax, r14 + sbb r9, r15 + mov QWORD PTR [rcx], rax + sbb r10, rdi mov QWORD PTR [rcx+8], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+24] - mov QWORD PTR [rcx+16], r8 - adc r9, r9 - mov r8, QWORD PTR [rdx+32] - mov QWORD PTR [rcx+24], r9 - adc r8, r8 - mov r9, QWORD PTR [rdx+40] - mov QWORD PTR [rcx+32], r8 - adc r9, r9 - mov QWORD PTR [rcx+40], r9 - adc rax, 0 + sbb r11, rdx + mov QWORD PTR [rcx+16], r10 + sbb r12, rdx + mov QWORD PTR [rcx+24], r11 + sbb r13, rdx + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 ret -sp_384_dbl_6 ENDP +sp_384_mont_add_6 ENDP _text ENDS -; /* Conditionally add a and b using the mask m. -; * m is -1 to add and 0 when not. +; /* Double a Montgomery form number (r = a + a % m). ; * -; * r A single precision number representing conditional add result. -; * a A single precision number to add with. 
-; * b A single precision number to add. -; * m Mask value to apply. +; * r Result of doubling. +; * a Number to double in Montgomery form. +; * m Modulus (prime). ; */ _text SEGMENT READONLY PARA -sp_384_cond_add_6 PROC - sub rsp, 48 - mov rax, 0 - mov r10, QWORD PTR [r8] - mov r11, QWORD PTR [r8+8] - and r10, r9 - and r11, r9 - mov QWORD PTR [rsp], r10 - mov QWORD PTR [rsp+8], r11 - mov r10, QWORD PTR [r8+16] - mov r11, QWORD PTR [r8+24] - and r10, r9 - and r11, r9 - mov QWORD PTR [rsp+16], r10 - mov QWORD PTR [rsp+24], r11 - mov r10, QWORD PTR [r8+32] - mov r11, QWORD PTR [r8+40] - and r10, r9 - and r11, r9 - mov QWORD PTR [rsp+32], r10 - mov QWORD PTR [rsp+40], r11 - mov r10, QWORD PTR [rdx] - mov r8, QWORD PTR [rsp] - add r10, r8 - mov r11, QWORD PTR [rdx+8] - mov r8, QWORD PTR [rsp+8] - adc r11, r8 - mov QWORD PTR [rcx], r10 - mov r10, QWORD PTR [rdx+16] - mov r8, QWORD PTR [rsp+16] - adc r10, r8 - mov QWORD PTR [rcx+8], r11 - mov r11, QWORD PTR [rdx+24] - mov r8, QWORD PTR [rsp+24] - adc r11, r8 - mov QWORD PTR [rcx+16], r10 - mov r10, QWORD PTR [rdx+32] - mov r8, QWORD PTR [rsp+32] - adc r10, r8 - mov QWORD PTR [rcx+24], r11 - mov r11, QWORD PTR [rdx+40] - mov r8, QWORD PTR [rsp+40] - adc r11, r8 - mov QWORD PTR [rcx+32], r10 - mov QWORD PTR [rcx+40], r11 - adc rax, 0 - add rsp, 48 +sp_384_mont_dbl_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + mov r11, QWORD PTR [rdx+32] + mov r12, QWORD PTR [rdx+40] + add rax, rax + mov r13, 4294967295 + adc r8, r8 + mov r14, 18446744069414584320 + adc r9, r9 + mov r15, 18446744073709551614 + adc r10, r10 + adc r11, r11 + adc r12, r12 + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + adc rdi, 0 + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + mov QWORD PTR [rcx+8], r8 + sbb r10, rdi + mov QWORD PTR [rcx+16], r9 + sbb r11, rdi + mov QWORD PTR [rcx+24], r10 + sbb r12, rdi + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+40], r12 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 ret -sp_384_cond_add_6 ENDP +sp_384_mont_dbl_6 ENDP +_text ENDS +; /* Double a Montgomery form number (r = a + a % m). +; * +; * r Result of doubling. +; * a Number to double in Montgomery form. +; * m Modulus (prime). 
+; */ +_text SEGMENT READONLY PARA +sp_384_mont_tpl_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + mov r11, QWORD PTR [rdx+32] + mov r12, QWORD PTR [rdx+40] + add rax, rax + mov r13, 4294967295 + adc r8, r8 + mov r14, 18446744069414584320 + adc r9, r9 + mov r15, 18446744073709551614 + adc r10, r10 + adc r11, r11 + adc r12, r12 + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + adc rdi, 0 + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + add rax, QWORD PTR [rdx] + mov r13, 4294967295 + adc r8, QWORD PTR [rdx+8] + mov r14, 18446744069414584320 + adc r9, QWORD PTR [rdx+16] + mov r15, 18446744073709551614 + adc r10, QWORD PTR [rdx+24] + adc r11, QWORD PTR [rdx+32] + adc r12, QWORD PTR [rdx+40] + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + adc rdi, 0 + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + mov QWORD PTR [rcx+8], r8 + sbb r10, rdi + mov QWORD PTR [rcx+16], r9 + sbb r11, rdi + mov QWORD PTR [rcx+24], r10 + sbb r12, rdi + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+40], r12 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mont_tpl_6 ENDP +_text ENDS +; /* Subtract two Montgomery form numbers (r = a - b % m). +; * +; * r Result of subtration. +; * a Number to subtract from in Montgomery form. +; * b Number to subtract with in Montgomery form. +; * m Modulus (prime). +; */ +_text SEGMENT READONLY PARA +sp_384_mont_sub_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + mov r12, QWORD PTR [rdx+32] + mov r13, QWORD PTR [rdx+40] + sub rax, QWORD PTR [r8] + mov r14, 4294967295 + sbb r9, QWORD PTR [r8+8] + mov r15, 18446744069414584320 + sbb r10, QWORD PTR [r8+16] + mov rdi, 18446744073709551614 + sbb r11, QWORD PTR [r8+24] + sbb r12, QWORD PTR [r8+32] + sbb r13, QWORD PTR [r8+40] + sbb rdx, rdx + and r14, rdx + and r15, rdx + and rdi, rdx + add rax, r14 + adc r9, r15 + adc r10, rdi + adc r11, rdx + adc r12, rdx + adc r13, rdx + adc rdx, 0 + and r14, rdx + and r15, rdx + and rdi, rdx + add rax, r14 + adc r9, r15 + mov QWORD PTR [rcx], rax + adc r10, rdi + mov QWORD PTR [rcx+8], r9 + adc r11, rdx + mov QWORD PTR [rcx+16], r10 + adc r12, rdx + mov QWORD PTR [rcx+24], r11 + adc r13, rdx + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mont_sub_6 ENDP +_text ENDS +; /* Subtract two Montgomery form numbers (r = a - b % m). +; * +; * b is less than the modulus. +; * +; * r Result of subtration. +; * a Number to subtract from in Montgomery form. +; * b Number to subtract with in Montgomery form. +; * m Modulus (prime). 
+; */ +_text SEGMENT READONLY PARA +sp_384_mont_sub_lower_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + mov r12, QWORD PTR [rdx+32] + mov r13, QWORD PTR [rdx+40] + sub rax, QWORD PTR [r8] + mov r14, 4294967295 + sbb r9, QWORD PTR [r8+8] + mov r15, 18446744069414584320 + sbb r10, QWORD PTR [r8+16] + mov rdi, 18446744073709551614 + sbb r11, QWORD PTR [r8+24] + sbb r12, QWORD PTR [r8+32] + sbb r13, QWORD PTR [r8+40] + sbb rdx, rdx + and r14, rdx + and r15, rdx + and rdi, rdx + add rax, r14 + adc r9, r15 + mov QWORD PTR [rcx], rax + adc r10, rdi + mov QWORD PTR [rcx+8], r9 + adc r11, rdx + mov QWORD PTR [rcx+16], r10 + adc r12, rdx + mov QWORD PTR [rcx+24], r11 + adc r13, rdx + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mont_sub_lower_6 ENDP _text ENDS ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) ; * @@ -42600,6 +59611,144 @@ sp_384_div2_6 PROC ret sp_384_div2_6 ENDP _text ENDS +; /* Double a Montgomery form number (r = a + a % m). +; * +; * a is less than m. +; * +; * r Result of doubling. +; * a Number to double in Montgomery form. +; * m Modulus (prime). +; */ +_text SEGMENT READONLY PARA +sp_384_mont_dbl_lower_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + mov r11, QWORD PTR [rdx+32] + mov r12, QWORD PTR [rdx+40] + add rax, rax + mov r13, 4294967295 + adc r8, r8 + mov r14, 18446744069414584320 + adc r9, r9 + mov r15, 18446744073709551614 + adc r10, r10 + adc r11, r11 + adc r12, r12 + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + mov QWORD PTR [rcx+8], r8 + sbb r10, rdi + mov QWORD PTR [rcx+16], r9 + sbb r11, rdi + mov QWORD PTR [rcx+24], r10 + sbb r12, rdi + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+40], r12 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mont_dbl_lower_6 ENDP +_text ENDS +; /* Double a Montgomery form number (r = a + a % m). +; * +; * a is less than m. +; * +; * r Result of doubling. +; * a Number to double in Montgomery form. +; * m Modulus (prime). 
+; */ +_text SEGMENT READONLY PARA +sp_384_mont_tpl_lower_6 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + mov r11, QWORD PTR [rdx+32] + mov r12, QWORD PTR [rdx+40] + add rax, rax + mov r13, 4294967295 + adc r8, r8 + mov r14, 18446744069414584320 + adc r9, r9 + mov r15, 18446744073709551614 + adc r10, r10 + adc r11, r11 + adc r12, r12 + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + add rax, QWORD PTR [rdx] + mov r13, 4294967295 + adc r8, QWORD PTR [rdx+8] + mov r14, 18446744069414584320 + adc r9, QWORD PTR [rdx+16] + mov r15, 18446744073709551614 + adc r10, QWORD PTR [rdx+24] + adc r11, QWORD PTR [rdx+32] + adc r12, QWORD PTR [rdx+40] + sbb rdi, rdi + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + sbb r9, r15 + sbb r10, rdi + sbb r11, rdi + sbb r12, rdi + adc rdi, 0 + and r13, rdi + and r14, rdi + and r15, rdi + sub rax, r13 + sbb r8, r14 + mov QWORD PTR [rcx], rax + sbb r9, r15 + mov QWORD PTR [rcx+8], r8 + sbb r10, rdi + mov QWORD PTR [rcx+16], r9 + sbb r11, rdi + mov QWORD PTR [rcx+24], r10 + sbb r12, rdi + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+40], r12 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_384_mont_tpl_lower_6 ENDP +_text ENDS IFNDEF WC_NO_CACHE_RESISTANT ; /* Touch each possible point that could be being copied. ; * @@ -42716,11 +59865,11 @@ sp_384_get_point_33_avx2_6 PROC L_384_get_point_33_avx2_6_start: vpcmpeqd ymm12, ymm14, ymm13 vpaddd ymm14, ymm14, ymm15 - vmovupd ymm6, [rdx] + vmovupd ymm6, YMMWORD PTR [rdx] vmovdqu xmm7, OWORD PTR [rdx+32] - vmovupd ymm8, [rdx+96] + vmovupd ymm8, YMMWORD PTR [rdx+96] vmovdqu xmm9, OWORD PTR [rdx+128] - vmovupd ymm10, [rdx+192] + vmovupd ymm10, YMMWORD PTR [rdx+192] vmovdqu xmm11, OWORD PTR [rdx+224] add rdx, 296 vpand ymm6, ymm6, ymm12 @@ -42738,11 +59887,11 @@ L_384_get_point_33_avx2_6_start: dec rax jnz L_384_get_point_33_avx2_6_start vmovupd YMMWORD PTR [rcx], ymm0 - vmovdqu [rcx+32], xmm1 + vmovdqu OWORD PTR [rcx+32], xmm1 vmovupd YMMWORD PTR [rcx+96], ymm2 - vmovdqu [rcx+128], xmm3 + vmovdqu OWORD PTR [rcx+128], xmm3 vmovupd YMMWORD PTR [rcx+192], ymm4 - vmovdqu [rcx+224], xmm5 + vmovdqu OWORD PTR [rcx+224], xmm5 ret sp_384_get_point_33_avx2_6 ENDP _text ENDS @@ -43083,7 +60232,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_384_cond_sub_avx2_6 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -43114,7 +60262,7 @@ sp_384_cond_sub_avx2_6 PROC mov QWORD PTR [rcx+32], r11 sbb r12, r10 mov QWORD PTR [rcx+40], r12 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_384_cond_sub_avx2_6 ENDP @@ -43275,9 +60423,9 @@ sp_384_get_entry_64_avx2_6 PROC L_384_get_entry_64_avx2_6_start: vpcmpeqd ymm8, ymm10, ymm9 vpaddd ymm10, ymm10, ymm11 - vmovupd ymm4, [rdx] + vmovupd ymm4, YMMWORD PTR [rdx] vmovdqu xmm5, OWORD PTR [rdx+32] - vmovupd ymm6, [rdx+48] + vmovupd ymm6, YMMWORD PTR [rdx+48] vmovdqu xmm7, OWORD PTR [rdx+80] add rdx, 96 vpand ymm4, ymm4, ymm8 @@ -43291,9 +60439,9 @@ L_384_get_entry_64_avx2_6_start: dec rax jnz L_384_get_entry_64_avx2_6_start vmovupd YMMWORD PTR [rcx], ymm0 - vmovdqu [rcx+32], xmm1 + vmovdqu OWORD PTR [rcx+32], xmm1 vmovupd YMMWORD PTR [rcx+96], ymm2 - vmovdqu [rcx+128], xmm3 + vmovdqu OWORD PTR [rcx+128], xmm3 ret sp_384_get_entry_64_avx2_6 ENDP _text ENDS @@ 
-43383,9 +60531,9 @@ sp_384_get_entry_65_avx2_6 PROC L_384_get_entry_65_avx2_6_start: vpcmpeqd ymm8, ymm10, ymm9 vpaddd ymm10, ymm10, ymm11 - vmovupd ymm4, [rdx] + vmovupd ymm4, YMMWORD PTR [rdx] vmovdqu xmm5, OWORD PTR [rdx+32] - vmovupd ymm6, [rdx+48] + vmovupd ymm6, YMMWORD PTR [rdx+48] vmovdqu xmm7, OWORD PTR [rdx+80] add rdx, 96 vpand ymm4, ymm4, ymm8 @@ -43399,9 +60547,9 @@ L_384_get_entry_65_avx2_6_start: dec rax jnz L_384_get_entry_65_avx2_6_start vmovupd YMMWORD PTR [rcx], ymm0 - vmovdqu [rcx+32], xmm1 + vmovdqu OWORD PTR [rcx+32], xmm1 vmovupd YMMWORD PTR [rcx+96], ymm2 - vmovdqu [rcx+128], xmm3 + vmovdqu OWORD PTR [rcx+128], xmm3 ret sp_384_get_entry_65_avx2_6 ENDP _text ENDS @@ -43649,7 +60797,8 @@ ENDIF ; */ _text SEGMENT READONLY PARA sp_384_sub_in_place_6 PROC - xor rax, rax + push r12 + push r13 mov r8, QWORD PTR [rdx] mov r9, QWORD PTR [rdx+8] mov r10, QWORD PTR [rdx+16] @@ -43662,7 +60811,7 @@ sp_384_sub_in_place_6 PROC sbb QWORD PTR [rcx+24], r11 sbb QWORD PTR [rcx+32], r12 sbb QWORD PTR [rcx+40], r13 - sbb rax, 0 + sbb rax, rax pop r13 pop r12 ret @@ -45946,7 +63095,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_521_sub_9 PROC mov r9, QWORD PTR [rdx] - xor rax, rax sub r9, QWORD PTR [r8] mov r10, QWORD PTR [rdx+8] mov QWORD PTR [rcx], r9 @@ -45973,7 +63121,7 @@ sp_521_sub_9 PROC mov QWORD PTR [rcx+56], r10 sbb r9, QWORD PTR [r8+64] mov QWORD PTR [rcx+64], r9 - sbb rax, 0 + sbb rax, rax ret sp_521_sub_9 ENDP _text ENDS @@ -47173,7 +64321,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_521_cond_sub_9 PROC sub rsp, 72 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -47237,7 +64384,7 @@ sp_521_cond_sub_9 PROC sbb r10, r8 mov QWORD PTR [rcx+56], r11 mov QWORD PTR [rcx+64], r10 - sbb rax, 0 + sbb rax, rax add rsp, 72 ret sp_521_cond_sub_9 ENDP @@ -47250,6 +64397,75 @@ _text ENDS ; */ _text SEGMENT READONLY PARA sp_521_mont_reduce_9 PROC + push r12 + push r13 + push r14 + push r15 + mov rdx, QWORD PTR [rcx+64] + mov rax, QWORD PTR [rcx+72] + mov r8, QWORD PTR [rcx+80] + mov r15, rdx + and r15, 511 + mov r9, QWORD PTR [rcx+88] + mov r10, QWORD PTR [rcx+96] + mov r11, QWORD PTR [rcx+104] + mov r12, QWORD PTR [rcx+112] + mov r13, QWORD PTR [rcx+120] + mov r14, QWORD PTR [rcx+128] + shrd rdx, rax, 9 + shrd rax, r8, 9 + shrd r8, r9, 9 + shrd r9, r10, 9 + shrd r10, r11, 9 + shrd r11, r12, 9 + shrd r12, r13, 9 + shrd r13, r14, 9 + shr r14, 9 + add rdx, QWORD PTR [rcx] + adc rax, QWORD PTR [rcx+8] + adc r8, QWORD PTR [rcx+16] + adc r9, QWORD PTR [rcx+24] + adc r10, QWORD PTR [rcx+32] + adc r11, QWORD PTR [rcx+40] + adc r12, QWORD PTR [rcx+48] + adc r13, QWORD PTR [rcx+56] + adc r15, r14 + mov r14, r15 + shr r15, 9 + and r14, 511 + add rdx, r15 + adc rax, 0 + adc r8, 0 + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov QWORD PTR [rcx], rdx + mov QWORD PTR [rcx+8], rax + mov QWORD PTR [rcx+16], r8 + mov QWORD PTR [rcx+24], r9 + mov QWORD PTR [rcx+32], r10 + mov QWORD PTR [rcx+40], r11 + mov QWORD PTR [rcx+48], r12 + mov QWORD PTR [rcx+56], r13 + mov QWORD PTR [rcx+64], r14 + pop r15 + pop r14 + pop r13 + pop r12 + ret +sp_521_mont_reduce_9 ENDP +_text ENDS +; /* Reduce the number back to 521 bits using Montgomery reduction. +; * +; * a A single precision number to reduce in place. +; * m The single precision number representing the modulus. +; * mp The digit representing the negative inverse of m mod 2^n. 
+; */ +_text SEGMENT READONLY PARA +sp_521_mont_reduce_order_9 PROC push r12 push r13 push r14 @@ -47262,14 +64478,14 @@ sp_521_mont_reduce_9 PROC mov r10, 9 mov r15, QWORD PTR [rcx] mov rdi, QWORD PTR [rcx+8] -L_521_mont_reduce_9_loop: +L_521_mont_reduce_order_9_loop: ; mu = a[i] * mp mov r13, r15 imul r13, r8 cmp r10, 1 - jne L_521_mont_reduce_9_nomask + jne L_521_mont_reduce_order_9_nomask and r13, 511 -L_521_mont_reduce_9_nomask: +L_521_mont_reduce_order_9_nomask: ; a[i+0] += m[0] * mu mov rax, r13 xor r12, r12 @@ -47360,7 +64576,7 @@ L_521_mont_reduce_9_nomask: ; i -= 1 add rcx, 8 dec r10 - jnz L_521_mont_reduce_9_loop + jnz L_521_mont_reduce_order_9_loop mov QWORD PTR [rcx], r15 mov QWORD PTR [rcx+8], rdi mov r8, rcx @@ -47415,7 +64631,7 @@ ENDIF pop r13 pop r12 ret -sp_521_mont_reduce_9 ENDP +sp_521_mont_reduce_order_9 ENDP _text ENDS ; /* Add two Montgomery form numbers (r = a + b % m). ; * @@ -47913,12 +65129,12 @@ L_521_get_point_33_avx2_9_start: sete r9b neg r9 inc rdi - vmovupd ymm6, [rdx] - vmovupd ymm7, [rdx+32] - vmovupd ymm8, [rdx+144] - vmovupd ymm9, [rdx+176] - vmovupd ymm10, [rdx+288] - vmovupd ymm11, [rdx+320] + vmovupd ymm6, YMMWORD PTR [rdx] + vmovupd ymm7, YMMWORD PTR [rdx+32] + vmovupd ymm8, YMMWORD PTR [rdx+144] + vmovupd ymm9, YMMWORD PTR [rdx+176] + vmovupd ymm10, YMMWORD PTR [rdx+288] + vmovupd ymm11, YMMWORD PTR [rdx+320] mov r13, QWORD PTR [rdx+64] mov r14, QWORD PTR [rdx+208] mov r15, QWORD PTR [rdx+352] @@ -49028,7 +66244,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_521_cond_sub_avx2_9 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -49074,7 +66289,7 @@ sp_521_cond_sub_avx2_9 PROC mov QWORD PTR [rcx+56], r11 sbb r12, r10 mov QWORD PTR [rcx+64], r12 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_521_cond_sub_avx2_9 ENDP @@ -49088,7 +66303,7 @@ IFDEF HAVE_INTEL_AVX2 ; * mp The digit representing the negative inverse of m mod 2^n. 
; */ _text SEGMENT READONLY PARA -sp_521_mont_reduce_avx2_9 PROC +sp_521_mont_reduce_order_avx2_9 PROC push r12 push r13 push r14 @@ -49101,22 +66316,18 @@ sp_521_mont_reduce_avx2_9 PROC mov r10, rdx xor rbp, rbp ; i = 9 - mov r11, 9 + mov r11, 8 mov r14, QWORD PTR [r9] mov r15, QWORD PTR [r9+8] mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 32 xor rbp, rbp -L_521_mont_reduce_avx2_9_loop: +L_521_mont_reduce_order_avx2_9_loop: ; mu = a[i] * mp mov rdx, r14 mov r12, r14 imul rdx, r8 - cmp r11, 1 - jne L_521_mont_reduce_avx2_9_nomask - and rdx, 511 -L_521_mont_reduce_avx2_9_nomask: xor rbx, rbx ; a[i+0] += m[0] * mu mulx rcx, rax, QWORD PTR [r10] @@ -49173,19 +66384,140 @@ L_521_mont_reduce_avx2_9_nomask: mov QWORD PTR [r9+40], r13 adox rbp, rbx adcx rbp, rbx + ; mu = a[i] * mp + mov rdx, r14 + mov r13, r14 + imul rdx, r8 + xor rbx, rbx + ; a[i+0] += m[0] * mu + mulx rcx, rax, QWORD PTR [r10] + mov r14, r15 + adcx r13, rax + adox r14, rcx + mov QWORD PTR [r9+-24], r13 + ; a[i+1] += m[1] * mu + mulx rcx, rax, QWORD PTR [r10+8] + mov r15, rdi + adcx r14, rax + adox r15, rcx + ; a[i+2] += m[2] * mu + mulx rcx, rax, QWORD PTR [r10+16] + mov rdi, rsi + adcx r15, rax + adox rdi, rcx + ; a[i+3] += m[3] * mu + mulx rcx, rax, QWORD PTR [r10+24] + mov rsi, QWORD PTR [r9+8] + adcx rdi, rax + adox rsi, rcx + ; a[i+4] += m[4] * mu + mulx rcx, rax, QWORD PTR [r10+32] + mov r12, QWORD PTR [r9+16] + adcx rsi, rax + adox r12, rcx + ; a[i+5] += m[5] * mu + mulx rcx, rax, QWORD PTR [r10+40] + mov r13, QWORD PTR [r9+24] + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r9+16], r12 + ; a[i+6] += m[6] * mu + mulx rcx, rax, QWORD PTR [r10+48] + mov r12, QWORD PTR [r9+32] + adcx r13, rax + adox r12, rcx + mov QWORD PTR [r9+24], r13 + ; a[i+7] += m[7] * mu + mulx rcx, rax, QWORD PTR [r10+56] + mov r13, QWORD PTR [r9+40] + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r9+32], r12 + ; a[i+8] += m[8] * mu + mulx rcx, rax, QWORD PTR [r10+64] + mov r12, QWORD PTR [r9+48] + adcx r13, rax + adox r12, rcx + mov QWORD PTR [r9+40], r13 + adcx r12, rbp + mov rbp, rbx + mov QWORD PTR [r9+48], r12 + adox rbp, rbx + adcx rbp, rbx + ; a += 2 + add r9, 16 + ; i -= 2 + sub r11, 2 + jnz L_521_mont_reduce_order_avx2_9_loop + ; mu = a[i] * mp + mov rdx, r14 + mov r12, r14 + imul rdx, r8 + and rdx, 511 + xor rbx, rbx + ; a[i+0] += m[0] * mu + mulx rcx, rax, QWORD PTR [r10] + mov r14, r15 + adcx r12, rax + adox r14, rcx + mov QWORD PTR [r9+-32], r12 + ; a[i+1] += m[1] * mu + mulx rcx, rax, QWORD PTR [r10+8] + mov r15, rdi + adcx r14, rax + adox r15, rcx + ; a[i+2] += m[2] * mu + mulx rcx, rax, QWORD PTR [r10+16] + mov rdi, rsi + adcx r15, rax + adox rdi, rcx + ; a[i+3] += m[3] * mu + mulx rcx, rax, QWORD PTR [r10+24] + mov rsi, QWORD PTR [r9] + adcx rdi, rax + adox rsi, rcx + ; a[i+4] += m[4] * mu + mulx rcx, rax, QWORD PTR [r10+32] + mov r13, QWORD PTR [r9+8] + adcx rsi, rax + adox r13, rcx + ; a[i+5] += m[5] * mu + mulx rcx, rax, QWORD PTR [r10+40] + mov r12, QWORD PTR [r9+16] + adcx r13, rax + adox r12, rcx + mov QWORD PTR [r9+8], r13 + ; a[i+6] += m[6] * mu + mulx rcx, rax, QWORD PTR [r10+48] + mov r13, QWORD PTR [r9+24] + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r9+16], r12 + ; a[i+7] += m[7] * mu + mulx rcx, rax, QWORD PTR [r10+56] + mov r12, QWORD PTR [r9+32] + adcx r13, rax + adox r12, rcx + mov QWORD PTR [r9+24], r13 + ; a[i+8] += m[8] * mu + mulx rcx, rax, QWORD PTR [r10+64] + mov r13, QWORD PTR [r9+40] + adcx r12, rax + adox r13, rcx + mov QWORD PTR [r9+32], r12 + adcx r13, rbp + mov rbp, rbx + mov QWORD 
PTR [r9+40], r13 + adox rbp, rbx ; a += 1 add r9, 8 - ; i -= 1 - sub r11, 1 - jnz L_521_mont_reduce_avx2_9_loop mov QWORD PTR [r9+-32], r14 mov QWORD PTR [r9+-24], r15 mov QWORD PTR [r9+-16], rdi mov QWORD PTR [r9+-8], rsi sub r9, 32 - mov r8, r9 + lea r8, QWORD PTR [r9+-8] sub r9, 72 - sub r8, 8 mov r12, QWORD PTR [r8] mov r14, QWORD PTR [r8+8] mov r15, QWORD PTR [r8+16] @@ -49273,7 +66605,7 @@ L_521_mont_reduce_avx2_9_nomask: pop r13 pop r12 ret -sp_521_mont_reduce_avx2_9 ENDP +sp_521_mont_reduce_order_avx2_9 ENDP _text ENDS ENDIF IFDEF HAVE_INTEL_AVX2 @@ -49485,10 +66817,10 @@ L_521_get_entry_64_avx2_9_start: sete r9b neg r9 inc r14 - vmovupd ymm4, [rdx] - vmovupd ymm5, [rdx+32] - vmovupd ymm6, [rdx+72] - vmovupd ymm7, [rdx+104] + vmovupd ymm4, YMMWORD PTR [rdx] + vmovupd ymm5, YMMWORD PTR [rdx+32] + vmovupd ymm6, YMMWORD PTR [rdx+72] + vmovupd ymm7, YMMWORD PTR [rdx+104] mov r12, QWORD PTR [rdx+64] mov r13, QWORD PTR [rdx+136] add rdx, 144 @@ -49666,10 +66998,10 @@ L_521_get_entry_65_avx2_9_start: sete r9b neg r9 inc r14 - vmovupd ymm4, [rdx] - vmovupd ymm5, [rdx+32] - vmovupd ymm6, [rdx+72] - vmovupd ymm7, [rdx+104] + vmovupd ymm4, YMMWORD PTR [rdx] + vmovupd ymm5, YMMWORD PTR [rdx+32] + vmovupd ymm6, YMMWORD PTR [rdx+72] + vmovupd ymm7, YMMWORD PTR [rdx+104] mov r12, QWORD PTR [rdx+64] mov r13, QWORD PTR [rdx+136] add rdx, 144 @@ -50127,7 +67459,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_521_sub_in_place_9 PROC mov r8, QWORD PTR [rcx] - xor rax, rax sub r8, QWORD PTR [rdx] mov r9, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 @@ -50154,7 +67485,7 @@ sp_521_sub_in_place_9 PROC mov QWORD PTR [rcx+56], r9 sbb r8, QWORD PTR [rdx+64] mov QWORD PTR [rcx+64], r8 - sbb rax, 0 + sbb rax, rax ret sp_521_sub_in_place_9 ENDP _text ENDS @@ -50448,6 +67779,10 @@ L_521_mod_inv_9_div2_mod_no_add: mov QWORD PTR [rcx+56], r11 shr rax, 1 mov QWORD PTR [rcx+64], rax + pop r12 + ret +sp_521_div2_mod_9 ENDP +_text ENDS _text SEGMENT READONLY PARA sp_521_num_bits_9 PROC xor rax, rax @@ -56056,7 +73391,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_1024_sub_in_place_16 PROC mov r8, QWORD PTR [rcx] - xor rax, rax sub r8, QWORD PTR [rdx] mov r9, QWORD PTR [rcx+8] mov QWORD PTR [rcx], r8 @@ -56104,7 +73438,7 @@ sp_1024_sub_in_place_16 PROC mov QWORD PTR [rcx+112], r8 sbb r9, QWORD PTR [rdx+120] mov QWORD PTR [rcx+120], r9 - sbb rax, 0 + sbb rax, rax ret sp_1024_sub_in_place_16 ENDP _text ENDS @@ -56119,7 +73453,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_1024_cond_sub_16 PROC sub rsp, 128 - mov rax, 0 mov r10, QWORD PTR [r8] mov r11, QWORD PTR [r8+8] and r10, r9 @@ -56232,7 +73565,7 @@ sp_1024_cond_sub_16 PROC sbb r11, r8 mov QWORD PTR [rcx+112], r10 mov QWORD PTR [rcx+120], r11 - sbb rax, 0 + sbb rax, rax add rsp, 128 ret sp_1024_cond_sub_16 ENDP @@ -56249,7 +73582,6 @@ IFDEF HAVE_INTEL_AVX2 _text SEGMENT READONLY PARA sp_1024_cond_sub_avx2_16 PROC push r12 - mov rax, 0 mov r12, QWORD PTR [r8] mov r10, QWORD PTR [rdx] pext r12, r12, r9 @@ -56330,7 +73662,7 @@ sp_1024_cond_sub_avx2_16 PROC mov QWORD PTR [rcx+112], r12 sbb r10, r11 mov QWORD PTR [rcx+120], r10 - sbb rax, 0 + sbb rax, rax pop r12 ret sp_1024_cond_sub_avx2_16 ENDP @@ -58015,7 +75347,6 @@ _text ENDS _text SEGMENT READONLY PARA sp_1024_sub_16 PROC mov r9, QWORD PTR [rdx] - xor rax, rax sub r9, QWORD PTR [r8] mov r10, QWORD PTR [rdx+8] mov QWORD PTR [rcx], r9 @@ -58063,7 +75394,7 @@ sp_1024_sub_16 PROC mov QWORD PTR [rcx+112], r9 sbb r10, QWORD PTR [r8+120] mov QWORD PTR [rcx+120], r10 - sbb rax, 0 + sbb rax, rax ret sp_1024_sub_16 ENDP _text ENDS
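
The x86_64 MASM changes above follow a few recurring patterns: `sbb reg, reg` replaces the earlier `mov reg, 0` / `sbb reg, 0` sequences so the borrow flag becomes an all-ones (or all-zero) mask in a single instruction, constant loads are interleaved with the add/sub chains, the reductions against the curve primes are split out from the generic `_order` Montgomery reductions, and the new `_lower`, `_sub_dbl` and `_dbl_sub` helpers assume their inputs are already below the modulus. As a rough illustration of the carry-to-mask idiom only, here is a minimal C sketch of a P-256 Montgomery-form addition; it is not code from the patch, the function name and layout are hypothetical, and it assumes a compiler providing the `unsigned __int128` extension.

/* Minimal C sketch (not from the patch) of the carry-to-mask pattern used by
 * the P-256 Montgomery add/sub routines.  Names are hypothetical; the real
 * code is the hand-written assembly above. */
#include <stdint.h>

/* P-256 prime 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit words. */
static const uint64_t p256[4] = {
    0xFFFFFFFFFFFFFFFFULL, 0x00000000FFFFFFFFULL,
    0x0000000000000000ULL, 0xFFFFFFFF00000001ULL
};

static void mont_add_p256_sketch(uint64_t r[4], const uint64_t a[4],
                                 const uint64_t b[4])
{
    unsigned __int128 t = 0;
    uint64_t mask, borrow = 0;
    int i;

    /* r = a + b, keeping the carry out of the top word. */
    for (i = 0; i < 4; i++) {
        t += (unsigned __int128)a[i] + b[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }

    /* mask = all ones when the add overflowed 2^256, else 0.
     * This is the C analogue of "sbb rax, rax". */
    mask = (uint64_t)0 - (uint64_t)t;

    /* Constant-time conditional subtract of the modulus: subtracting p only
     * when the sum wrapped past 2^256 keeps the result below 2^256 and
     * congruent to a + b mod p. */
    for (i = 0; i < 4; i++) {
        unsigned __int128 d = (unsigned __int128)r[i]
                            - (p256[i] & mask) - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    /* Any final borrow cancels against the dropped 2^256 carry. */
    (void)borrow;
}

The same mask trick drives the conditional subtracts in `sp_256_cond_sub_4`, `sp_384_mont_add_6` and the other routines above; in the assembly it also removes the instruction that previously had to zero the destination register before the final `sbb`.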