diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index ed41341cc..63b472257 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -1260,31 +1260,40 @@ SP_NOINLINE static void sp_2048_mul_d_90(sp_digit* r, const sp_digit* a, r[90] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 88; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 88; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[89]; - r[89] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - r[90] = (sp_digit)(t[1] >> 23); + t += tb * a[88]; + r[88] = t & 0x7fffff; + t >>= 23; + t += tb * a[89]; + r[89] = t & 0x7fffff; + t >>= 23; + r[90] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -1626,37 +1635,37 @@ SP_NOINLINE static void sp_2048_mul_d_45(sp_digit* r, const sp_digit* a, r[45] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 40; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 44; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[41]; - r[41] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[42]; - r[42] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[43]; - r[43] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[44]; - r[44] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - r[45] = (sp_digit)(t[4] >> 23); + t += tb * a[44]; + r[44] = t & 0x7fffff; + t >>= 23; + r[45] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -2565,35 +2574,34 @@ SP_NOINLINE static void sp_2048_mul_d_180(sp_digit* r, const sp_digit* a, r[180] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 176; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 180; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[177]; - r[177] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[178]; - r[178] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[179]; - r[179] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - r[180] = (sp_digit)(t[3] >> 23); + r[180] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -5143,39 +5151,40 @@ SP_NOINLINE static void sp_3072_mul_d_134(sp_digit* r, const sp_digit* a, r[134] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 128; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 132; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[129]; - r[129] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[130]; - r[130] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[131]; - r[131] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[132]; - r[132] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[133]; - r[133] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - r[134] = (sp_digit)(t[5] >> 23); + t += tb * a[132]; + r[132] = t & 0x7fffff; + t >>= 23; + t += tb * a[133]; + r[133] = t & 0x7fffff; + t >>= 23; + r[134] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -5509,33 +5518,43 @@ SP_NOINLINE static void sp_3072_mul_d_67(sp_digit* r, const sp_digit* a, r[67] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 64; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 64; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[65]; - r[65] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[66]; - r[66] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - r[67] = (sp_digit)(t[2] >> 23); + t += tb * a[64]; + r[64] = t & 0x7fffff; + t >>= 23; + t += tb * a[65]; + r[65] = t & 0x7fffff; + t >>= 23; + t += tb * a[66]; + r[66] = t & 0x7fffff; + t >>= 23; + r[67] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -6434,35 +6453,34 @@ SP_NOINLINE static void sp_3072_mul_d_268(sp_digit* r, const sp_digit* a, r[268] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff; - for (i = 0; i < 264; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff); + for (i = 0; i < 268; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x7fffff); + t >>= 23; + r[i + 3] = t2; } - t[1] = tb * a[265]; - r[265] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff); - t[2] = tb * a[266]; - r[266] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff); - t[3] = tb * a[267]; - r[267] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff); - r[268] = (sp_digit)(t[3] >> 23); + r[268] = t & 0x7fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -9173,35 +9191,34 @@ SP_NOINLINE static void sp_4096_mul_d_196(sp_digit* r, const sp_digit* a, r[196] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff; - for (i = 0; i < 192; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff); + for (i = 0; i < 196; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 3] = t2; } - t[1] = tb * a[193]; - r[193] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff); - t[2] = tb * a[194]; - r[194] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff); - t[3] = tb * a[195]; - r[195] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff); - r[196] = (sp_digit)(t[3] >> 21); + r[196] = t & 0x1fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -9523,31 +9540,40 @@ SP_NOINLINE static void sp_4096_mul_d_98(sp_digit* r, const sp_digit* a, r[98] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff; - for (i = 0; i < 96; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff); + for (i = 0; i < 96; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 3] = t2; } - t[1] = tb * a[97]; - r[97] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff); - r[98] = (sp_digit)(t[1] >> 21); + t += tb * a[96]; + r[96] = t & 0x1fffff; + t >>= 21; + t += tb * a[97]; + r[97] = t & 0x1fffff; + t >>= 21; + r[98] = t & 0x1fffff; #endif /* WOLFSSL_SP_SMALL */ } @@ -10485,29 +10511,34 @@ SP_NOINLINE static void sp_4096_mul_d_392(sp_digit* r, const sp_digit* a, r[392] = (sp_digit)t; #else int64_t tb = b; - int64_t t[8]; + int64_t t = 0; + sp_digit t2; + int64_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff; - for (i = 0; i < 392; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff); + for (i = 0; i < 392; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffff); + t >>= 21; + r[i + 3] = t2; } - r[392] = (sp_digit)(t[7] >> 21); + r[392] = t & 0x1fffff; #endif /* WOLFSSL_SP_SMALL */ } diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 0cb2f34fb..297cc8b0b 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -899,35 +899,34 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a, r[36] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL; - for (i = 0; i < 32; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL); + for (i = 0; i < 36; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = t2; } - t[1] = tb * a[33]; - r[33] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[34]; - r[34] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[35]; - r[35] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - r[36] = (sp_digit)(t[3] >> 57); + r[36] = t & 0x1ffffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -1243,31 +1242,40 @@ SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a, r[18] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL; - for (i = 0; i < 16; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL); + for (i = 0; i < 16; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = t2; } - t[1] = tb * a[17]; - r[17] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - r[18] = (sp_digit)(t[1] >> 57); + t += tb * a[16]; + r[16] = t & 0x1ffffffffffffffL; + t >>= 57; + t += tb * a[17]; + r[17] = t & 0x1ffffffffffffffL; + t >>= 57; + r[18] = t & 0x1ffffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -5094,39 +5102,40 @@ SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a, r[54] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL; - for (i = 0; i < 48; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL); + for (i = 0; i < 52; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = t2; } - t[1] = tb * a[49]; - r[49] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[50]; - r[50] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[51]; - r[51] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - t[4] = tb * a[52]; - r[52] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL); - t[5] = tb * a[53]; - r[53] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL); - r[54] = (sp_digit)(t[5] >> 57); + t += tb * a[52]; + r[52] = t & 0x1ffffffffffffffL; + t >>= 57; + t += tb * a[53]; + r[53] = t & 0x1ffffffffffffffL; + t >>= 57; + r[54] = t & 0x1ffffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -5460,33 +5469,43 @@ SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a, r[27] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL; - for (i = 0; i < 24; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL); + for (i = 0; i < 24; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1ffffffffffffffL); + t >>= 57; + r[i + 3] = t2; } - t[1] = tb * a[25]; - r[25] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL); - t[2] = tb * a[26]; - r[26] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL); - r[27] = (sp_digit)(t[2] >> 57); + t += tb * a[24]; + r[24] = t & 0x1ffffffffffffffL; + t >>= 57; + t += tb * a[25]; + r[25] = t & 0x1ffffffffffffffL; + t >>= 57; + t += tb * a[26]; + r[26] = t & 0x1ffffffffffffffL; + t >>= 57; + r[27] = t & 0x1ffffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -9331,39 +9350,40 @@ SP_NOINLINE static void sp_4096_mul_d_78(sp_digit* r, const sp_digit* a, r[78] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL; - for (i = 0; i < 72; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL); + for (i = 0; i < 76; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 3] = t2; } - t[1] = tb * a[73]; - r[73] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[74]; - r[74] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[75]; - r[75] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - t[4] = tb * a[76]; - r[76] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL); - t[5] = tb * a[77]; - r[77] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL); - r[78] = (sp_digit)(t[5] >> 53); + t += tb * a[76]; + r[76] = t & 0x1fffffffffffffL; + t >>= 53; + t += tb * a[77]; + r[77] = t & 0x1fffffffffffffL; + t >>= 53; + r[78] = t & 0x1fffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -9720,41 +9740,43 @@ SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a, r[39] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL; - for (i = 0; i < 32; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL); + for (i = 0; i < 36; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 3] = t2; } - t[1] = tb * a[33]; - r[33] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[34]; - r[34] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[35]; - r[35] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - t[4] = tb * a[36]; - r[36] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL); - t[5] = tb * a[37]; - r[37] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL); - t[6] = tb * a[38]; - r[38] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL); - r[39] = (sp_digit)(t[6] >> 53); + t += tb * a[36]; + r[36] = t & 0x1fffffffffffffL; + t >>= 53; + t += tb * a[37]; + r[37] = t & 0x1fffffffffffffL; + t >>= 53; + t += tb * a[38]; + r[38] = t & 0x1fffffffffffffL; + t >>= 53; + r[39] = t & 0x1fffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ } @@ -10720,35 +10742,34 @@ SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a, r[156] = (sp_digit)t; #else int128_t tb = b; - int128_t t[8]; + int128_t t = 0; + sp_digit t2; + int128_t p[4]; int i; - t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL; - for (i = 0; i < 152; i += 8) { - t[1] = tb * a[i+1]; - r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[i+2]; - r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[i+3]; - r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - t[4] = tb * a[i+4]; - r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL); - t[5] = tb * a[i+5]; - r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL); - t[6] = tb * a[i+6]; - r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL); - t[7] = tb * a[i+7]; - r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL); - t[0] = tb * a[i+8]; - r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL); + for (i = 0; i < 156; i += 4) { + p[0] = tb * a[i + 0]; + p[1] = tb * a[i + 1]; + p[2] = tb * a[i + 2]; + p[3] = tb * a[i + 3]; + t += p[0]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 0] = t2; + t += p[1]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 1] = t2; + t += p[2]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 2] = t2; + t += p[3]; + t2 = (sp_digit)(t & 0x1fffffffffffffL); + t >>= 53; + r[i + 3] = t2; } - t[1] = tb * a[153]; - r[153] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL); - t[2] = tb * a[154]; - r[154] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL); - t[3] = tb * a[155]; - r[155] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL); - r[156] = (sp_digit)(t[3] >> 53); + r[156] = t & 0x1fffffffffffffL; #endif /* WOLFSSL_SP_SMALL */ }