Merge pull request #3468 from SparkiDev/sp_c_mul_d

SP C32/64 mul_d: large div needs mul_d to propagate the carry between digits
This commit is contained in:
toddouska
2020-11-11 14:06:25 -08:00
committed by GitHub
2 changed files with 466 additions and 414 deletions

View File

@ -1260,31 +1260,40 @@ SP_NOINLINE static void sp_2048_mul_d_90(sp_digit* r, const sp_digit* a,
r[90] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 88; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 88; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[89];
r[89] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
r[90] = (sp_digit)(t[1] >> 23);
t += tb * a[88];
r[88] = t & 0x7fffff;
t >>= 23;
t += tb * a[89];
r[89] = t & 0x7fffff;
t >>= 23;
r[90] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -1626,37 +1635,37 @@ SP_NOINLINE static void sp_2048_mul_d_45(sp_digit* r, const sp_digit* a,
r[45] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 40; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 44; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[41];
r[41] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[42];
r[42] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[43];
r[43] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[44];
r[44] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
r[45] = (sp_digit)(t[4] >> 23);
t += tb * a[44];
r[44] = t & 0x7fffff;
t >>= 23;
r[45] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -2565,35 +2574,34 @@ SP_NOINLINE static void sp_2048_mul_d_180(sp_digit* r, const sp_digit* a,
r[180] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 176; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 180; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[177];
r[177] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[178];
r[178] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[179];
r[179] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
r[180] = (sp_digit)(t[3] >> 23);
r[180] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -5143,39 +5151,40 @@ SP_NOINLINE static void sp_3072_mul_d_134(sp_digit* r, const sp_digit* a,
r[134] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 128; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 132; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[129];
r[129] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[130];
r[130] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[131];
r[131] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[132];
r[132] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[133];
r[133] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
r[134] = (sp_digit)(t[5] >> 23);
t += tb * a[132];
r[132] = t & 0x7fffff;
t >>= 23;
t += tb * a[133];
r[133] = t & 0x7fffff;
t >>= 23;
r[134] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -5509,33 +5518,43 @@ SP_NOINLINE static void sp_3072_mul_d_67(sp_digit* r, const sp_digit* a,
r[67] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 64; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 64; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[65];
r[65] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[66];
r[66] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
r[67] = (sp_digit)(t[2] >> 23);
t += tb * a[64];
r[64] = t & 0x7fffff;
t >>= 23;
t += tb * a[65];
r[65] = t & 0x7fffff;
t >>= 23;
t += tb * a[66];
r[66] = t & 0x7fffff;
t >>= 23;
r[67] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -6434,35 +6453,34 @@ SP_NOINLINE static void sp_3072_mul_d_268(sp_digit* r, const sp_digit* a,
r[268] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x7fffff;
for (i = 0; i < 264; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 23) + (t[4] & 0x7fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 23) + (t[5] & 0x7fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 23) + (t[6] & 0x7fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 23) + (t[7] & 0x7fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 23) + (t[0] & 0x7fffff);
for (i = 0; i < 268; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x7fffff);
t >>= 23;
r[i + 3] = t2;
}
t[1] = tb * a[265];
r[265] = (sp_digit)(t[0] >> 23) + (t[1] & 0x7fffff);
t[2] = tb * a[266];
r[266] = (sp_digit)(t[1] >> 23) + (t[2] & 0x7fffff);
t[3] = tb * a[267];
r[267] = (sp_digit)(t[2] >> 23) + (t[3] & 0x7fffff);
r[268] = (sp_digit)(t[3] >> 23);
r[268] = t & 0x7fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -9173,35 +9191,34 @@ SP_NOINLINE static void sp_4096_mul_d_196(sp_digit* r, const sp_digit* a,
r[196] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff;
for (i = 0; i < 192; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff);
for (i = 0; i < 196; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 3] = t2;
}
t[1] = tb * a[193];
r[193] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff);
t[2] = tb * a[194];
r[194] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff);
t[3] = tb * a[195];
r[195] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff);
r[196] = (sp_digit)(t[3] >> 21);
r[196] = t & 0x1fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -9523,31 +9540,40 @@ SP_NOINLINE static void sp_4096_mul_d_98(sp_digit* r, const sp_digit* a,
r[98] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff;
for (i = 0; i < 96; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff);
for (i = 0; i < 96; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 3] = t2;
}
t[1] = tb * a[97];
r[97] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff);
r[98] = (sp_digit)(t[1] >> 21);
t += tb * a[96];
r[96] = t & 0x1fffff;
t >>= 21;
t += tb * a[97];
r[97] = t & 0x1fffff;
t >>= 21;
r[98] = t & 0x1fffff;
#endif /* WOLFSSL_SP_SMALL */
}
@ -10485,29 +10511,34 @@ SP_NOINLINE static void sp_4096_mul_d_392(sp_digit* r, const sp_digit* a,
r[392] = (sp_digit)t;
#else
int64_t tb = b;
int64_t t[8];
int64_t t = 0;
sp_digit t2;
int64_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffff;
for (i = 0; i < 392; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 21) + (t[1] & 0x1fffff);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 21) + (t[2] & 0x1fffff);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 21) + (t[3] & 0x1fffff);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 21) + (t[4] & 0x1fffff);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 21) + (t[5] & 0x1fffff);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 21) + (t[6] & 0x1fffff);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 21) + (t[7] & 0x1fffff);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 21) + (t[0] & 0x1fffff);
for (i = 0; i < 392; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffff);
t >>= 21;
r[i + 3] = t2;
}
r[392] = (sp_digit)(t[7] >> 21);
r[392] = t & 0x1fffff;
#endif /* WOLFSSL_SP_SMALL */
}

View File

@ -899,35 +899,34 @@ SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a,
r[36] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL;
for (i = 0; i < 32; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL);
for (i = 0; i < 36; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 3] = t2;
}
t[1] = tb * a[33];
r[33] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[34];
r[34] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[35];
r[35] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
r[36] = (sp_digit)(t[3] >> 57);
r[36] = t & 0x1ffffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -1243,31 +1242,40 @@ SP_NOINLINE static void sp_2048_mul_d_18(sp_digit* r, const sp_digit* a,
r[18] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL;
for (i = 0; i < 16; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL);
for (i = 0; i < 16; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 3] = t2;
}
t[1] = tb * a[17];
r[17] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
r[18] = (sp_digit)(t[1] >> 57);
t += tb * a[16];
r[16] = t & 0x1ffffffffffffffL;
t >>= 57;
t += tb * a[17];
r[17] = t & 0x1ffffffffffffffL;
t >>= 57;
r[18] = t & 0x1ffffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -5094,39 +5102,40 @@ SP_NOINLINE static void sp_3072_mul_d_54(sp_digit* r, const sp_digit* a,
r[54] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL;
for (i = 0; i < 48; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL);
for (i = 0; i < 52; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 3] = t2;
}
t[1] = tb * a[49];
r[49] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[50];
r[50] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[51];
r[51] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
t[4] = tb * a[52];
r[52] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL);
t[5] = tb * a[53];
r[53] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL);
r[54] = (sp_digit)(t[5] >> 57);
t += tb * a[52];
r[52] = t & 0x1ffffffffffffffL;
t >>= 57;
t += tb * a[53];
r[53] = t & 0x1ffffffffffffffL;
t >>= 57;
r[54] = t & 0x1ffffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -5460,33 +5469,43 @@ SP_NOINLINE static void sp_3072_mul_d_27(sp_digit* r, const sp_digit* a,
r[27] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1ffffffffffffffL;
for (i = 0; i < 24; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 57) + (t[3] & 0x1ffffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 57) + (t[4] & 0x1ffffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 57) + (t[5] & 0x1ffffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 57) + (t[6] & 0x1ffffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 57) + (t[7] & 0x1ffffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 57) + (t[0] & 0x1ffffffffffffffL);
for (i = 0; i < 24; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1ffffffffffffffL);
t >>= 57;
r[i + 3] = t2;
}
t[1] = tb * a[25];
r[25] = (sp_digit)(t[0] >> 57) + (t[1] & 0x1ffffffffffffffL);
t[2] = tb * a[26];
r[26] = (sp_digit)(t[1] >> 57) + (t[2] & 0x1ffffffffffffffL);
r[27] = (sp_digit)(t[2] >> 57);
t += tb * a[24];
r[24] = t & 0x1ffffffffffffffL;
t >>= 57;
t += tb * a[25];
r[25] = t & 0x1ffffffffffffffL;
t >>= 57;
t += tb * a[26];
r[26] = t & 0x1ffffffffffffffL;
t >>= 57;
r[27] = t & 0x1ffffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -9331,39 +9350,40 @@ SP_NOINLINE static void sp_4096_mul_d_78(sp_digit* r, const sp_digit* a,
r[78] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL;
for (i = 0; i < 72; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL);
for (i = 0; i < 76; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 3] = t2;
}
t[1] = tb * a[73];
r[73] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[74];
r[74] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[75];
r[75] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
t[4] = tb * a[76];
r[76] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL);
t[5] = tb * a[77];
r[77] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL);
r[78] = (sp_digit)(t[5] >> 53);
t += tb * a[76];
r[76] = t & 0x1fffffffffffffL;
t >>= 53;
t += tb * a[77];
r[77] = t & 0x1fffffffffffffL;
t >>= 53;
r[78] = t & 0x1fffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -9720,41 +9740,43 @@ SP_NOINLINE static void sp_4096_mul_d_39(sp_digit* r, const sp_digit* a,
r[39] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL;
for (i = 0; i < 32; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL);
for (i = 0; i < 36; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 3] = t2;
}
t[1] = tb * a[33];
r[33] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[34];
r[34] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[35];
r[35] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
t[4] = tb * a[36];
r[36] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL);
t[5] = tb * a[37];
r[37] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL);
t[6] = tb * a[38];
r[38] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL);
r[39] = (sp_digit)(t[6] >> 53);
t += tb * a[36];
r[36] = t & 0x1fffffffffffffL;
t >>= 53;
t += tb * a[37];
r[37] = t & 0x1fffffffffffffL;
t >>= 53;
t += tb * a[38];
r[38] = t & 0x1fffffffffffffL;
t >>= 53;
r[39] = t & 0x1fffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}
@ -10720,35 +10742,34 @@ SP_NOINLINE static void sp_4096_mul_d_156(sp_digit* r, const sp_digit* a,
r[156] = (sp_digit)t;
#else
int128_t tb = b;
int128_t t[8];
int128_t t = 0;
sp_digit t2;
int128_t p[4];
int i;
t[0] = tb * a[0]; r[0] = t[0] & 0x1fffffffffffffL;
for (i = 0; i < 152; i += 8) {
t[1] = tb * a[i+1];
r[i+1] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[i+2];
r[i+2] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[i+3];
r[i+3] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
t[4] = tb * a[i+4];
r[i+4] = (sp_digit)(t[3] >> 53) + (t[4] & 0x1fffffffffffffL);
t[5] = tb * a[i+5];
r[i+5] = (sp_digit)(t[4] >> 53) + (t[5] & 0x1fffffffffffffL);
t[6] = tb * a[i+6];
r[i+6] = (sp_digit)(t[5] >> 53) + (t[6] & 0x1fffffffffffffL);
t[7] = tb * a[i+7];
r[i+7] = (sp_digit)(t[6] >> 53) + (t[7] & 0x1fffffffffffffL);
t[0] = tb * a[i+8];
r[i+8] = (sp_digit)(t[7] >> 53) + (t[0] & 0x1fffffffffffffL);
for (i = 0; i < 156; i += 4) {
p[0] = tb * a[i + 0];
p[1] = tb * a[i + 1];
p[2] = tb * a[i + 2];
p[3] = tb * a[i + 3];
t += p[0];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 0] = t2;
t += p[1];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 1] = t2;
t += p[2];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 2] = t2;
t += p[3];
t2 = (sp_digit)(t & 0x1fffffffffffffL);
t >>= 53;
r[i + 3] = t2;
}
t[1] = tb * a[153];
r[153] = (sp_digit)(t[0] >> 53) + (t[1] & 0x1fffffffffffffL);
t[2] = tb * a[154];
r[154] = (sp_digit)(t[1] >> 53) + (t[2] & 0x1fffffffffffffL);
t[3] = tb * a[155];
r[155] = (sp_digit)(t[2] >> 53) + (t[3] & 0x1fffffffffffffL);
r[156] = (sp_digit)(t[3] >> 53);
r[156] = t & 0x1fffffffffffffL;
#endif /* WOLFSSL_SP_SMALL */
}