Merge pull request #4853 from SparkiDev/curve448_128bit_perf

Curve448: inline Karatsuba in sqr and mul for 128-bit impl
This commit is contained in:
David Garske
2022-02-14 12:04:57 -08:00
committed by GitHub

View File

@@ -835,110 +835,76 @@ void fe448_mul39081(sword64* r, const sword64* a)
*/ */
void fe448_mul(sword64* r, const sword64* a, const sword64* b) void fe448_mul(sword64* r, const sword64* a, const sword64* b)
{ {
sword128 t;
sword64 o; sword64 o;
sword128 t0 = (sword128)a[ 0] * b[ 0]; sword64 a1[4];
sword128 t1 = (sword128)a[ 0] * b[ 1]; sword64 b1[4];
sword128 t101 = (sword128)a[ 1] * b[ 0]; sword128 t0;
sword128 t2 = (sword128)a[ 0] * b[ 2]; sword128 t1;
sword128 t102 = (sword128)a[ 1] * b[ 1]; sword128 t2;
sword128 t202 = (sword128)a[ 2] * b[ 0]; sword128 t3;
sword128 t3 = (sword128)a[ 0] * b[ 3]; sword128 t4;
sword128 t103 = (sword128)a[ 1] * b[ 2]; sword128 t5;
sword128 t203 = (sword128)a[ 2] * b[ 1]; sword128 t6;
sword128 t303 = (sword128)a[ 3] * b[ 0]; sword128 t7;
sword128 t4 = (sword128)a[ 0] * b[ 4]; sword128 t03;
sword128 t104 = (sword128)a[ 1] * b[ 3]; sword128 t04;
sword128 t204 = (sword128)a[ 2] * b[ 2]; sword128 t13;
sword128 t304 = (sword128)a[ 3] * b[ 1]; sword128 t14;
sword128 t404 = (sword128)a[ 4] * b[ 0];
sword128 t5 = (sword128)a[ 0] * b[ 5];
sword128 t105 = (sword128)a[ 1] * b[ 4];
sword128 t205 = (sword128)a[ 2] * b[ 3];
sword128 t305 = (sword128)a[ 3] * b[ 2];
sword128 t405 = (sword128)a[ 4] * b[ 1];
sword128 t505 = (sword128)a[ 5] * b[ 0];
sword128 t6 = (sword128)a[ 0] * b[ 6];
sword128 t106 = (sword128)a[ 1] * b[ 5];
sword128 t206 = (sword128)a[ 2] * b[ 4];
sword128 t306 = (sword128)a[ 3] * b[ 3];
sword128 t406 = (sword128)a[ 4] * b[ 2];
sword128 t506 = (sword128)a[ 5] * b[ 1];
sword128 t606 = (sword128)a[ 6] * b[ 0];
sword128 t7 = (sword128)a[ 0] * b[ 7];
sword128 t107 = (sword128)a[ 1] * b[ 6];
sword128 t207 = (sword128)a[ 2] * b[ 5];
sword128 t307 = (sword128)a[ 3] * b[ 4];
sword128 t407 = (sword128)a[ 4] * b[ 3];
sword128 t507 = (sword128)a[ 5] * b[ 2];
sword128 t607 = (sword128)a[ 6] * b[ 1];
sword128 t707 = (sword128)a[ 7] * b[ 0];
sword128 t8 = (sword128)a[ 1] * b[ 7];
sword128 t108 = (sword128)a[ 2] * b[ 6];
sword128 t208 = (sword128)a[ 3] * b[ 5];
sword128 t308 = (sword128)a[ 4] * b[ 4];
sword128 t408 = (sword128)a[ 5] * b[ 3];
sword128 t508 = (sword128)a[ 6] * b[ 2];
sword128 t608 = (sword128)a[ 7] * b[ 1];
sword128 t9 = (sword128)a[ 2] * b[ 7];
sword128 t109 = (sword128)a[ 3] * b[ 6];
sword128 t209 = (sword128)a[ 4] * b[ 5];
sword128 t309 = (sword128)a[ 5] * b[ 4];
sword128 t409 = (sword128)a[ 6] * b[ 3];
sword128 t509 = (sword128)a[ 7] * b[ 2];
sword128 t10 = (sword128)a[ 3] * b[ 7];
sword128 t110 = (sword128)a[ 4] * b[ 6];
sword128 t210 = (sword128)a[ 5] * b[ 5];
sword128 t310 = (sword128)a[ 6] * b[ 4];
sword128 t410 = (sword128)a[ 7] * b[ 3];
sword128 t11 = (sword128)a[ 4] * b[ 7];
sword128 t111 = (sword128)a[ 5] * b[ 6];
sword128 t211 = (sword128)a[ 6] * b[ 5];
sword128 t311 = (sword128)a[ 7] * b[ 4];
sword128 t12 = (sword128)a[ 5] * b[ 7];
sword128 t112 = (sword128)a[ 6] * b[ 6];
sword128 t212 = (sword128)a[ 7] * b[ 5];
sword128 t13 = (sword128)a[ 6] * b[ 7];
sword128 t113 = (sword128)a[ 7] * b[ 6];
sword128 t14 = (sword128)a[ 7] * b[ 7];
t1 += t101;
t2 += t102; t2 += t202;
t3 += t103; t3 += t203; t3 += t303;
t4 += t104; t4 += t204; t4 += t304; t4 += t404;
t5 += t105; t5 += t205; t5 += t305; t5 += t405; t5 += t505;
t6 += t106; t6 += t206; t6 += t306; t6 += t406; t6 += t506;
t6 += t606;
t7 += t107; t7 += t207; t7 += t307; t7 += t407; t7 += t507;
t7 += t607;
t7 += t707;
t8 += t108; t8 += t208; t8 += t308; t8 += t408; t8 += t508;
t8 += t608;
t9 += t109; t9 += t209; t9 += t309; t9 += t409; t9 += t509;
t10 += t110; t10 += t210; t10 += t310; t10 += t410;
t11 += t111; t11 += t211; t11 += t311;
t12 += t112; t12 += t212;
t13 += t113;
/* Reduce */ a1[0] = a[0] + a[4];
t0 += t8 + t12; a1[1] = a[1] + a[5];
t1 += t9 + t13; a1[2] = a[2] + a[6];
t2 += t10 + t14; a1[3] = a[3] + a[7];
t3 += t11; b1[0] = b[0] + b[4];
t4 += t12 + t8 + t12; b1[1] = b[1] + b[5];
t5 += t13 + t9 + t13; b1[2] = b[2] + b[6];
t6 += t14 + t10 + t14; b1[3] = b[3] + b[7];
t7 += t11;
o = t7 >> 56; t0 += o; t03 = ((sword128)a[0] * b[3]) + ((sword128)a[1] * b[2])
t4 += o; t = (sword128)o << 56; t7 -= t; + ((sword128)a[2] * b[1]) + ((sword128)a[3] * b[0]);
o = (sword64)(t0 >> 56); t1 += o; t = (sword128)o << 56; t0 -= t; t04 = ((sword128)a[1] * b[3]) + ((sword128)a[2] * b[2])
o = (sword64)(t1 >> 56); t2 += o; t = (sword128)o << 56; t1 -= t; + ((sword128)a[3] * b[1]);
o = (sword64)(t2 >> 56); t3 += o; t = (sword128)o << 56; t2 -= t; t04 += t03 >> 56;
o = (sword64)(t3 >> 56); t4 += o; t = (sword128)o << 56; t3 -= t; t03 &= 0xffffffffffffffL;
o = (sword64)(t4 >> 56); t5 += o; t = (sword128)o << 56; t4 -= t; t13 = ((sword128)a1[0] * b1[3]) + ((sword128)a1[1] * b1[2])
o = (sword64)(t5 >> 56); t6 += o; t = (sword128)o << 56; t5 -= t; + ((sword128)a1[2] * b1[1]) + ((sword128)a1[3] * b1[0]);
o = (sword64)(t6 >> 56); t7 += o; t = (sword128)o << 56; t6 -= t; t14 = ((sword128)a1[1] * b1[3]) + ((sword128)a1[2] * b1[2])
o = (sword64)(t7 >> 56); t0 += o; + ((sword128)a1[3] * b1[1]);
t4 += o; t = (sword128)o << 56; t7 -= t; t14 += t13 >> 56;
t13 &= 0xffffffffffffffL;
t0 = ((sword128)a[0] * b[0]) + ((sword128)a[4] * b[4]) + t14 + -t04;
t1 = ((sword128)a[0] * b[1]) + ((sword128)a[1] * b[0])
+ ((sword128)a[4] * b[5]) + ((sword128)a[5] * b[4])
+ ((sword128)a1[2] * b1[3]) + ((sword128)a1[3] * b1[2])
- ((sword128)a[2] * b[3]) - ((sword128)a[3] * b[2]);
o = (sword64)(t0 >> 56); t1 += o; t0 &= 0xffffffffffffffL;
t2 = ((sword128)a[0] * b[2]) + ((sword128)a[1] * b[1])
+ ((sword128)a[2] * b[0]) + ((sword128)a[4] * b[6])
+ ((sword128)a[5] * b[5]) + ((sword128)a[6] * b[4])
+ ((sword128)a1[3] * b1[3]) - ((sword128)a[3] * b[3]);
o = (sword64)(t1 >> 56); t2 += o; t1 &= 0xffffffffffffffL;
t3 = t03 + ((sword128)a[4] * b[7]) + ((sword128)a[5] * b[6])
+ ((sword128)a[6] * b[5]) + ((sword128)a[7] * b[4]);
o = (sword64)(t2 >> 56); t3 += o; t2 &= 0xffffffffffffffL;
t4 = ((sword128)a[5] * b[7]) + ((sword128)a[6] * b[6])
+ ((sword128)a[7] * b[5]) + ((sword128)a1[0] * b1[0])
- ((sword128)a[0] * b[0]) + t14;
o = (sword64)(t3 >> 56); t4 += o; t3 &= 0xffffffffffffffL;
t5 = ((sword128)a[6] * b[7]) + ((sword128)a[7] * b[6])
+ ((sword128)a1[0] * b1[1]) + ((sword128)a1[1] * b1[0])
- ((sword128)a[0] * b[1]) - ((sword128)a[1] * b[0])
+ ((sword128)a1[2] * b1[3]) + ((sword128)a1[3] * b1[2]);
o = (sword64)(t4 >> 56); t5 += o; t4 &= 0xffffffffffffffL;
t6 = ((sword128)a[7] * b[7]) + ((sword128)a1[0] * b1[2])
+ ((sword128)a1[1] * b1[1]) + ((sword128)a1[2] * b1[0])
- ((sword128)a[0] * b[2]) - ((sword128)a[1] * b[1])
- ((sword128)a[2] * b[0]) + ((sword128)a1[3] * b1[3]);
o = (sword64)(t5 >> 56); t6 += o; t5 &= 0xffffffffffffffL;
t7 = t13 + -t03;
o = (sword64)(t6 >> 56); t7 += o; t6 &= 0xffffffffffffffL;
o = (sword64)(t7 >> 56); t0 += o;
t4 += o; t7 &= 0xffffffffffffffL;
/* Store */ /* Store */
r[0] = (sword64)t0; r[0] = (sword64)t0;
@@ -958,76 +924,59 @@ void fe448_mul(sword64* r, const sword64* a, const sword64* b)
*/ */
void fe448_sqr(sword64* r, const sword64* a) void fe448_sqr(sword64* r, const sword64* a)
{ {
sword128 t;
sword64 o; sword64 o;
sword128 t0 = (sword128)a[ 0] * a[ 0]; sword64 a1[4];
sword128 t1 = 2 * (sword128)a[ 0] * a[ 1]; sword128 t0;
sword128 t2 = 2 * (sword128)a[ 0] * a[ 2]; sword128 t1;
sword128 t102 = (sword128)a[ 1] * a[ 1]; sword128 t2;
sword128 t3 = 2 * (sword128)a[ 0] * a[ 3]; sword128 t3;
sword128 t103 = 2 * (sword128)a[ 1] * a[ 2]; sword128 t4;
sword128 t4 = 2 * (sword128)a[ 0] * a[ 4]; sword128 t5;
sword128 t104 = 2 * (sword128)a[ 1] * a[ 3]; sword128 t6;
sword128 t204 = (sword128)a[ 2] * a[ 2]; sword128 t7;
sword128 t5 = 2 * (sword128)a[ 0] * a[ 5]; sword128 t03;
sword128 t105 = 2 * (sword128)a[ 1] * a[ 4]; sword128 t04;
sword128 t205 = 2 * (sword128)a[ 2] * a[ 3]; sword128 t13;
sword128 t6 = 2 * (sword128)a[ 0] * a[ 6]; sword128 t14;
sword128 t106 = 2 * (sword128)a[ 1] * a[ 5];
sword128 t206 = 2 * (sword128)a[ 2] * a[ 4];
sword128 t306 = (sword128)a[ 3] * a[ 3];
sword128 t7 = 2 * (sword128)a[ 0] * a[ 7];
sword128 t107 = 2 * (sword128)a[ 1] * a[ 6];
sword128 t207 = 2 * (sword128)a[ 2] * a[ 5];
sword128 t307 = 2 * (sword128)a[ 3] * a[ 4];
sword128 t8 = 2 * (sword128)a[ 1] * a[ 7];
sword128 t108 = 2 * (sword128)a[ 2] * a[ 6];
sword128 t208 = 2 * (sword128)a[ 3] * a[ 5];
sword128 t308 = (sword128)a[ 4] * a[ 4];
sword128 t9 = 2 * (sword128)a[ 2] * a[ 7];
sword128 t109 = 2 * (sword128)a[ 3] * a[ 6];
sword128 t209 = 2 * (sword128)a[ 4] * a[ 5];
sword128 t10 = 2 * (sword128)a[ 3] * a[ 7];
sword128 t110 = 2 * (sword128)a[ 4] * a[ 6];
sword128 t210 = (sword128)a[ 5] * a[ 5];
sword128 t11 = 2 * (sword128)a[ 4] * a[ 7];
sword128 t111 = 2 * (sword128)a[ 5] * a[ 6];
sword128 t12 = 2 * (sword128)a[ 5] * a[ 7];
sword128 t112 = (sword128)a[ 6] * a[ 6];
sword128 t13 = 2 * (sword128)a[ 6] * a[ 7];
sword128 t14 = (sword128)a[ 7] * a[ 7];
t2 += t102;
t3 += t103;
t4 += t104; t4 += t204;
t5 += t105; t5 += t205;
t6 += t106; t6 += t206; t6 += t306;
t7 += t107; t7 += t207; t7 += t307;
t8 += t108; t8 += t208; t8 += t308;
t9 += t109; t9 += t209;
t10 += t110; t10 += t210;
t11 += t111;
t12 += t112;
/* Reduce */ a1[0] = a[0] + a[4];
t0 += t8 + t12; a1[1] = a[1] + a[5];
t1 += t9 + t13; a1[2] = a[2] + a[6];
t2 += t10 + t14; a1[3] = a[3] + a[7];
t3 += t11;
t4 += t12 + t8 + t12; t03 = ((sword128)a[0] * (2 * a[3])) + ((sword128)a[1] * (2 * a[2]));
t5 += t13 + t9 + t13; t04 = ((sword128)a[1] * (2 * a[3])) + ((sword128)a[2] * a[2]);
t6 += t14 + t10 + t14; t04 += t03 >> 56;
t7 += t11; t03 &= 0xffffffffffffffL;
o = t7 >> 56; t0 += o; t13 = ((sword128)a1[0] * (2 * a1[3])) + ((sword128)a1[1] * (2 * a1[2]));
t4 += o; t = (sword128)o << 56; t7 -= t; t14 = ((sword128)a1[1] * (2 * a1[3])) + ((sword128)a1[2] * a1[2]);
o = (sword64)(t0 >> 56); t1 += o; t = (sword128)o << 56; t0 -= t; t14 += t13 >> 56;
o = (sword64)(t1 >> 56); t2 += o; t = (sword128)o << 56; t1 -= t; t13 &= 0xffffffffffffffL;
o = (sword64)(t2 >> 56); t3 += o; t = (sword128)o << 56; t2 -= t;
o = (sword64)(t3 >> 56); t4 += o; t = (sword128)o << 56; t3 -= t; t0 = ((sword128)a[0] * a[0]) + ((sword128)a[4] * a[4]) + t14 + -t04;
o = (sword64)(t4 >> 56); t5 += o; t = (sword128)o << 56; t4 -= t; t1 = ((sword128)a[0] * (2 * a[1])) + ((sword128)a[4] * (2 * a[5]))
o = (sword64)(t5 >> 56); t6 += o; t = (sword128)o << 56; t5 -= t; + ((sword128)a1[2] * (2 * a1[3])) - ((sword128)a[2] * (2 * a[3]));
o = (sword64)(t6 >> 56); t7 += o; t = (sword128)o << 56; t6 -= t; o = (sword64)(t0 >> 56); t1 += o; t0 &= 0xffffffffffffffL;
o = (sword64)(t7 >> 56); t0 += o; t2 = ((sword128)a[0] * (2 * a[2])) + ((sword128)a[1] * a[1])
t4 += o; t = (sword128)o << 56; t7 -= t; + ((sword128)a[4] * (2 * a[6])) + ((sword128)a[5] * a[5])
+ ((sword128)a1[3] * a1[3]) - ((sword128)a[3] * a[3]);
o = (sword64)(t1 >> 56); t2 += o; t1 &= 0xffffffffffffffL;
t3 = t03 + ((sword128)a[4] * (2 * a[7])) + ((sword128)a[5] * (2 * a[6]));
o = (sword64)(t2 >> 56); t3 += o; t2 &= 0xffffffffffffffL;
t4 = ((sword128)a[5] * (2 * a[7])) + ((sword128)a[6] * a[6])
+ ((sword128)a1[0] * a1[0]) - ((sword128)a[0] * a[0]) + t14;
o = (sword64)(t3 >> 56); t4 += o; t3 &= 0xffffffffffffffL;
t5 = ((sword128)a[6] * (2 * a[7])) + ((sword128)a1[0] * (2 * a1[1]))
- ((sword128)a[0] * (2 * a[1])) + ((sword128)a1[2] * (2 * a1[3]));
o = (sword64)(t4 >> 56); t5 += o; t4 &= 0xffffffffffffffL;
t6 = ((sword128)a[7] * a[7]) + ((sword128)a1[0] * (2 * a1[2]))
+ ((sword128)a1[1] * a1[1]) - ((sword128)a[0] * (2 * a[2]))
- ((sword128)a[1] * a[1]) + ((sword128)a1[3] * a1[3]);
o = (sword64)(t5 >> 56); t6 += o; t5 &= 0xffffffffffffffL;
t7 = t13 + -t03;
o = (sword64)(t6 >> 56); t7 += o; t6 &= 0xffffffffffffffL;
o = (sword64)(t7 >> 56); t0 += o;
t4 += o; t7 &= 0xffffffffffffffL;
/* Store */ /* Store */
r[0] = (sword64)t0; r[0] = (sword64)t0;
@@ -2001,6 +1950,8 @@ void fe448_mul(sword32* r, const sword32* a, const sword32* b)
*/ */
static WC_INLINE void fe448_sqr_8(sword32* r, const sword32* a) static WC_INLINE void fe448_sqr_8(sword32* r, const sword32* a)
{ {
sword64 o;
sword64 t15;
sword64 t; sword64 t;
sword64 t0 = (sword64)a[ 0] * a[ 0]; sword64 t0 = (sword64)a[ 0] * a[ 0];
sword64 t1 = 2 * (sword64)a[ 0] * a[ 1]; sword64 t1 = 2 * (sword64)a[ 0] * a[ 1];
@@ -2049,8 +2000,8 @@ static WC_INLINE void fe448_sqr_8(sword32* r, const sword32* a)
t10 += t110; t10 += t210; t10 += t110; t10 += t210;
t11 += t111; t11 += t111;
t12 += t112; t12 += t112;
sword64 o = t14 >> 28; o = t14 >> 28;
sword64 t15 = o; t15 = o;
t14 -= o << 28; t14 -= o << 28;
o = (t0 >> 28); t1 += o; t = o << 28; t0 -= t; o = (t0 >> 28); t1 += o; t = o << 28; t0 -= t;
o = (t1 >> 28); t2 += o; t = o << 28; t1 -= t; o = (t1 >> 28); t2 += o; t = o << 28; t1 -= t;