From 38653510eb31f02f5225c3b1faf3d36421a0a1b1 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 11 Feb 2022 11:11:15 +1000 Subject: [PATCH] Curve448: inline Karatsuba in sqr and mul for 128-bit impl --- wolfcrypt/src/fe_448.c | 295 +++++++++++++++++------------------------ 1 file changed, 123 insertions(+), 172 deletions(-) diff --git a/wolfcrypt/src/fe_448.c b/wolfcrypt/src/fe_448.c index 1dc16f6d4..d7451ba9d 100644 --- a/wolfcrypt/src/fe_448.c +++ b/wolfcrypt/src/fe_448.c @@ -835,110 +835,76 @@ void fe448_mul39081(sword64* r, const sword64* a) */ void fe448_mul(sword64* r, const sword64* a, const sword64* b) { - sword128 t; sword64 o; - sword128 t0 = (sword128)a[ 0] * b[ 0]; - sword128 t1 = (sword128)a[ 0] * b[ 1]; - sword128 t101 = (sword128)a[ 1] * b[ 0]; - sword128 t2 = (sword128)a[ 0] * b[ 2]; - sword128 t102 = (sword128)a[ 1] * b[ 1]; - sword128 t202 = (sword128)a[ 2] * b[ 0]; - sword128 t3 = (sword128)a[ 0] * b[ 3]; - sword128 t103 = (sword128)a[ 1] * b[ 2]; - sword128 t203 = (sword128)a[ 2] * b[ 1]; - sword128 t303 = (sword128)a[ 3] * b[ 0]; - sword128 t4 = (sword128)a[ 0] * b[ 4]; - sword128 t104 = (sword128)a[ 1] * b[ 3]; - sword128 t204 = (sword128)a[ 2] * b[ 2]; - sword128 t304 = (sword128)a[ 3] * b[ 1]; - sword128 t404 = (sword128)a[ 4] * b[ 0]; - sword128 t5 = (sword128)a[ 0] * b[ 5]; - sword128 t105 = (sword128)a[ 1] * b[ 4]; - sword128 t205 = (sword128)a[ 2] * b[ 3]; - sword128 t305 = (sword128)a[ 3] * b[ 2]; - sword128 t405 = (sword128)a[ 4] * b[ 1]; - sword128 t505 = (sword128)a[ 5] * b[ 0]; - sword128 t6 = (sword128)a[ 0] * b[ 6]; - sword128 t106 = (sword128)a[ 1] * b[ 5]; - sword128 t206 = (sword128)a[ 2] * b[ 4]; - sword128 t306 = (sword128)a[ 3] * b[ 3]; - sword128 t406 = (sword128)a[ 4] * b[ 2]; - sword128 t506 = (sword128)a[ 5] * b[ 1]; - sword128 t606 = (sword128)a[ 6] * b[ 0]; - sword128 t7 = (sword128)a[ 0] * b[ 7]; - sword128 t107 = (sword128)a[ 1] * b[ 6]; - sword128 t207 = (sword128)a[ 2] * b[ 5]; - sword128 t307 = (sword128)a[ 3] * b[ 4]; - sword128 t407 = (sword128)a[ 4] * b[ 3]; - sword128 t507 = (sword128)a[ 5] * b[ 2]; - sword128 t607 = (sword128)a[ 6] * b[ 1]; - sword128 t707 = (sword128)a[ 7] * b[ 0]; - sword128 t8 = (sword128)a[ 1] * b[ 7]; - sword128 t108 = (sword128)a[ 2] * b[ 6]; - sword128 t208 = (sword128)a[ 3] * b[ 5]; - sword128 t308 = (sword128)a[ 4] * b[ 4]; - sword128 t408 = (sword128)a[ 5] * b[ 3]; - sword128 t508 = (sword128)a[ 6] * b[ 2]; - sword128 t608 = (sword128)a[ 7] * b[ 1]; - sword128 t9 = (sword128)a[ 2] * b[ 7]; - sword128 t109 = (sword128)a[ 3] * b[ 6]; - sword128 t209 = (sword128)a[ 4] * b[ 5]; - sword128 t309 = (sword128)a[ 5] * b[ 4]; - sword128 t409 = (sword128)a[ 6] * b[ 3]; - sword128 t509 = (sword128)a[ 7] * b[ 2]; - sword128 t10 = (sword128)a[ 3] * b[ 7]; - sword128 t110 = (sword128)a[ 4] * b[ 6]; - sword128 t210 = (sword128)a[ 5] * b[ 5]; - sword128 t310 = (sword128)a[ 6] * b[ 4]; - sword128 t410 = (sword128)a[ 7] * b[ 3]; - sword128 t11 = (sword128)a[ 4] * b[ 7]; - sword128 t111 = (sword128)a[ 5] * b[ 6]; - sword128 t211 = (sword128)a[ 6] * b[ 5]; - sword128 t311 = (sword128)a[ 7] * b[ 4]; - sword128 t12 = (sword128)a[ 5] * b[ 7]; - sword128 t112 = (sword128)a[ 6] * b[ 6]; - sword128 t212 = (sword128)a[ 7] * b[ 5]; - sword128 t13 = (sword128)a[ 6] * b[ 7]; - sword128 t113 = (sword128)a[ 7] * b[ 6]; - sword128 t14 = (sword128)a[ 7] * b[ 7]; - t1 += t101; - t2 += t102; t2 += t202; - t3 += t103; t3 += t203; t3 += t303; - t4 += t104; t4 += t204; t4 += t304; t4 += t404; - t5 += t105; t5 += t205; t5 += t305; t5 += t405; t5 += t505; - t6 += t106; t6 += t206; t6 += t306; t6 += t406; t6 += t506; - t6 += t606; - t7 += t107; t7 += t207; t7 += t307; t7 += t407; t7 += t507; - t7 += t607; - t7 += t707; - t8 += t108; t8 += t208; t8 += t308; t8 += t408; t8 += t508; - t8 += t608; - t9 += t109; t9 += t209; t9 += t309; t9 += t409; t9 += t509; - t10 += t110; t10 += t210; t10 += t310; t10 += t410; - t11 += t111; t11 += t211; t11 += t311; - t12 += t112; t12 += t212; - t13 += t113; + sword64 a1[4]; + sword64 b1[4]; + sword128 t0; + sword128 t1; + sword128 t2; + sword128 t3; + sword128 t4; + sword128 t5; + sword128 t6; + sword128 t7; + sword128 t03; + sword128 t04; + sword128 t13; + sword128 t14; - /* Reduce */ - t0 += t8 + t12; - t1 += t9 + t13; - t2 += t10 + t14; - t3 += t11; - t4 += t12 + t8 + t12; - t5 += t13 + t9 + t13; - t6 += t14 + t10 + t14; - t7 += t11; - o = t7 >> 56; t0 += o; - t4 += o; t = (sword128)o << 56; t7 -= t; - o = (sword64)(t0 >> 56); t1 += o; t = (sword128)o << 56; t0 -= t; - o = (sword64)(t1 >> 56); t2 += o; t = (sword128)o << 56; t1 -= t; - o = (sword64)(t2 >> 56); t3 += o; t = (sword128)o << 56; t2 -= t; - o = (sword64)(t3 >> 56); t4 += o; t = (sword128)o << 56; t3 -= t; - o = (sword64)(t4 >> 56); t5 += o; t = (sword128)o << 56; t4 -= t; - o = (sword64)(t5 >> 56); t6 += o; t = (sword128)o << 56; t5 -= t; - o = (sword64)(t6 >> 56); t7 += o; t = (sword128)o << 56; t6 -= t; - o = (sword64)(t7 >> 56); t0 += o; - t4 += o; t = (sword128)o << 56; t7 -= t; + a1[0] = a[0] + a[4]; + a1[1] = a[1] + a[5]; + a1[2] = a[2] + a[6]; + a1[3] = a[3] + a[7]; + b1[0] = b[0] + b[4]; + b1[1] = b[1] + b[5]; + b1[2] = b[2] + b[6]; + b1[3] = b[3] + b[7]; + + t03 = ((sword128)a[0] * b[3]) + ((sword128)a[1] * b[2]) + + ((sword128)a[2] * b[1]) + ((sword128)a[3] * b[0]); + t04 = ((sword128)a[1] * b[3]) + ((sword128)a[2] * b[2]) + + ((sword128)a[3] * b[1]); + t04 += t03 >> 56; + t03 &= 0xffffffffffffffL; + t13 = ((sword128)a1[0] * b1[3]) + ((sword128)a1[1] * b1[2]) + + ((sword128)a1[2] * b1[1]) + ((sword128)a1[3] * b1[0]); + t14 = ((sword128)a1[1] * b1[3]) + ((sword128)a1[2] * b1[2]) + + ((sword128)a1[3] * b1[1]); + t14 += t13 >> 56; + t13 &= 0xffffffffffffffL; + + t0 = ((sword128)a[0] * b[0]) + ((sword128)a[4] * b[4]) + t14 + -t04; + t1 = ((sword128)a[0] * b[1]) + ((sword128)a[1] * b[0]) + + ((sword128)a[4] * b[5]) + ((sword128)a[5] * b[4]) + + ((sword128)a1[2] * b1[3]) + ((sword128)a1[3] * b1[2]) + - ((sword128)a[2] * b[3]) - ((sword128)a[3] * b[2]); + o = (sword64)(t0 >> 56); t1 += o; t0 &= 0xffffffffffffffL; + t2 = ((sword128)a[0] * b[2]) + ((sword128)a[1] * b[1]) + + ((sword128)a[2] * b[0]) + ((sword128)a[4] * b[6]) + + ((sword128)a[5] * b[5]) + ((sword128)a[6] * b[4]) + + ((sword128)a1[3] * b1[3]) - ((sword128)a[3] * b[3]); + o = (sword64)(t1 >> 56); t2 += o; t1 &= 0xffffffffffffffL; + t3 = t03 + ((sword128)a[4] * b[7]) + ((sword128)a[5] * b[6]) + + ((sword128)a[6] * b[5]) + ((sword128)a[7] * b[4]); + o = (sword64)(t2 >> 56); t3 += o; t2 &= 0xffffffffffffffL; + t4 = ((sword128)a[5] * b[7]) + ((sword128)a[6] * b[6]) + + ((sword128)a[7] * b[5]) + ((sword128)a1[0] * b1[0]) + - ((sword128)a[0] * b[0]) + t14; + o = (sword64)(t3 >> 56); t4 += o; t3 &= 0xffffffffffffffL; + t5 = ((sword128)a[6] * b[7]) + ((sword128)a[7] * b[6]) + + ((sword128)a1[0] * b1[1]) + ((sword128)a1[1] * b1[0]) + - ((sword128)a[0] * b[1]) - ((sword128)a[1] * b[0]) + + ((sword128)a1[2] * b1[3]) + ((sword128)a1[3] * b1[2]); + o = (sword64)(t4 >> 56); t5 += o; t4 &= 0xffffffffffffffL; + t6 = ((sword128)a[7] * b[7]) + ((sword128)a1[0] * b1[2]) + + ((sword128)a1[1] * b1[1]) + ((sword128)a1[2] * b1[0]) + - ((sword128)a[0] * b[2]) - ((sword128)a[1] * b[1]) + - ((sword128)a[2] * b[0]) + ((sword128)a1[3] * b1[3]); + o = (sword64)(t5 >> 56); t6 += o; t5 &= 0xffffffffffffffL; + t7 = t13 + -t03; + o = (sword64)(t6 >> 56); t7 += o; t6 &= 0xffffffffffffffL; + o = (sword64)(t7 >> 56); t0 += o; + t4 += o; t7 &= 0xffffffffffffffL; /* Store */ r[0] = (sword64)t0; @@ -958,76 +924,59 @@ void fe448_mul(sword64* r, const sword64* a, const sword64* b) */ void fe448_sqr(sword64* r, const sword64* a) { - sword128 t; sword64 o; - sword128 t0 = (sword128)a[ 0] * a[ 0]; - sword128 t1 = 2 * (sword128)a[ 0] * a[ 1]; - sword128 t2 = 2 * (sword128)a[ 0] * a[ 2]; - sword128 t102 = (sword128)a[ 1] * a[ 1]; - sword128 t3 = 2 * (sword128)a[ 0] * a[ 3]; - sword128 t103 = 2 * (sword128)a[ 1] * a[ 2]; - sword128 t4 = 2 * (sword128)a[ 0] * a[ 4]; - sword128 t104 = 2 * (sword128)a[ 1] * a[ 3]; - sword128 t204 = (sword128)a[ 2] * a[ 2]; - sword128 t5 = 2 * (sword128)a[ 0] * a[ 5]; - sword128 t105 = 2 * (sword128)a[ 1] * a[ 4]; - sword128 t205 = 2 * (sword128)a[ 2] * a[ 3]; - sword128 t6 = 2 * (sword128)a[ 0] * a[ 6]; - sword128 t106 = 2 * (sword128)a[ 1] * a[ 5]; - sword128 t206 = 2 * (sword128)a[ 2] * a[ 4]; - sword128 t306 = (sword128)a[ 3] * a[ 3]; - sword128 t7 = 2 * (sword128)a[ 0] * a[ 7]; - sword128 t107 = 2 * (sword128)a[ 1] * a[ 6]; - sword128 t207 = 2 * (sword128)a[ 2] * a[ 5]; - sword128 t307 = 2 * (sword128)a[ 3] * a[ 4]; - sword128 t8 = 2 * (sword128)a[ 1] * a[ 7]; - sword128 t108 = 2 * (sword128)a[ 2] * a[ 6]; - sword128 t208 = 2 * (sword128)a[ 3] * a[ 5]; - sword128 t308 = (sword128)a[ 4] * a[ 4]; - sword128 t9 = 2 * (sword128)a[ 2] * a[ 7]; - sword128 t109 = 2 * (sword128)a[ 3] * a[ 6]; - sword128 t209 = 2 * (sword128)a[ 4] * a[ 5]; - sword128 t10 = 2 * (sword128)a[ 3] * a[ 7]; - sword128 t110 = 2 * (sword128)a[ 4] * a[ 6]; - sword128 t210 = (sword128)a[ 5] * a[ 5]; - sword128 t11 = 2 * (sword128)a[ 4] * a[ 7]; - sword128 t111 = 2 * (sword128)a[ 5] * a[ 6]; - sword128 t12 = 2 * (sword128)a[ 5] * a[ 7]; - sword128 t112 = (sword128)a[ 6] * a[ 6]; - sword128 t13 = 2 * (sword128)a[ 6] * a[ 7]; - sword128 t14 = (sword128)a[ 7] * a[ 7]; - t2 += t102; - t3 += t103; - t4 += t104; t4 += t204; - t5 += t105; t5 += t205; - t6 += t106; t6 += t206; t6 += t306; - t7 += t107; t7 += t207; t7 += t307; - t8 += t108; t8 += t208; t8 += t308; - t9 += t109; t9 += t209; - t10 += t110; t10 += t210; - t11 += t111; - t12 += t112; + sword64 a1[4]; + sword128 t0; + sword128 t1; + sword128 t2; + sword128 t3; + sword128 t4; + sword128 t5; + sword128 t6; + sword128 t7; + sword128 t03; + sword128 t04; + sword128 t13; + sword128 t14; - /* Reduce */ - t0 += t8 + t12; - t1 += t9 + t13; - t2 += t10 + t14; - t3 += t11; - t4 += t12 + t8 + t12; - t5 += t13 + t9 + t13; - t6 += t14 + t10 + t14; - t7 += t11; - o = t7 >> 56; t0 += o; - t4 += o; t = (sword128)o << 56; t7 -= t; - o = (sword64)(t0 >> 56); t1 += o; t = (sword128)o << 56; t0 -= t; - o = (sword64)(t1 >> 56); t2 += o; t = (sword128)o << 56; t1 -= t; - o = (sword64)(t2 >> 56); t3 += o; t = (sword128)o << 56; t2 -= t; - o = (sword64)(t3 >> 56); t4 += o; t = (sword128)o << 56; t3 -= t; - o = (sword64)(t4 >> 56); t5 += o; t = (sword128)o << 56; t4 -= t; - o = (sword64)(t5 >> 56); t6 += o; t = (sword128)o << 56; t5 -= t; - o = (sword64)(t6 >> 56); t7 += o; t = (sword128)o << 56; t6 -= t; - o = (sword64)(t7 >> 56); t0 += o; - t4 += o; t = (sword128)o << 56; t7 -= t; + a1[0] = a[0] + a[4]; + a1[1] = a[1] + a[5]; + a1[2] = a[2] + a[6]; + a1[3] = a[3] + a[7]; + + t03 = ((sword128)a[0] * (2 * a[3])) + ((sword128)a[1] * (2 * a[2])); + t04 = ((sword128)a[1] * (2 * a[3])) + ((sword128)a[2] * a[2]); + t04 += t03 >> 56; + t03 &= 0xffffffffffffffL; + t13 = ((sword128)a1[0] * (2 * a1[3])) + ((sword128)a1[1] * (2 * a1[2])); + t14 = ((sword128)a1[1] * (2 * a1[3])) + ((sword128)a1[2] * a1[2]); + t14 += t13 >> 56; + t13 &= 0xffffffffffffffL; + + t0 = ((sword128)a[0] * a[0]) + ((sword128)a[4] * a[4]) + t14 + -t04; + t1 = ((sword128)a[0] * (2 * a[1])) + ((sword128)a[4] * (2 * a[5])) + + ((sword128)a1[2] * (2 * a1[3])) - ((sword128)a[2] * (2 * a[3])); + o = (sword64)(t0 >> 56); t1 += o; t0 &= 0xffffffffffffffL; + t2 = ((sword128)a[0] * (2 * a[2])) + ((sword128)a[1] * a[1]) + + ((sword128)a[4] * (2 * a[6])) + ((sword128)a[5] * a[5]) + + ((sword128)a1[3] * a1[3]) - ((sword128)a[3] * a[3]); + o = (sword64)(t1 >> 56); t2 += o; t1 &= 0xffffffffffffffL; + t3 = t03 + ((sword128)a[4] * (2 * a[7])) + ((sword128)a[5] * (2 * a[6])); + o = (sword64)(t2 >> 56); t3 += o; t2 &= 0xffffffffffffffL; + t4 = ((sword128)a[5] * (2 * a[7])) + ((sword128)a[6] * a[6]) + + ((sword128)a1[0] * a1[0]) - ((sword128)a[0] * a[0]) + t14; + o = (sword64)(t3 >> 56); t4 += o; t3 &= 0xffffffffffffffL; + t5 = ((sword128)a[6] * (2 * a[7])) + ((sword128)a1[0] * (2 * a1[1])) + - ((sword128)a[0] * (2 * a[1])) + ((sword128)a1[2] * (2 * a1[3])); + o = (sword64)(t4 >> 56); t5 += o; t4 &= 0xffffffffffffffL; + t6 = ((sword128)a[7] * a[7]) + ((sword128)a1[0] * (2 * a1[2])) + + ((sword128)a1[1] * a1[1]) - ((sword128)a[0] * (2 * a[2])) + - ((sword128)a[1] * a[1]) + ((sword128)a1[3] * a1[3]); + o = (sword64)(t5 >> 56); t6 += o; t5 &= 0xffffffffffffffL; + t7 = t13 + -t03; + o = (sword64)(t6 >> 56); t7 += o; t6 &= 0xffffffffffffffL; + o = (sword64)(t7 >> 56); t0 += o; + t4 += o; t7 &= 0xffffffffffffffL; /* Store */ r[0] = (sword64)t0; @@ -2001,6 +1950,8 @@ void fe448_mul(sword32* r, const sword32* a, const sword32* b) */ static WC_INLINE void fe448_sqr_8(sword32* r, const sword32* a) { + sword64 o; + sword64 t15; sword64 t; sword64 t0 = (sword64)a[ 0] * a[ 0]; sword64 t1 = 2 * (sword64)a[ 0] * a[ 1]; @@ -2049,8 +2000,8 @@ static WC_INLINE void fe448_sqr_8(sword32* r, const sword32* a) t10 += t110; t10 += t210; t11 += t111; t12 += t112; - sword64 o = t14 >> 28; - sword64 t15 = o; + o = t14 >> 28; + t15 = o; t14 -= o << 28; o = (t0 >> 28); t1 += o; t = o << 28; t0 -= t; o = (t1 >> 28); t2 += o; t = o << 28; t1 -= t;