From 61e1491407642baf44c233f46b28be46c37eda22 Mon Sep 17 00:00:00 2001
From: Sean Parkinson <sean@wolfssl.com>
Date: Fri, 5 Jul 2019 10:39:30 +1000
Subject: [PATCH] Fix for C32 implementation in div

Changes to allow C32 to build on x86_64 when not using fast math.
---
 wolfcrypt/src/sp_c32.c | 168 +++++++++++++++++++++++++++++------------
 wolfcrypt/src/sp_c64.c |  34 ++++-----
 2 files changed, 138 insertions(+), 64 deletions(-)

diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c
index ea9efca1a..444f53781 100644
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -112,14 +112,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
         s = 23 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 23 <= DIGIT_BIT) {
             s += 23;
             r[j] &= 0x7fffff;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -1719,6 +1719,36 @@ static WC_INLINE sp_digit sp_2048_div_word_45(sp_digit d1, sp_digit d0,
 }
 #endif /* WOLFSSL_SP_DIV_32 */
 
+/* Normalize the values in each word to 23.
+ *
+ * a  Array of sp_digit to normalize.
+ */
+static void sp_2048_norm_90(sp_digit* a)
+{
+#ifdef WOLFSSL_SP_SMALL
+    int i;
+    for (i = 0; i < 89; i++) {
+        a[i+1] += a[i] >> 23;
+        a[i] &= 0x7fffff;
+    }
+#else
+    int i;
+    for (i = 0; i < 88; i += 8) {
+        a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff;
+        a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff;
+        a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff;
+        a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff;
+        a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff;
+        a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff;
+        a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff;
+        a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff;
+        a[i+9] += a[i+8] >> 23; a[i+8] &= 0x7fffff;
+    }
+    a[88+1] += a[88] >> 23;
+    a[88] &= 0x7fffff;
+#endif
+}
+
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -1766,6 +1796,7 @@ static int sp_2048_div_45(sp_digit* a, sp_digit* d, sp_digit* m,
 
     if (err == MP_OKAY) {
         sp_2048_mul_d_45(sd, d, 1 << 11);
+        sp_2048_norm_90(a);
         sp_2048_mul_d_90(t1, a, 1 << 11);
         div = sd[44];
         for (i=45; i>=0; i--) {
@@ -1806,6 +1837,7 @@ static int sp_2048_div_45(sp_digit* a, sp_digit* d, sp_digit* m,
         sp_2048_cond_add_45(r, r, sd, 0 - (r[44] < 0));
     }
 
+    sp_2048_norm_45(r);
     sp_2048_rshift_45(r, r, 11);
 
 #if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
@@ -2286,36 +2318,6 @@ SP_NOINLINE static void sp_2048_mul_add_90(sp_digit* r, const sp_digit* a,
 #endif /* WOLFSSL_SP_SMALL */
 }
 
-/* Normalize the values in each word to 23.
- *
- * a  Array of sp_digit to normalize.
- */
-static void sp_2048_norm_90(sp_digit* a)
-{
-#ifdef WOLFSSL_SP_SMALL
-    int i;
-    for (i = 0; i < 89; i++) {
-        a[i+1] += a[i] >> 23;
-        a[i] &= 0x7fffff;
-    }
-#else
-    int i;
-    for (i = 0; i < 88; i += 8) {
-        a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff;
-        a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff;
-        a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff;
-        a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff;
-        a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff;
-        a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff;
-        a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff;
-        a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff;
-        a[i+9] += a[i+8] >> 23; a[i+8] &= 0x7fffff;
-    }
-    a[88+1] += a[88] >> 23;
-    a[88] &= 0x7fffff;
-#endif
-}
-
 /* Shift the result in the high 2048 bits down to the bottom.
  *
  * r  A single precision number.
@@ -2621,6 +2623,40 @@ static WC_INLINE sp_digit sp_2048_div_word_90(sp_digit d1, sp_digit d0,
 }
 #endif /* WOLFSSL_SP_DIV_32 */
 
+/* Normalize the values in each word to 23.
+ *
+ * a  Array of sp_digit to normalize.
+ */
+static void sp_2048_norm_180(sp_digit* a)
+{
+#ifdef WOLFSSL_SP_SMALL
+    int i;
+    for (i = 0; i < 179; i++) {
+        a[i+1] += a[i] >> 23;
+        a[i] &= 0x7fffff;
+    }
+#else
+    int i;
+    for (i = 0; i < 176; i += 8) {
+        a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff;
+        a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff;
+        a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff;
+        a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff;
+        a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff;
+        a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff;
+        a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff;
+        a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff;
+        a[i+9] += a[i+8] >> 23; a[i+8] &= 0x7fffff;
+    }
+    a[176+1] += a[176] >> 23;
+    a[176] &= 0x7fffff;
+    a[177+1] += a[177] >> 23;
+    a[177] &= 0x7fffff;
+    a[178+1] += a[178] >> 23;
+    a[178] &= 0x7fffff;
+#endif
+}
+
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -2668,6 +2704,7 @@ static int sp_2048_div_90(sp_digit* a, sp_digit* d, sp_digit* m,
 
     if (err == MP_OKAY) {
         sp_2048_mul_d_90(sd, d, 1 << 22);
+        sp_2048_norm_180(a);
         sp_2048_mul_d_180(t1, a, 1 << 22);
         div = sd[89];
         for (i=90; i>=0; i--) {
@@ -2708,6 +2745,7 @@ static int sp_2048_div_90(sp_digit* a, sp_digit* d, sp_digit* m,
         sp_2048_cond_add_90(r, r, sd, 0 - (r[89] < 0));
     }
 
+    sp_2048_norm_90(r);
     sp_2048_rshift_90(r, r, 22);
 
 #if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
@@ -3119,9 +3157,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
 
         sp_2048_from_bin(a, 90, in, inLen);
 #if DIGIT_BIT >= 23
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -3203,9 +3241,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
     if (err == MP_OKAY) {
         sp_2048_from_bin(a, 90, in, inLen);
 #if DIGIT_BIT >= 23
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -3550,7 +3588,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 90; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 23 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 32
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;
@@ -3983,14 +4021,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
         s = 23 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 23 <= DIGIT_BIT) {
             s += 23;
             r[j] &= 0x7fffff;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -5932,6 +5970,40 @@ static WC_INLINE sp_digit sp_3072_div_word_134(sp_digit d1, sp_digit d0,
 }
 #endif /* WOLFSSL_SP_DIV_32 */
 
+/* Normalize the values in each word to 23.
+ *
+ * a  Array of sp_digit to normalize.
+ */
+static void sp_3072_norm_268(sp_digit* a)
+{
+#ifdef WOLFSSL_SP_SMALL
+    int i;
+    for (i = 0; i < 267; i++) {
+        a[i+1] += a[i] >> 23;
+        a[i] &= 0x7fffff;
+    }
+#else
+    int i;
+    for (i = 0; i < 264; i += 8) {
+        a[i+1] += a[i+0] >> 23; a[i+0] &= 0x7fffff;
+        a[i+2] += a[i+1] >> 23; a[i+1] &= 0x7fffff;
+        a[i+3] += a[i+2] >> 23; a[i+2] &= 0x7fffff;
+        a[i+4] += a[i+3] >> 23; a[i+3] &= 0x7fffff;
+        a[i+5] += a[i+4] >> 23; a[i+4] &= 0x7fffff;
+        a[i+6] += a[i+5] >> 23; a[i+5] &= 0x7fffff;
+        a[i+7] += a[i+6] >> 23; a[i+6] &= 0x7fffff;
+        a[i+8] += a[i+7] >> 23; a[i+7] &= 0x7fffff;
+        a[i+9] += a[i+8] >> 23; a[i+8] &= 0x7fffff;
+    }
+    a[264+1] += a[264] >> 23;
+    a[264] &= 0x7fffff;
+    a[265+1] += a[265] >> 23;
+    a[265] &= 0x7fffff;
+    a[266+1] += a[266] >> 23;
+    a[266] &= 0x7fffff;
+#endif
+}
+
 /* Divide d in a and put remainder into r (m*d + r = a)
  * m is not calculated as it is not needed at this time.
  *
@@ -5979,6 +6051,7 @@ static int sp_3072_div_134(sp_digit* a, sp_digit* d, sp_digit* m,
 
     if (err == MP_OKAY) {
         sp_3072_mul_d_134(sd, d, 1 << 10);
+        sp_3072_norm_268(a);
         sp_3072_mul_d_268(t1, a, 1 << 10);
         div = sd[133];
         for (i=134; i>=0; i--) {
@@ -6019,6 +6092,7 @@ static int sp_3072_div_134(sp_digit* a, sp_digit* d, sp_digit* m,
         sp_3072_cond_add_134(r, r, sd, 0 - (r[133] < 0));
     }
 
+    sp_3072_norm_134(r);
     sp_3072_rshift_134(r, r, 10);
 
 #if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
@@ -6428,9 +6502,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
 
         sp_3072_from_bin(a, 134, in, inLen);
 #if DIGIT_BIT >= 23
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -6512,9 +6586,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
     if (err == MP_OKAY) {
         sp_3072_from_bin(a, 134, in, inLen);
 #if DIGIT_BIT >= 23
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -6859,7 +6933,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 134; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 23 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 32
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;
@@ -7503,14 +7577,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a)
         s = 26 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 26 <= DIGIT_BIT) {
             s += 26;
             r[j] &= 0x3ffffff;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -7604,7 +7678,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 10; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 26 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 32
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;
diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c
index 417825de2..801a7c01e 100644
--- a/wolfcrypt/src/sp_c64.c
+++ b/wolfcrypt/src/sp_c64.c
@@ -112,14 +112,14 @@ static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
         s = 57 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 57 <= DIGIT_BIT) {
             s += 57;
             r[j] &= 0x1ffffffffffffffl;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -2668,9 +2668,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
 
         sp_2048_from_bin(a, 36, in, inLen);
 #if DIGIT_BIT >= 57
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -2752,9 +2752,9 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
     if (err == MP_OKAY) {
         sp_2048_from_bin(a, 36, in, inLen);
 #if DIGIT_BIT >= 57
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -3099,7 +3099,7 @@ static int sp_2048_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 36; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 57 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 64
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;
@@ -3532,14 +3532,14 @@ static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
         s = 57 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 57 <= DIGIT_BIT) {
             s += 57;
             r[j] &= 0x1ffffffffffffffl;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -6271,9 +6271,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
 
         sp_3072_from_bin(a, 54, in, inLen);
 #if DIGIT_BIT >= 57
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -6355,9 +6355,9 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
     if (err == MP_OKAY) {
         sp_3072_from_bin(a, 54, in, inLen);
 #if DIGIT_BIT >= 57
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
 #else
-        e[0] = em->dp[0];
+        e[0] = (sp_digit)em->dp[0];
         if (em->used > 1)
             e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
 #endif
@@ -6702,7 +6702,7 @@ static int sp_3072_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 54; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 57 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 64
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;
@@ -7329,14 +7329,14 @@ static void sp_256_from_mp(sp_digit* r, int max, mp_int* a)
         s = 52 - s;
         if (j + 1 >= max)
             break;
-        r[++j] = a->dp[i] >> s;
+        r[++j] = (sp_digit)(a->dp[i] >> s);
         while (s + 52 <= DIGIT_BIT) {
             s += 52;
             r[j] &= 0xfffffffffffffl;
             if (j + 1 >= max)
                 break;
             if (s < DIGIT_BIT)
-                r[++j] = a->dp[i] >> s;
+                r[++j] = (sp_digit)(a->dp[i] >> s);
             else
                 r[++j] = 0;
         }
@@ -7430,7 +7430,7 @@ static int sp_256_to_mp(sp_digit* a, mp_int* r)
         for (i = 0; i < 5; i++) {
             r->dp[j] |= ((mp_digit)a[i]) << s;
             if (s + 52 >= DIGIT_BIT) {
-    #if DIGIT_BIT < 64
+    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                 r->dp[j] &= (1l << DIGIT_BIT) - 1;
     #endif
                 s = DIGIT_BIT - s;